From 991356badd8c217bb82b161275b907316881b35d Mon Sep 17 00:00:00 2001 From: Kai Hu Date: Tue, 14 Feb 2023 21:56:20 -0500 Subject: [PATCH 01/36] [Doc] Add a demo for spatial-temporal detection PyTorch to ONNX (#2225) --- demo/README.md | 69 +++++ demo/demo_spatiotemporal_det_onnx.py | 382 ++++++++++++++++++++++++++ tools/deployment/export_onnx_stdet.py | 4 +- 3 files changed, 453 insertions(+), 2 deletions(-) create mode 100644 demo/demo_spatiotemporal_det_onnx.py diff --git a/demo/README.md b/demo/README.md index f3f4ba1db9..3252da35be 100644 --- a/demo/README.md +++ b/demo/README.md @@ -309,6 +309,75 @@ python demo/demo_spatiotemporal_det.py demo/demo.mp4 demo/demo_spatiotemporal_de --output-fps 6 ``` +## SpatioTemporal Action Detection ONNX Video Demo + +MMAction2 provides a demo script to predict the SpatioTemporal Action Detection result using the onnx file instead of building the PyTorch models. + +```shell +python demo/demo_spatiotemporal_det_onnx.py --video ${VIDEO_FILE} \ + [--out-filename ${OUTPUT_FILENAME}] \ + [--config ${SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \ + [--onnx-file ${SPATIOTEMPORAL_ACTION_DETECTION_ONNX_FILE}] \ + [--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \ + [--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \ + [--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \ + [--det-cat-id ${HUMAN_DETECTION_CATEGORY_ID}] \ + [--action-score-thr ${ACTION_DETECTION_SCORE_THRESHOLD}] \ + [--label-map ${LABEL_MAP}] \ + [--device ${DEVICE}] \ + [--short-side] ${SHORT_SIDE} \ + [--predict-stepsize ${PREDICT_STEPSIZE}] \ + [--output-stepsize ${OUTPUT_STEPSIZE}] \ + [--output-fps ${OUTPUT_FPS}] +``` + +Optional arguments: + +- `OUTPUT_FILENAME`: Path to the output file which is a video format. Defaults to `demo/stdet_demo.mp4`. +- `SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE`: The spatiotemporal action detection config file path. +- `SPATIOTEMPORAL_ACTION_DETECTION_ONNX_FILE`: The spatiotemporal action detection onnx file. +- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path. +- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint URL. +- `HUMAN_DETECTION_SCORE_THRESHOLD`: The score threshold for human detection. Defaults to 0.9. +- `HUMAN_DETECTION_CATEGORY_ID`: The category id for human detection. Defaults to 0. +- `ACTION_DETECTION_SCORE_THRESHOLD`: The score threshold for action detection. Defaults to 0.5. +- `LABEL_MAP`: The label map used. Defaults to `tools/data/ava/label_map.txt`. +- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Defaults to `cuda:0`. +- `SHORT_SIDE`: The short side used for frame extraction. Defaults to 256. +- `PREDICT_STEPSIZE`: Make a prediction per N frames. Defaults to 8. +- `OUTPUT_STEPSIZE`: Output 1 frame per N frames in the input video. Note that `PREDICT_STEPSIZE % OUTPUT_STEPSIZE == 0`. Defaults to 4. +- `OUTPUT_FPS`: The FPS of demo video output. Defaults to 6. + +Examples: + +Assume that you are located at `$MMACTION2` . + +1. Export an onnx file given the config file and checkpoint. + +```shell +python3 tools/deployment/export_onnx_stdet.py \ + configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \ + https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \ + --output_file slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.onnx \ + --num_frames 8 +``` + +2. 
Use the Faster RCNN as the human detector, the generated `slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.onnx` file as the action detector. Making predictions per 8 frames, and output 1 frame per 4 frames to the output video. The FPS of the output video is 4. + +```shell +python demo/demo_spatiotemporal_det_onnx.py demo/demo.mp4 demo/demo_spatiotemporal_det.mp4 \ + --config configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \ + --onnx-file slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.onnx \ + --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \ + --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \ + --det-score-thr 0.9 \ + --action-score-thr 0.5 \ + --label-map tools/data/ava/label_map.txt \ + --predict-stepsize 8 \ + --output-stepsize 4 \ + --output-fps 6 +``` + ## Inferencer MMAction2 provides a demo script to implement fast prediction for video analysis tasks based on unified inferencer interface, currently only supports action recognition task. diff --git a/demo/demo_spatiotemporal_det_onnx.py b/demo/demo_spatiotemporal_det_onnx.py new file mode 100644 index 0000000000..d1ee9f0edc --- /dev/null +++ b/demo/demo_spatiotemporal_det_onnx.py @@ -0,0 +1,382 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import copy as cp +import os +import os.path as osp +import shutil + +import cv2 +import mmcv +import mmengine +import numpy as np +import onnxruntime +import torch +from mmdet.structures.bbox import bbox2roi +from mmengine import DictAction + +from mmaction.apis import detection_inference + +try: + import moviepy.editor as mpy +except ImportError: + raise ImportError('Please install moviepy to enable output file') + +FONTFACE = cv2.FONT_HERSHEY_DUPLEX +FONTSCALE = 0.5 +FONTCOLOR = (255, 255, 255) # BGR, white +MSGCOLOR = (128, 128, 128) # BGR, gray +THICKNESS = 1 +LINETYPE = 1 + + +def hex2color(h): + """Convert the 6-digit hex string to tuple of 3 int value (RGB)""" + return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16)) + + +plate_blue = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4' +plate_blue = plate_blue.split('-') +plate_blue = [hex2color(h) for h in plate_blue] +plate_green = '004b23-006400-007200-008000-38b000-70e000' +plate_green = plate_green.split('-') +plate_green = [hex2color(h) for h in plate_green] + + +def visualize(frames, annotations, plate=plate_blue, max_num=5): + """Visualize frames with predicted annotations. + + Args: + frames (list[np.ndarray]): Frames for visualization, note that + len(frames) % len(annotations) should be 0. + annotations (list[list[tuple]]): The predicted results. + plate (str): The plate used for visualization. Default: plate_blue. + max_num (int): Max number of labels to visualize for a person box. + Default: 5. + Returns: + list[np.ndarray]: Visualized frames. 
+ """ + + assert max_num + 1 <= len(plate) + plate = [x[::-1] for x in plate] + frames_out = cp.deepcopy(frames) + nf, na = len(frames), len(annotations) + assert nf % na == 0 + nfpa = len(frames) // len(annotations) + anno = None + h, w, _ = frames[0].shape + scale_ratio = np.array([w, h, w, h]) + for i in range(na): + anno = annotations[i] + if anno is None: + continue + for j in range(nfpa): + ind = i * nfpa + j + frame = frames_out[ind] + for ann in anno: + box = ann[0] + label = ann[1] + if not len(label): + continue + score = ann[2] + box = (box * scale_ratio).astype(np.int64) + st, ed = tuple(box[:2]), tuple(box[2:]) + cv2.rectangle(frame, st, ed, plate[0], 2) + for k, lb in enumerate(label): + if k >= max_num: + break + text = abbrev(lb) + text = ': '.join([text, str(score[k])]) + location = (0 + st[0], 18 + k * 18 + st[1]) + textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE, + THICKNESS)[0] + textwidth = textsize[0] + diag0 = (location[0] + textwidth, location[1] - 14) + diag1 = (location[0], location[1] + 2) + cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1) + cv2.putText(frame, text, location, FONTFACE, FONTSCALE, + FONTCOLOR, THICKNESS, LINETYPE) + + return frames_out + + +def frame_extraction(video_path): + """Extract frames given video_path. + + Args: + video_path (str): The video_path. + """ + # Load the video, extract frames into ./tmp/video_name + target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0])) + os.makedirs(target_dir, exist_ok=True) + # Should be able to handle videos up to several hours + frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg') + vid = cv2.VideoCapture(video_path) + frames = [] + frame_paths = [] + flag, frame = vid.read() + cnt = 0 + while flag: + frames.append(frame) + frame_path = frame_tmpl.format(cnt + 1) + frame_paths.append(frame_path) + cv2.imwrite(frame_path, frame) + cnt += 1 + flag, frame = vid.read() + return frame_paths, frames + + +def load_label_map(file_path): + """Load Label Map. + + Args: + file_path (str): The file path of label map. + Returns: + dict: The label map (int -> label name). + """ + lines = open(file_path).readlines() + lines = [x.strip().split(': ') for x in lines] + return {int(x[0]): x[1] for x in lines} + + +def abbrev(name): + """Get the abbreviation of label name: + + 'take (an object) from (a person)' -> 'take ... from ...' + """ + while name.find('(') != -1: + st, ed = name.find('('), name.find(')') + name = name[:st] + '...' + name[ed + 1:] + return name + + +def pack_result(human_detection, result, img_h, img_w): + """Short summary. + + Args: + human_detection (np.ndarray): Human detection result. + result (type): The predicted label of each human proposal. + img_h (int): The image height. + img_w (int): The image width. + Returns: + tuple: Tuple of human proposal, label name and label score. 
+ """ + human_detection[:, 0::2] /= img_w + human_detection[:, 1::2] /= img_h + results = [] + if result is None: + return None + for prop, res in zip(human_detection, result): + res.sort(key=lambda x: -x[1]) + results.append( + (prop.data.cpu().numpy(), [x[0] for x in res], [x[1] + for x in res])) + return results + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMAction2 demo') + parser.add_argument('video', help='video file/url') + parser.add_argument('out_filename', help='output filename') + parser.add_argument( + '--config', + default=('configs/detection/ava_kinetics/slowonly_k700-pre-' + 'r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py'), + help='spatialtemporal detection model config file path') + parser.add_argument( + '--onnx-file', help='spatialtemporal detection onnx file path') + + parser.add_argument( + '--det-config', + default='demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py', + help='human detection config file path (from mmdet)') + parser.add_argument( + '--det-checkpoint', + default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/' + 'faster_rcnn_r50_fpn_2x_coco/' + 'faster_rcnn_r50_fpn_2x_coco_' + 'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'), + help='human detection checkpoint file/url') + parser.add_argument( + '--det-score-thr', + type=float, + default=0.9, + help='the threshold of human detection score') + parser.add_argument( + '--det-cat-id', + type=int, + default=0, + help='the category id for human detection') + parser.add_argument( + '--action-score-thr', + type=float, + default=0.5, + help='the threshold of human action score') + parser.add_argument( + '--label-map', + default='tools/data/ava/label_map.txt', + help='label map file') + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + parser.add_argument( + '--short-side', + type=int, + default=256, + help='specify the short-side length of the image') + parser.add_argument( + '--predict-stepsize', + default=8, + type=int, + help='give out a prediction per n frames') + parser.add_argument( + '--output-stepsize', + default=4, + type=int, + help=('show one frame per n frames in the demo, we should have: ' + 'predict_stepsize % output_stepsize == 0')) + parser.add_argument( + '--output-fps', + default=6, + type=int, + help='the fps of demo video output') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. 
For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + frame_paths, original_frames = frame_extraction(args.video) + num_frame = len(frame_paths) + h, w, _ = original_frames[0].shape + + # resize frames to shortside + new_w, new_h = mmcv.rescale_size((w, h), (args.short_side, np.Inf)) + frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames] + w_ratio, h_ratio = new_w / w, new_h / h + + # Get clip_len, frame_interval and calculate center index of each clip + config = mmengine.Config.fromfile(args.config) + config.merge_from_dict(args.cfg_options) + val_pipeline = config.val_pipeline + + sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0] + clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval'] + window_size = clip_len * frame_interval + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + # Note that it's 1 based here + timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2, + args.predict_stepsize) + + # Load label_map + label_map = load_label_map(args.label_map) + try: + if config['data']['train']['custom_classes'] is not None: + label_map = { + id + 1: label_map[cls] + for id, cls in enumerate(config['data']['train'] + ['custom_classes']) + } + except KeyError: + pass + + # Get Human detection results + center_frames = [frame_paths[ind - 1] for ind in timestamps] + + human_detections, _ = detection_inference(args.det_config, + args.det_checkpoint, + center_frames, + args.det_score_thr, + args.det_cat_id, args.device) + torch.cuda.empty_cache() + for i in range(len(human_detections)): + det = human_detections[i] + det[:, 0:4:2] *= w_ratio + det[:, 1:4:2] *= h_ratio + human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device) + + # Build STDET model + session = onnxruntime.InferenceSession(args.onnx_file) + + predictions = [] + + img_norm_cfg = dict( + mean=np.array(config.model.data_preprocessor.mean), + std=np.array(config.model.data_preprocessor.std), + to_rgb=False) + + print('Performing SpatioTemporal Action Detection for each clip') + assert len(timestamps) == len(human_detections) + prog_bar = mmengine.ProgressBar(len(timestamps)) + for timestamp, proposal in zip(timestamps, human_detections): + if proposal.shape[0] == 0: + predictions.append(None) + continue + + start_frame = timestamp - (clip_len // 2 - 1) * frame_interval + frame_inds = start_frame + np.arange(0, window_size, frame_interval) + frame_inds = list(frame_inds - 1) + imgs = [frames[ind].astype(np.float32) for ind in frame_inds] + _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs] + # THWC -> CTHW -> 1CTHW + input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis] + rois = bbox2roi([proposal]) + + input_feed = { + 'input_tensor': input_array, + 'rois': rois.cpu().data.numpy() + } + outputs = session.run(['cls_score'], input_feed=input_feed) + logits = outputs[0] + scores = 1 / (1 + np.exp(-logits)) + + prediction = [] + # N proposals + for i in range(proposal.shape[0]): + prediction.append([]) + # Perform action score thr + for i in range(scores.shape[1]): + if i not in label_map: + continue + for j in range(proposal.shape[0]): + if scores[j, i] > args.action_score_thr: + prediction[j].append((label_map[i], scores[j, i].item())) + predictions.append(prediction) + prog_bar.update() + + results = [] + for human_detection, prediction in zip(human_detections, predictions): + 
results.append(pack_result(human_detection, prediction, new_h, new_w)) + + def dense_timestamps(timestamps, n): + """Make it nx frames.""" + old_frame_interval = (timestamps[1] - timestamps[0]) + start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 + new_frame_inds = np.arange( + len(timestamps) * n) * old_frame_interval / n + start + return new_frame_inds.astype(np.int) + + dense_n = int(args.predict_stepsize / args.output_stepsize) + frames = [ + cv2.imread(frame_paths[i - 1]) + for i in dense_timestamps(timestamps, dense_n) + ] + print('Performing visualization') + vis_frames = visualize(frames, results) + vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], + fps=args.output_fps) + vid.write_videofile(args.out_filename) + + tmp_frame_dir = osp.dirname(frame_paths[0]) + shutil.rmtree(tmp_frame_dir) + + +if __name__ == '__main__': + main() diff --git a/tools/deployment/export_onnx_stdet.py b/tools/deployment/export_onnx_stdet.py index fc587dbff0..ba0cd2e388 100644 --- a/tools/deployment/export_onnx_stdet.py +++ b/tools/deployment/export_onnx_stdet.py @@ -155,9 +155,9 @@ def main(): args.output_file, input_names=['input_tensor', 'rois'], output_names=['cls_score'], - export_params=False, + export_params=True, do_constant_folding=True, - verbose=True, + verbose=False, opset_version=11, dynamic_axes={ 'input_tensor': { From e036445f482d22ee9e607be88525e9a67bed5b84 Mon Sep 17 00:00:00 2001 From: vansin Date: Thu, 16 Feb 2023 17:50:13 +0800 Subject: [PATCH 02/36] Add twitter discord medium youtube link (#2228) --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index 5e85da2fd5..ab41e0f96e 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,20 @@ +
+ + + + + + + + + + + +
+ ## Introduction MMAction2 is an open-source toolbox for video understanding based on PyTorch. From 933ba42b64cc904c110c7947b90f7a4576ab726d Mon Sep 17 00:00:00 2001 From: Kai Hu Date: Thu, 16 Feb 2023 04:57:28 -0500 Subject: [PATCH 03/36] fix bug (#2227) --- tools/data/anno_txt2json.py | 4 ++-- tools/data/build_audio_features.py | 4 ++-- tools/data/build_file_list.py | 2 +- tools/data/extract_audio.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/data/anno_txt2json.py b/tools/data/anno_txt2json.py index fcefc7778e..f5b1f9f736 100644 --- a/tools/data/anno_txt2json.py +++ b/tools/data/anno_txt2json.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse -import mmcv +import mmengine def parse_args(): @@ -100,4 +100,4 @@ def lines2dictlist(lines, format): result = lines2dictlist(lines, args.format) if args.output is None: args.output = args.annofile.replace('.txt', '.json') - mmcv.dump(result, args.output) + mmengine.dump(result, args.output) diff --git a/tools/data/build_audio_features.py b/tools/data/build_audio_features.py index 05f5978083..28356a0e64 100644 --- a/tools/data/build_audio_features.py +++ b/tools/data/build_audio_features.py @@ -6,7 +6,7 @@ import sys from multiprocessing import Pool -import mmcv +import mmengine import numpy as np from scipy.io import wavfile @@ -295,7 +295,7 @@ def extract_audio_feature(wav_path, audio_tools, mel_out_dir): parser.add_argument('--part', type=str, default='1/1') args = parser.parse_args() - mmcv.mkdir_or_exist(args.spectrogram_save_path) + mmengine.mkdir_or_exist(args.spectrogram_save_path) files = glob.glob( osp.join(args.audio_home_path, '*/' * args.level, '*' + args.ext)) diff --git a/tools/data/build_file_list.py b/tools/data/build_file_list.py index 0ba15e75d0..11a1322854 100644 --- a/tools/data/build_file_list.py +++ b/tools/data/build_file_list.py @@ -5,7 +5,7 @@ import os.path as osp import random -from mmcv.runner import set_random_seed +from mmengine.runner import set_random_seed from tools.data.anno_txt2json import lines2dictlist from tools.data.parse_file_list import (parse_directory, parse_diving48_splits, diff --git a/tools/data/extract_audio.py b/tools/data/extract_audio.py index 6f56de2691..78d95d8ea1 100644 --- a/tools/data/extract_audio.py +++ b/tools/data/extract_audio.py @@ -5,7 +5,7 @@ import os.path as osp from multiprocessing import Pool -import mmcv +import mmengine def extract_audio_wav(line): @@ -47,7 +47,7 @@ def parse_args(): if __name__ == '__main__': args = parse_args() - mmcv.mkdir_or_exist(args.dst_root) + mmengine.mkdir_or_exist(args.dst_root) print('Reading videos from folder: ', args.root) print('Extension of videos: ', args.ext) From 09cd0a72e6b89bae5590b1a4f1cfa50b9c9c38bd Mon Sep 17 00:00:00 2001 From: wxDai Date: Mon, 20 Feb 2023 15:16:40 +0800 Subject: [PATCH 04/36] update skeleton data readme (#2222) --- tools/data/skeleton/README.md | 60 +++++++++------------ tools/data/skeleton/download_annotations.sh | 22 -------- 2 files changed, 25 insertions(+), 57 deletions(-) delete mode 100644 tools/data/skeleton/download_annotations.sh diff --git a/tools/data/skeleton/README.md b/tools/data/skeleton/README.md index 25c7f62892..3ada42e8ef 100644 --- a/tools/data/skeleton/README.md +++ b/tools/data/skeleton/README.md @@ -15,48 +15,38 @@ ## Introduction -We release the skeleton annotations used in [Revisiting Skeleton-based Action Recognition](https://arxiv.org/abs/2104.13586). 
By default, we use [Faster-RCNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py) with ResNet50 backbone for human detection and [HRNet-w32](https://github.com/open-mmlab/mmpose/blob/master/configs/top_down/hrnet/coco/hrnet_w32_coco_256x192.py) for single person pose estimation. For FineGYM, we use Ground-Truth bounding boxes for the athlete instead of detection bounding boxes. Currently, we release the skeleton annotations for FineGYM and NTURGB-D Xsub split. Other annotations will be soo released. +We release the skeleton annotations used in [Revisiting Skeleton-based Action Recognition](https://arxiv.org/abs/2104.13586). By default, we use [Faster-RCNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py) with ResNet50 backbone for human detection and [HRNet-w32](https://github.com/open-mmlab/mmpose/blob/master/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrnet_w32_coco_256x192.py) for single person pose estimation. For FineGYM, we use Ground-Truth bounding boxes for the athlete instead of detection bounding boxes. ## Prepare Annotations -Currently, we support HMDB51, UCF101, FineGYM and NTURGB+D. For FineGYM, you can execute following scripts to prepare the annotations. +We provide links to the pre-processed skeleton annotations, you can directly download them and use them for training & testing. -```shell -bash download_annotations.sh ${DATASET} -``` - -Due to [Conditions of Use](http://rose1.ntu.edu.sg/Datasets/actionRecognition.asp) of the NTURGB+D dataset, we can not directly release the annotations used in our experiments. So that we provide a script to generate pose annotations for videos in NTURGB+D datasets, which generate a dictionary and save it as a single pickle file. You can create a list which contain all annotation dictionaries of corresponding videos and save them as a pickle file. Then you can get the `ntu60_xsub_train.pkl`, `ntu60_xsub_val.pkl`, `ntu120_xsub_train.pkl`, `ntu120_xsub_val.pkl` that we used in training. - -For those who have not enough computations for pose extraction, we provide the outputs of the above pipeline here, corresponding to 4 different splits of NTURGB+D datasets: - -- ntu60_xsub_train: https://download.openmmlab.com/mmaction/posec3d/ntu60_xsub_train.pkl -- ntu60_xsub_val: https://download.openmmlab.com/mmaction/posec3d/ntu60_xsub_val.pkl -- ntu120_xsub_train: https://download.openmmlab.com/mmaction/posec3d/ntu120_xsub_train.pkl -- ntu120_xsub_val: https://download.openmmlab.com/mmaction/posec3d/ntu120_xsub_val.pkl -- hmdb51: https://download.openmmlab.com/mmaction/posec3d/hmdb51.pkl -- ucf101: https://download.openmmlab.com/mmaction/posec3d/ucf101.pkl - -To generate 2D pose annotations for a single video, first, you need to install mmdetection and mmpose from src code. After that, you need to replace the placeholder `mmdet_root` and `mmpose_root` in `ntu_pose_extraction.py` with your installation path. Then you can use following scripts for NTURGB+D video pose extraction: - -```python -python ntu_pose_extraction.py S001C001P001R001A001_rgb.avi S001C001P001R001A001.pkl -``` - -After you get pose annotations for all videos in a dataset split, like `ntu60_xsub_val`. You can gather them into a single list and save the list as `ntu60_xsub_val.pkl`. You can use those larger pickle files for training and testing. 
+- NTURGB+D \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_2d.pkl +- NTURGB+D \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_3d.pkl +- NTURGB+D 120 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_2d.pkl +- NTURGB+D 120 \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_3d.pkl +- GYM \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/gym_2d.pkl + - GYM 2D skeletons are extracted with ground-truth human bounding boxes, which can be downloaded with link: https://download.openmmlab.com/mmaction/pyskl/data/gym/gym_gt_bboxes.pkl. Please cite [PoseConv3D](https://arxiv.org/abs/2104.13586) if you use it in your project. +- UCF101 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ucf101_2d.pkl +- HMDB51 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/hmdb51_2d.pkl +- Diving48 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/diving48_2d.pkl +- Kinetics400 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/k400_2d.pkl (Table of contents only, no skeleton annotations) -## The Format of PoseC3D Annotations +For Kinetics400, since the skeleton annotations are large, we do not provide the direct download links on aliyun. Please use the following link to download the `kpfiles` and extract it under `$MMACTION2/data/k400` for Kinetics-400 training & testing: https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EeyDCVskqLtClMVVwqD53acBF2FEwkctp3vtRbkLfnKSTw?e=B3SZlM -Here we briefly introduce the format of PoseC3D Annotations, we will take `gym_train.pkl` as an example: the content of `gym_train.pkl` is a list of length 20484, each item is a dictionary that is the skeleton annotation of one video. Each dictionary has following fields: +## The Format of Annotations -- keypoint: The keypoint coordinates, which is a numpy array of the shape N (#person) x T (temporal length) x K (#keypoints, 17 in our case) x 2 (x, y coordinate). -- keypoint_score: The keypoint confidence scores, which is a numpy array of the shape N (#person) x T (temporal length) x K (#keypoints, 17 in our case). -- frame_dir: The corresponding video name. -- label: The action category. -- img_shape: The image shape of each frame. -- original_shape: Same as above. -- total_frames: The temporal length of the video. +Each pickle file corresponds to an action recognition dataset. The content of a pickle file is a dictionary with two fields: `split` and `annotations` -For training with your custom dataset, you can refer to [Custom Dataset Training](https://github.com/open-mmlab/mmaction2/blob/master/configs/skeleton/posec3d/custom_dataset_training.md). +1. Split: The value of the `split` field is a dictionary: the keys are the split names, while the values are lists of video identifiers that belong to the specific clip. +2. Annotations: The value of the `annotations` field is a list of skeleton annotations, each skeleton annotation is a dictionary, containing the following fields: + 1. `frame_dir` (str): The identifier of the corresponding video. + 2. `total_frames` (int): The number of frames in this video. + 3. `img_shape` (tuple\[int\]): The shape of a video frame, a tuple with two elements, in the format of (height, width). Only required for 2D skeletons. + 4. `original_shape` (tuple\[int\]): Same as `img_shape`. + 5. `label` (int): The action label. + 6. 
`keypoint` (np.ndarray, with shape \[M x T x V x C\]): The keypoint annotation. M: number of persons; T: number of frames (same as `total_frames`); V: number of keypoints (25 for NTURGB+D 3D skeleton, 17 for CoCo, 18 for OpenPose, etc. ); C: number of dimensions for keypoint coordinates (C=2 for 2D keypoint, C=3 for 3D keypoint). + 7. `keypoint_score` (np.ndarray, with shape \[M x T x V\]): The confidence score of keypoints. Only required for 2D skeletons. ## Visualization @@ -128,4 +118,4 @@ We provide scripts to convert skeleton annotations from third-party projects to - [x] NTU120_XSet - [x] UCF101 - [x] HMDB51 -- [ ] Kinetics +- [x] Kinetics diff --git a/tools/data/skeleton/download_annotations.sh b/tools/data/skeleton/download_annotations.sh deleted file mode 100644 index d57efbceac..0000000000 --- a/tools/data/skeleton/download_annotations.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -DATASET=$1 -if [ "$DATASET" == "gym" ]; then - echo "We are processing $DATASET" -else - echo "Bad Argument, we only support gym now." - exit 0 -fi - -DATA_DIR="../../../data/posec3d/" - -if [[ ! -d "${DATA_DIR}" ]]; then - echo "${DATA_DIR} does not exist. Creating"; - mkdir -p ${DATA_DIR} -fi - -wget https://download.openmmlab.com/mmaction/posec3d/${DATASET}_train.pkl -wget https://download.openmmlab.com/mmaction/posec3d/${DATASET}_val.pkl - -mv ${DATASET}_train.pkl ${DATA_DIR} -mv ${DATASET}_val.pkl ${DATA_DIR} From 14561295381ac9ad4350724ec78dcbfefe7f7f7d Mon Sep 17 00:00:00 2001 From: wxDai Date: Mon, 20 Feb 2023 15:51:58 +0800 Subject: [PATCH 05/36] fix aliases (#2241) --- demo/demo_spatiotemporal_det.py | 2 +- demo/demo_spatiotemporal_det_onnx.py | 2 +- mmaction/datasets/transforms/pose_transforms.py | 2 +- mmaction/evaluation/functional/accuracy.py | 4 ++-- mmaction/evaluation/functional/ava_evaluation/metrics.py | 4 ++-- mmaction/evaluation/functional/eval_detection.py | 4 ++-- tests/evaluation/metrics/test_metric_utils.py | 2 +- tools/data/activitynet/process_annotations.py | 6 +++--- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/demo/demo_spatiotemporal_det.py b/demo/demo_spatiotemporal_det.py index 009a9475a6..5ec42e7856 100644 --- a/demo/demo_spatiotemporal_det.py +++ b/demo/demo_spatiotemporal_det.py @@ -378,7 +378,7 @@ def dense_timestamps(timestamps, n): start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 new_frame_inds = np.arange( len(timestamps) * n) * old_frame_interval / n + start - return new_frame_inds.astype(np.int) + return new_frame_inds.astype(np.int64) dense_n = int(args.predict_stepsize / args.output_stepsize) frames = [ diff --git a/demo/demo_spatiotemporal_det_onnx.py b/demo/demo_spatiotemporal_det_onnx.py index d1ee9f0edc..6e5394e173 100644 --- a/demo/demo_spatiotemporal_det_onnx.py +++ b/demo/demo_spatiotemporal_det_onnx.py @@ -361,7 +361,7 @@ def dense_timestamps(timestamps, n): start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 new_frame_inds = np.arange( len(timestamps) * n) * old_frame_interval / n + start - return new_frame_inds.astype(np.int) + return new_frame_inds.astype(np.int64) dense_n = int(args.predict_stepsize / args.output_stepsize) frames = [ diff --git a/mmaction/datasets/transforms/pose_transforms.py b/mmaction/datasets/transforms/pose_transforms.py index 1740a18575..cff9f90112 100644 --- a/mmaction/datasets/transforms/pose_transforms.py +++ b/mmaction/datasets/transforms/pose_transforms.py @@ -1157,7 +1157,7 @@ def transform(self, results: Dict) -> Dict: transitional[i] = transitional[i - 1] = True if 
num_persons[i] != num_persons[i + 1]: transitional[i] = transitional[i + 1] = True - inds_int = inds.astype(np.int) + inds_int = inds.astype(np.int64) coeff = np.array([transitional[i] for i in inds_int]) inds = (coeff * inds_int + (1 - coeff) * inds).astype(np.float32) diff --git a/mmaction/evaluation/functional/accuracy.py b/mmaction/evaluation/functional/accuracy.py index 4b7f6dd52a..aa28bd486b 100644 --- a/mmaction/evaluation/functional/accuracy.py +++ b/mmaction/evaluation/functional/accuracy.py @@ -166,7 +166,7 @@ def mmit_mean_average_precision(scores, labels): sample. Returns: - np.float: The MMIT style mean average precision. + np.float64: The MMIT style mean average precision. """ results = [] for score, label in zip(scores, labels): @@ -186,7 +186,7 @@ def mean_average_precision(scores, labels): sample. Returns: - np.float: The mean average precision. + np.float64: The mean average precision. """ results = [] scores = np.stack(scores).T diff --git a/mmaction/evaluation/functional/ava_evaluation/metrics.py b/mmaction/evaluation/functional/ava_evaluation/metrics.py index 4d566accb5..ffbe589454 100644 --- a/mmaction/evaluation/functional/ava_evaluation/metrics.py +++ b/mmaction/evaluation/functional/ava_evaluation/metrics.py @@ -35,7 +35,7 @@ def compute_precision_recall(scores, labels, num_gt): instances. This value is None if no ground truth labels are present. """ - if (not isinstance(labels, np.ndarray) or labels.dtype != np.bool + if (not isinstance(labels, np.ndarray) or labels.dtype != bool or len(labels.shape) != 1): raise ValueError('labels must be single dimension bool numpy array') @@ -90,7 +90,7 @@ def compute_average_precision(precision, recall): if not isinstance(precision, np.ndarray) or not isinstance( recall, np.ndarray): raise ValueError('precision and recall must be numpy array') - if precision.dtype != np.float or recall.dtype != np.float: + if precision.dtype != np.float64 or recall.dtype != np.float64: raise ValueError('input must be float numpy array.') if len(precision) != len(recall): raise ValueError('precision and recall must be of the same size.') diff --git a/mmaction/evaluation/functional/eval_detection.py b/mmaction/evaluation/functional/eval_detection.py index 2af3ada0db..b081d52b9b 100644 --- a/mmaction/evaluation/functional/eval_detection.py +++ b/mmaction/evaluation/functional/eval_detection.py @@ -220,8 +220,8 @@ def compute_average_precision_detection(ground_truth, if fp[t_idx, idx] == 0 and tp[t_idx, idx] == 0: fp[t_idx, idx] = 1 - tp_cumsum = np.cumsum(tp, axis=1).astype(np.float) - fp_cumsum = np.cumsum(fp, axis=1).astype(np.float) + tp_cumsum = np.cumsum(tp, axis=1).astype(np.float64) + fp_cumsum = np.cumsum(fp, axis=1).astype(np.float64) recall_cumsum = tp_cumsum / num_positive precision_cumsum = tp_cumsum / (tp_cumsum + fp_cumsum) diff --git a/tests/evaluation/metrics/test_metric_utils.py b/tests/evaluation/metrics/test_metric_utils.py index 091a728bc4..5eeb12e199 100644 --- a/tests/evaluation/metrics/test_metric_utils.py +++ b/tests/evaluation/metrics/test_metric_utils.py @@ -151,7 +151,7 @@ def gt_confusion_matrix(gt_labels, pred_labels, normalize=None): confusion_mat = np.delete(confusion_mat, del_index, axis=1) if normalize is not None: - confusion_mat = np.array(confusion_mat, dtype=np.float) + confusion_mat = np.array(confusion_mat, dtype=np.float64) m, n = confusion_mat.shape if normalize == 'true': for i in range(m): diff --git a/tools/data/activitynet/process_annotations.py b/tools/data/activitynet/process_annotations.py index 
09ed5b5c8f..9374281a64 100644 --- a/tools/data/activitynet/process_annotations.py +++ b/tools/data/activitynet/process_annotations.py @@ -18,7 +18,7 @@ def load_json(file): anno_database = load_json(ann_file) -video_record = np.loadtxt(info_file, dtype=np.str, delimiter=',', skiprows=1) +video_record = np.loadtxt(info_file, dtype=str, delimiter=',', skiprows=1) video_dict_train = {} video_dict_val = {} @@ -29,8 +29,8 @@ def load_json(file): video_name = video_item[0] video_info = anno_database[video_name] video_subset = video_item[5] - video_info['fps'] = video_item[3].astype(np.float) - video_info['rfps'] = video_item[4].astype(np.float) + video_info['fps'] = video_item[3].astype(np.float64) + video_info['rfps'] = video_item[4].astype(np.float64) video_dict_full[video_name] = video_info if video_subset == 'training': video_dict_train[video_name] = video_info From c99ad659aeb8a860823d2536dfbe0ad18943dae5 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Mon, 20 Feb 2023 19:28:53 +0800 Subject: [PATCH 06/36] [Enhance] use tmpfile to avoid remaining tmp directory when demo break (#2236) --- demo/README.md | 1 + demo/demo_skeleton.py | 11 ++++---- demo/demo_spatiotemporal_det.py | 38 +++++----------------------- demo/demo_spatiotemporal_det_onnx.py | 38 +++++----------------------- mmaction/utils/misc.py | 24 +++++++++++------- tests/utils/test_misc.py | 22 ++++++++++++++++ 6 files changed, 56 insertions(+), 78 deletions(-) create mode 100644 tests/utils/test_misc.py diff --git a/demo/README.md b/demo/README.md index 3252da35be..447789d37d 100644 --- a/demo/README.md +++ b/demo/README.md @@ -8,6 +8,7 @@ - [Webcam demo](#webcam-demo): A demo script to implement real-time action recognition from a web camera. - [Skeleton-based Action Recognition Demo](#skeleton-based-action-recognition-demo): A demo script to predict the skeleton-based action recognition result using a single video. - [SpatioTemporal Action Detection Video Demo](#spatiotemporal-action-detection-video-demo): A demo script to predict the spatiotemporal action detection result using a single video. +- [SpatioTemporal Action Detection ONNX Video Demo](#spatiotemporal-action-detection-onnx-video-demo): A demo script to predict the SpatioTemporal Action Detection result using the onnx file instead of building the PyTorch models. - [Inferencer Demo](#inferencer): A demo script to implement fast predict for video analysis tasks based on unified inferencer interface. ## Modify configs through script arguments diff --git a/demo/demo_skeleton.py b/demo/demo_skeleton.py index 57c84c90a3..3dc1fb215a 100644 --- a/demo/demo_skeleton.py +++ b/demo/demo_skeleton.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import argparse -import os.path as osp -import shutil +import tempfile import cv2 import mmcv @@ -128,7 +127,10 @@ def visualize(args, frames, data_samples, action_label): def main(): args = parse_args() - frame_paths, frames = frame_extract(args.video, args.short_side) + + tmp_dir = tempfile.TemporaryDirectory() + frame_paths, frames = frame_extract(args.video, args.short_side, + tmp_dir.name) num_frame = len(frame_paths) h, w, _ = frames[0].shape @@ -180,8 +182,7 @@ def main(): visualize(args, frames, pose_data_samples, action_label) - tmp_frame_dir = osp.dirname(frame_paths[0]) - shutil.rmtree(tmp_frame_dir) + tmp_dir.cleanup() if __name__ == '__main__': diff --git a/demo/demo_spatiotemporal_det.py b/demo/demo_spatiotemporal_det.py index 5ec42e7856..0c5091dab2 100644 --- a/demo/demo_spatiotemporal_det.py +++ b/demo/demo_spatiotemporal_det.py @@ -1,9 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse import copy as cp -import os -import os.path as osp -import shutil +import tempfile import cv2 import mmcv @@ -17,6 +15,7 @@ from mmaction.apis import detection_inference from mmaction.registry import MODELS from mmaction.structures import ActionDataSample +from mmaction.utils import frame_extract try: import moviepy.editor as mpy @@ -101,32 +100,6 @@ def visualize(frames, annotations, plate=plate_blue, max_num=5): return frames_out -def frame_extraction(video_path): - """Extract frames given video_path. - - Args: - video_path (str): The video_path. - """ - # Load the video, extract frames into ./tmp/video_name - target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0])) - os.makedirs(target_dir, exist_ok=True) - # Should be able to handle videos up to several hours - frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg') - vid = cv2.VideoCapture(video_path) - frames = [] - frame_paths = [] - flag, frame = vid.read() - cnt = 0 - while flag: - frames.append(frame) - frame_path = frame_tmpl.format(cnt + 1) - frame_paths.append(frame_path) - cv2.imwrite(frame_path, frame) - cnt += 1 - flag, frame = vid.read() - return frame_paths, frames - - def load_label_map(file_path): """Load Label Map. @@ -259,7 +232,9 @@ def parse_args(): def main(): args = parse_args() - frame_paths, original_frames = frame_extraction(args.video) + tmp_dir = tempfile.TemporaryDirectory() + frame_paths, original_frames = frame_extract( + args.video, out_dir=tmp_dir.name) num_frame = len(frame_paths) h, w, _ = original_frames[0].shape @@ -391,8 +366,7 @@ def dense_timestamps(timestamps, n): fps=args.output_fps) vid.write_videofile(args.out_filename) - tmp_frame_dir = osp.dirname(frame_paths[0]) - shutil.rmtree(tmp_frame_dir) + tmp_dir.cleanup() if __name__ == '__main__': diff --git a/demo/demo_spatiotemporal_det_onnx.py b/demo/demo_spatiotemporal_det_onnx.py index 6e5394e173..7c40e9c64e 100644 --- a/demo/demo_spatiotemporal_det_onnx.py +++ b/demo/demo_spatiotemporal_det_onnx.py @@ -1,9 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse import copy as cp -import os -import os.path as osp -import shutil +import tempfile import cv2 import mmcv @@ -15,6 +13,7 @@ from mmengine import DictAction from mmaction.apis import detection_inference +from mmaction.utils import frame_extract try: import moviepy.editor as mpy @@ -99,32 +98,6 @@ def visualize(frames, annotations, plate=plate_blue, max_num=5): return frames_out -def frame_extraction(video_path): - """Extract frames given video_path. - - Args: - video_path (str): The video_path. 
- """ - # Load the video, extract frames into ./tmp/video_name - target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0])) - os.makedirs(target_dir, exist_ok=True) - # Should be able to handle videos up to several hours - frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg') - vid = cv2.VideoCapture(video_path) - frames = [] - frame_paths = [] - flag, frame = vid.read() - cnt = 0 - while flag: - frames.append(frame) - frame_path = frame_tmpl.format(cnt + 1) - frame_paths.append(frame_path) - cv2.imwrite(frame_path, frame) - cnt += 1 - flag, frame = vid.read() - return frame_paths, frames - - def load_label_map(file_path): """Load Label Map. @@ -253,7 +226,9 @@ def parse_args(): def main(): args = parse_args() - frame_paths, original_frames = frame_extraction(args.video) + tmp_dir = tempfile.TemporaryDirectory() + frame_paths, original_frames = frame_extract( + args.video, out_dir=tmp_dir.name) num_frame = len(frame_paths) h, w, _ = original_frames[0].shape @@ -374,8 +349,7 @@ def dense_timestamps(timestamps, n): fps=args.output_fps) vid.write_videofile(args.out_filename) - tmp_frame_dir = osp.dirname(frame_paths[0]) - shutil.rmtree(tmp_frame_dir) + tmp_dir.cleanup() if __name__ == '__main__': diff --git a/mmaction/utils/misc.py b/mmaction/utils/misc.py index 3c34df3f68..f14b8a51c2 100644 --- a/mmaction/utils/misc.py +++ b/mmaction/utils/misc.py @@ -4,6 +4,7 @@ import os.path as osp import random import string +from typing import Optional import cv2 import mmcv @@ -33,18 +34,23 @@ def get_shm_dir() -> str: return '/dev/shm' -def frame_extract(video_path: str, short_side: int): +def frame_extract(video_path: str, + short_side: Optional[int] = None, + out_dir: str = './tmp'): """Extract frames given video_path. Args: video_path (str): The video path. - short_side (int): The short-side of the image. + short_side (int): Target short-side of the output image. + Defaults to None, means keep original shape. + out_dir (str): The output directory. Defaults to ``'./tmp'``. """ - # Load the video, extract frames into ./tmp/video_name - target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0])) + # Load the video, extract frames into OUT_DIR/video_name + target_dir = osp.join(out_dir, osp.basename(osp.splitext(video_path)[0])) os.makedirs(target_dir, exist_ok=True) # Should be able to handle videos up to several hours frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg') + assert osp.exists(video_path), f'file not exit {video_path}' vid = cv2.VideoCapture(video_path) frames = [] frame_paths = [] @@ -52,11 +58,11 @@ def frame_extract(video_path: str, short_side: int): cnt = 0 new_h, new_w = None, None while flag: - if new_h is None: - h, w, _ = frame.shape - new_w, new_h = mmcv.rescale_size((w, h), (short_side, np.Inf)) - - frame = mmcv.imresize(frame, (new_w, new_h)) + if short_side is not None: + if new_h is None: + h, w, _ = frame.shape + new_w, new_h = mmcv.rescale_size((w, h), (short_side, np.Inf)) + frame = mmcv.imresize(frame, (new_w, new_h)) frames.append(frame) frame_path = frame_tmpl.format(cnt + 1) diff --git a/tests/utils/test_misc.py b/tests/utils/test_misc.py new file mode 100644 index 0000000000..e0886162a6 --- /dev/null +++ b/tests/utils/test_misc.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os +import os.path as osp +from tempfile import TemporaryDirectory + +from mmaction.utils import frame_extract + + +def test_frame_extract(): + data_prefix = osp.normpath(osp.join(osp.dirname(__file__), '../data')) + video_path = osp.join(data_prefix, 'test.mp4') + with TemporaryDirectory() as tmp_dir: + # assign short_side + frame_paths, frames = frame_extract( + video_path, short_side=100, out_dir=tmp_dir) + assert osp.exists(tmp_dir) and \ + len(os.listdir(f'{tmp_dir}/test')) == len(frame_paths) + assert min(frames[0].shape[:2]) == 100 + # default short_side + frame_paths, frames = frame_extract(video_path, out_dir=tmp_dir) + assert osp.exists(tmp_dir) and \ + len(os.listdir(f'{tmp_dir}/test')) == len(frame_paths) From 3807cb3db59eaa063bef76243913a3ae291720c0 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Fri, 24 Feb 2023 17:42:42 +0800 Subject: [PATCH 07/36] [fix] rename fps in DecordInit to avoid overwriting fps in SampleAVAFrame (#2251) --- mmaction/datasets/transforms/loading.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mmaction/datasets/transforms/loading.py b/mmaction/datasets/transforms/loading.py index 8305a490b8..9e66cd7f3f 100644 --- a/mmaction/datasets/transforms/loading.py +++ b/mmaction/datasets/transforms/loading.py @@ -356,7 +356,7 @@ def transform(self, results: dict) -> dict: total_frames = results['total_frames'] # if can't get fps, same value of `fps` and `target_fps` # will perform nothing - fps = results.get('fps') + fps = results.get('avg_fps') if self.target_fps is None or not fps: fps_scale_ratio = 1.0 else: @@ -1111,7 +1111,7 @@ def transform(self, results): file_obj = io.BytesIO(self.file_client.get(results['filename'])) container = decord.VideoReader(file_obj, num_threads=self.num_threads) - results['fps'] = container.get_avg_fps() + results['avg_fps'] = container.get_avg_fps() results['video_reader'] = container results['total_frames'] = len(container) return results From 951dfc0ff8f4e30fe35e4e093690bf272fbae1e1 Mon Sep 17 00:00:00 2001 From: wxDai Date: Wed, 1 Mar 2023 16:10:46 +0800 Subject: [PATCH 08/36] [Feature] Support RGBPoseConv3D (#2182) --- configs/skeleton/posec3d/README.md | 29 +- configs/skeleton/posec3d/metafile.yml | 26 +- .../skeleton/posec3d/rgbpose_conv3d/README.md | 107 ++++ .../rgbpose_conv3d/merge_pretrain.ipynb | 267 +++++++++ .../posec3d/rgbpose_conv3d/pose_only.py | 127 +++++ .../posec3d/rgbpose_conv3d/rgb_only.py | 126 ++++ .../posec3d/rgbpose_conv3d/rgbpose_conv3d.py | 190 +++++++ ...0_8xb16-u48-120e_hmdb51-split1-keypoint.py | 8 +- ...0_8xb16-u48-120e_ucf101-split1-keypoint.py | 8 +- ...lowonly_r50_8xb16-u48-240e_gym-keypoint.py | 33 +- .../slowonly_r50_8xb16-u48-240e_gym-limb.py | 39 +- ..._r50_8xb16-u48-240e_ntu60-xsub-keypoint.py | 36 +- ...only_r50_8xb16-u48-240e_ntu60-xsub-limb.py | 42 +- mmaction/datasets/pose_dataset.py | 52 +- mmaction/datasets/transforms/__init__.py | 6 +- mmaction/datasets/transforms/formatting.py | 121 +++- mmaction/datasets/transforms/loading.py | 107 ++-- .../datasets/transforms/pose_transforms.py | 538 +++++++++++++----- mmaction/evaluation/metrics/acc_metric.py | 71 ++- mmaction/models/backbones/__init__.py | 4 +- mmaction/models/backbones/resnet3d.py | 204 +++---- .../models/backbones/resnet3d_slowfast.py | 384 ++++++------- .../models/backbones/resnet3d_slowonly.py | 6 - mmaction/models/backbones/rgbposeconv3d.py | 205 +++++++ .../models/data_preprocessors/__init__.py | 3 +- .../data_preprocessors/data_preprocessor.py | 41 +- 
.../multimodal_data_preprocessor.py | 42 ++ mmaction/models/heads/__init__.py | 3 +- mmaction/models/heads/base.py | 87 +-- mmaction/models/heads/rgbpose_head.py | 240 ++++++++ mmaction/models/recognizers/__init__.py | 3 +- .../models/recognizers/recognizer3d_mm.py | 50 ++ mmaction/models/utils/blending_utils.py | 45 +- tests/datasets/transforms/test_formating.py | 24 +- tests/datasets/transforms/test_loading.py | 16 +- .../transforms/test_pose_transforms.py | 218 +++++-- .../backbones/test_resnet3d_slowfast.py | 29 +- .../backbones/test_resnet3d_slowonly.py | 4 +- tests/models/backbones/test_rgbposeconv3d.py | 27 + tests/models/data_preprocessors/__init__.py | 1 + .../test_data_preprocessor.py | 97 ++++ .../test_multimodal_data_preprocessor.py | 94 +++ tests/models/heads/test_rgbpose_head.py | 41 ++ tools/data/skeleton/compress_nturgbd.py | 42 ++ 44 files changed, 3039 insertions(+), 804 deletions(-) create mode 100644 configs/skeleton/posec3d/rgbpose_conv3d/README.md create mode 100644 configs/skeleton/posec3d/rgbpose_conv3d/merge_pretrain.ipynb create mode 100644 configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py create mode 100644 configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py create mode 100644 configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py create mode 100644 mmaction/models/backbones/rgbposeconv3d.py create mode 100644 mmaction/models/data_preprocessors/multimodal_data_preprocessor.py create mode 100644 mmaction/models/heads/rgbpose_head.py create mode 100644 mmaction/models/recognizers/recognizer3d_mm.py create mode 100644 tests/models/backbones/test_rgbposeconv3d.py create mode 100644 tests/models/data_preprocessors/__init__.py create mode 100644 tests/models/data_preprocessors/test_data_preprocessor.py create mode 100644 tests/models/data_preprocessors/test_multimodal_data_preprocessor.py create mode 100644 tests/models/heads/test_rgbpose_head.py create mode 100644 tools/data/skeleton/compress_nturgbd.py diff --git a/configs/skeleton/posec3d/README.md b/configs/skeleton/posec3d/README.md index 2fe5f579f0..0e45528345 100644 --- a/configs/skeleton/posec3d/README.md +++ b/configs/skeleton/posec3d/README.md @@ -54,29 +54,30 @@ Human skeleton, as a compact representation of human action, has received increa ### FineGYM -| frame sampling strategy | pseudo heatmap | gpus | backbone | Mean Top-1 | testing protocol | config | ckpt | log | -| :---------------------: | :------------: | :--: | :----------: | :--------: | :---------------: | :---------------------------------------: | :--------------------------------------: | :-------------------------------------: | -| uniform 48 | keypoint | 8 | SlowOnly-R50 | 93.4 | 10 clips x 1 crop | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint_20220815-da338c58.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint.log) | -| uniform 48 | limb | 8 | SlowOnly-R50 | 93.7 | 10 clips x 1 crop | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb_20220815-2e6e3c5c.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb.log) | +| frame sampling strategy | pseudo heatmap | gpus | backbone | Mean Top-1 | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :------------: | :--: | :----------: | :--------: | :--------------: | :---: | :----: | :------------------------------------: | :----------------------------------: | :----------------------------------: | +| uniform 48 | keypoint | 8 | SlowOnly-R50 | 93.5 | 10 clips | 20.6G | 2.0M | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint_20220815-da338c58.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint.log) | +| uniform 48 | limb | 8 | SlowOnly-R50 | 93.6 | 10 clips | 20.6G | 2.0M | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb_20220815-2e6e3c5c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb.log) | ### NTU60_XSub -| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | -| :---------------------: | :------------: | :--: | :----------: | :------: | :---------------: | :----------------------------------------: | :--------------------------------------: | :--------------------------------------: | -| uniform 48 | keypoint | 8 | SlowOnly-R50 | 93.6 | 10 clips x 1 crop | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint_20220815-38db104b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.log) | -| uniform 48 | limb | 8 | SlowOnly-R50 | 93.5 | 10 clips x 1 crop | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb_20220815-af2f119a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.log) | +| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :------------: | :--: | :----------: | :------: | :--------------: | :---: | :----: | :-------------------------------------: | :-----------------------------------: | :----------------------------------: | +| uniform 48 | keypoint | 8 | SlowOnly-R50 | 93.6 | 10 clips | 20.6G | 2.0M | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint_20220815-38db104b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.log) | +| uniform 48 | limb | 8 | SlowOnly-R50 | 93.5 | 10 clips | 20.6G | 2.0M | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb_20220815-af2f119a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.log) | +| | Fusion | | | 94.0 | | | | | | | ### UCF101 -| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | -| :---------------------: | :------------: | :--: | :----------: | :------: | :---------------: | :----------------------------------------: | :--------------------------------------: | :--------------------------------------: | -| uniform 48 | keypoint | 8 | SlowOnly-R50 | 86.9 | 10 clips x 1 crop | [config](/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint_20220815-9972260d.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.log) | +| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :------------: | :--: | :----------: | :------: | :--------------: | :---: | :----: | :-------------------------------------: | :-----------------------------------: | :----------------------------------: | +| uniform 48 | keypoint | 8 | SlowOnly-R50 | 86.8 | 10 clips | 14.6G | 3.1M | [config](/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint_20220815-9972260d.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.log) | ### HMDB51 -| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | -| :---------------------: | :------------: | :--: | :----------: | :------: | :---------------: | :----------------------------------------: | :--------------------------------------: | :--------------------------------------: | -| uniform 48 | keypoint | 8 | SlowOnly-R50 | 69.2 | 10 clips x 1 crop | [config](/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint_20220815-17eaa484.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.log) | +| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :------------: | :--: | :----------: | :------: | :--------------: | :---: | :----: | :-------------------------------------: | :-----------------------------------: | :----------------------------------: | +| uniform 48 | keypoint | 8 | SlowOnly-R50 | 69.6 | 10 clips | 14.6G | 3.0M | [config](/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint_20220815-17eaa484.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.log) | 1. The **gpus** indicates the number of gpu we used to get the checkpoint. It is noteworthy that the configs we provide are used for 8 gpus as default. According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU, diff --git a/configs/skeleton/posec3d/metafile.yml b/configs/skeleton/posec3d/metafile.yml index 7a3d3b9b20..b949a23d47 100644 --- a/configs/skeleton/posec3d/metafile.yml +++ b/configs/skeleton/posec3d/metafile.yml @@ -13,7 +13,8 @@ Models: Architecture: SlowOnly-R50 Batch Size: 16 Epochs: 240 - Parameters: 2044867 + FLOPs: 20.6G + Parameters: 2.0M Training Data: FineGYM Training Resources: 8 GPUs pseudo heatmap: keypoint @@ -21,7 +22,7 @@ Models: - Dataset: FineGYM Task: Skeleton-based Action Recognition Metrics: - mean Top 1 Accuracy: 93.4 + mean Top 1 Accuracy: 93.5 Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint.log Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint_20220815-da338c58.pth @@ -32,7 +33,8 @@ Models: Architecture: SlowOnly-R50 Batch Size: 16 Epochs: 240 - Parameters: 2044867 + FLOPs: 20.6G + Parameters: 2.0M Training Data: FineGYM Training Resources: 8 GPUs pseudo heatmap: limb @@ -40,7 +42,7 @@ Models: - Dataset: FineGYM Task: Skeleton-based Action Recognition Metrics: - mean Top 1 Accuracy: 93.7 + mean Top 1 Accuracy: 93.6 Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb.log Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb_20220815-2e6e3c5c.pth @@ -51,7 +53,8 @@ Models: Architecture: SlowOnly-R50 Batch Size: 16 Epochs: 240 - Parameters: 2024860 + FLOPs: 20.6G + Parameters: 2.0M 
Training Data: NTU60-XSub Training Resources: 8 GPUs pseudo heatmap: keypoint @@ -70,7 +73,8 @@ Models: Architecture: SlowOnly-R50 Batch Size: 16 Epochs: 240 - Parameters: 2024860 + FLOPs: 20.6G + Parameters: 2.0M Training Data: NTU60-XSub Training Resources: 8 GPUs pseudo heatmap: limb @@ -89,7 +93,8 @@ Models: Architecture: SlowOnly-R50 Batch Size: 16 Epochs: 120 - Parameters: 3029984 + FLOPs: 14.6G + Parameters: 3.0M Training Data: HMDB51 Training Resources: 8 GPUs pseudo heatmap: keypoint @@ -97,7 +102,7 @@ Models: - Dataset: HMDB51 Task: Skeleton-based Action Recognition Metrics: - Top 1 Accuracy: 69.2 + Top 1 Accuracy: 69.6 Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.log Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint_20220815-17eaa484.pth @@ -108,7 +113,8 @@ Models: Architecture: SlowOnly-R50 Batch Size: 16 Epochs: 120 - Parameters: 3055584 + FLOPs: 14.6G + Parameters: 3.1M Training Data: UCF101 Training Resources: 8 GPUs pseudo heatmap: keypoint @@ -116,6 +122,6 @@ Models: - Dataset: UCF101 Task: Skeleton-based Action Recognition Metrics: - Top 1 Accuracy: 86.9 + Top 1 Accuracy: 86.8 Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.log Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint_20220815-9972260d.pth diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/README.md b/configs/skeleton/posec3d/rgbpose_conv3d/README.md new file mode 100644 index 0000000000..37b4cd489d --- /dev/null +++ b/configs/skeleton/posec3d/rgbpose_conv3d/README.md @@ -0,0 +1,107 @@ +# RGBPoseConv3D + +## Introduction + +RGBPoseConv3D is a framework that jointly use 2D human skeletons and RGB appearance for human action recognition. It is a 3D CNN with two streams, with the architecture borrowed from SlowFast. In RGBPoseConv3D: + +- The RGB stream corresponds to the `slow` stream in SlowFast; The Skeleton stream corresponds to the `fast` stream in SlowFast. +- The input resolution of RGB frames is `4x` larger than the pseudo heatmaps. +- Bilateral connections are used for early feature fusion between the two modalities. + +
+ +
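To make the bullet points above concrete, here is a minimal, self-contained sketch of the bilateral (early) fusion idea. It is an illustration only, not the actual `RGBPoseConv3D` code added by this patch; the tensor shapes and layer choices are assumptions picked to match the 4x resolution and frame-rate ratio described above.

```python
# Conceptual sketch of bilateral lateral connections between an RGB stream and
# a pose-heatmap stream (illustrative shapes, not the MMAction2 implementation).
import torch
import torch.nn as nn

# RGB pathway feature: few frames, high spatial resolution, more channels.
rgb_feat = torch.randn(1, 256, 8, 56, 56)    # (N, C, T, H, W)
# Pose pathway feature: many frames, low spatial resolution, fewer channels.
pose_feat = torch.randn(1, 64, 32, 14, 14)

# Pose -> RGB lateral: compress time (stride 4), expand channels, upsample space.
pose_to_rgb = nn.Sequential(
    nn.Conv3d(64, 256, kernel_size=(7, 1, 1), stride=(4, 1, 1), padding=(3, 0, 0)),
    nn.Upsample(size=(8, 56, 56)))
# RGB -> Pose lateral: the reverse direction.
rgb_to_pose = nn.Sequential(
    nn.Conv3d(256, 64, kernel_size=(7, 1, 1), padding=(3, 0, 0)),
    nn.Upsample(size=(32, 14, 14)))

fused_rgb = rgb_feat + pose_to_rgb(pose_feat)    # early fusion into the RGB stream
fused_pose = pose_feat + rgb_to_pose(rgb_feat)   # early fusion into the pose stream
print(fused_rgb.shape, fused_pose.shape)
```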
+
+## Citation
+
+```BibTeX
+@inproceedings{duan2022revisiting,
+  title={Revisiting skeleton-based action recognition},
+  author={Duan, Haodong and Zhao, Yue and Chen, Kai and Lin, Dahua and Dai, Bo},
+  booktitle={CVPR},
+  pages={2969--2978},
+  year={2022}
+}
+```
+
+## How to train RGBPoseConv3D (on NTURGB+D, for example)?
+
+#### Step 0. Data Preparation
+
+Besides the skeleton annotations, you also need RGB videos to train RGBPoseConv3D. Download them from the official website of [NTURGB+D](https://rose1.ntu.edu.sg/dataset/actionRecognition/) and put them in `$MMACTION2/data/nturgbd_raw`. After that, use the provided script to compress the raw videos (from `1920x1080` to `960x540`) and change the suffix to `.mp4`:
+
+```bash
+# This step is mandatory, unless you know how to modify the code & config to make it work for raw videos!
+python tools/data/skeleton/compress_nturgbd.py
+```
+
+After that, you will find the processed videos in `$MMACTION2/data/nturgbd_videos`, named like `S001C001P001R001A001.mp4`.
+
+#### Step 1. Pretraining
+
+You first need to train the RGB-only and Pose-only models on the target dataset; the pretrained checkpoints will be used to initialize the RGBPoseConv3D model.
+
+You can either train these two models from scratch with the provided config files:
+
+```bash
+# We train each model for 180 epochs. By default, we use 8 GPUs.
+# Train the RGB-only model
+bash tools/dist_train.sh configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py 8
+# Train the Pose-only model
+bash tools/dist_train.sh configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py 8
+```
+
+or directly download and use the provided pretrained models:
+
+| Dataset | Config | Checkpoint | Top-1 (1 clip testing) | Top-1 (10 clip testing) |
+| :-----------: | :------------------------------------------------------------------: | :------------------------------------------------------------------------: | :--------------------: | :---------------------: |
+| NTURGB+D XSub | [rgb_config](/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py) | [rgb_ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230228-576b9f86.pth) | 94.9 | 95.4 |
+| NTURGB+D XSub | [pose_config](/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py) | [pose_ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230228-fa40054e.pth) | 93.1 | 93.5 |
+
+#### Step 2. Generate the initializing weight for RGBPoseConv3D
+
+You can use the provided [IPython notebook](/configs/skeleton/posec3d/rgbpose_conv3d/merge_pretrain.ipynb) to merge the two pretrained models into a single `rgbpose_conv3d_init.pth`.
+
+You can do this on your own or directly download and use the provided [rgbpose_conv3d_init.pth](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d_init_20230228-09b7684b.pth).
+
+#### Step 3. Finetune RGBPoseConv3D
+
+You can use the provided config file to finetune RGBPoseConv3D jointly with the two modalities (RGB & Pose):
+
+```bash
+# We finetune RGBPoseConv3D for 20 epochs on NTURGB+D XSub (8 GPUs)
+bash tools/dist_train.sh configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py 8
+# After finetuning, you can test the model with the following command (8 GPUs)
+bash tools/dist_test.sh configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py $CKPT 8 --dump result.pkl
+```
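The fused numbers reported in the Results section below combine the action scores of the two modalities with a 1:1 ratio. As a reference, here is a minimal sketch of that score-level fusion; the arrays are placeholders, and how you collect the per-modality scores (for example, from dumped test results) is up to you.

```python
# Illustrative 1:1 score fusion of two modalities (placeholder data, not tied
# to any particular dump format).
import numpy as np

num_samples, num_classes = 4, 60                         # NTU RGB+D 60 has 60 classes
rgb_scores = np.random.rand(num_samples, num_classes)    # stand-in RGB softmax scores
pose_scores = np.random.rand(num_samples, num_classes)   # stand-in Pose softmax scores

fused_scores = (rgb_scores + pose_scores) / 2            # 1:1 ratio
predictions = fused_scores.argmax(axis=1)
```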
+
+**Notes**
+
+1. We use a linear scaling learning rate (`Initial LR` ∝ `Batch Size`). If you change the training batch size, remember to change the initial LR proportionally.
+
+2. Though optimized, multi-clip testing may consume a large amount of time. For faster inference, you may change `test_pipeline` to disable multi-clip testing; this may lead to a small drop in recognition performance. Below is the guide (a programmatic alternative is sketched right after these notes):
+
+   ```python
+   test_pipeline = [
+       dict(type='MMUniformSampleFrames', clip_len=dict(RGB=8, Pose=32), num_clips=10, test_mode=True),  # change `num_clips=10` to `num_clips=1`
+       dict(type='MMDecode'),
+       dict(type='MMCompact', hw_ratio=1., allow_imgpad=True),
+       dict(type='Resize', scale=(256, 256), keep_ratio=False),
+       dict(type='GeneratePoseTarget', sigma=0.7, use_score=True, with_kp=True, with_limb=False, scaling=0.25),
+       dict(type='FormatShape', input_format='NCTHW'),
+       dict(type='PackActionInputs', collect_keys=('imgs', 'heatmap_imgs'))
+   ]
+   ```
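If you prefer not to edit the config file by hand, the same change can be made programmatically with `mmengine`. This is only a sketch and assumes you run it from the MMAction2 repository root with this patch applied:

```python
# Switch the RGBPoseConv3D test pipeline to 1-clip testing without editing the
# config file (sketch; the path assumes the MMAction2 repo root).
from mmengine.config import Config

cfg = Config.fromfile('configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py')
for transform in cfg.test_pipeline:
    if transform['type'] == 'MMUniformSampleFrames':
        transform['num_clips'] = 1   # 10-clip -> 1-clip testing
# Keep the test dataloader consistent with the modified pipeline.
cfg.test_dataloader.dataset.pipeline = cfg.test_pipeline
```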
+
+## Results
+
+On action recognition with multiple modalities (RGB & Pose), RGBPoseConv3D can achieve better recognition performance than the late fusion baseline.
+
+| Dataset | Fusion | Config | Checkpoint | RGB Stream Top-1 (1-clip / 10-clip) | Pose Stream Top-1 (1-clip / 10-clip) | 2 Stream Top-1 (1:1) (1-clip / 10-clip) |
+| :-----------: | :-------------------: | :-------------------: | :------------------------: | :------------------------------------: | :-------------------------------------: | :----------------------------------------: |
+| NTURGB+D XSub | Late Fusion | [rgb_config](/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py) [pose_config](/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py) | [rgb_ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230228-576b9f86.pth)
[pose_ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230228-fa40054e.pth) | 94.9 / 95.4 | 93.1 / 93.5 | 96.0 / 96.2 | +| NTURGB+D XSub | Early Fusion + Late Fusion | [config](/configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d_20230301-ac7b0e77.pth) | 96.2 / 96.4 | 96.0 / 96.2 | 96.6 / 96.8 | + +**Notes** + +For both `Late Fusion` and `Early Fusion + Late Fusion`, we combine the action scores based on two modalities with 1:1 ratio to get the final prediction. diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/merge_pretrain.ipynb b/configs/skeleton/posec3d/rgbpose_conv3d/merge_pretrain.ipynb new file mode 100644 index 0000000000..194ca28c31 --- /dev/null +++ b/configs/skeleton/posec3d/rgbpose_conv3d/merge_pretrain.ipynb @@ -0,0 +1,267 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import copy as cp\n", + "from collections import OrderedDict\n", + "\n", + "import torch\n", + "from mmengine.runner.checkpoint import _load_checkpoint\n", + "\n", + "from mmaction.utils import register_all_modules\n", + "from mmaction.registry import MODELS" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [], + "source": [ + "backbone_cfg = dict(\n", + " type='RGBPoseConv3D',\n", + " speed_ratio=4,\n", + " channel_ratio=4,\n", + " rgb_pathway=dict(\n", + " num_stages=4,\n", + " lateral=True,\n", + " lateral_infl=1,\n", + " lateral_activate=[0, 0, 1, 1],\n", + " fusion_kernel=7,\n", + " base_channels=64,\n", + " conv1_kernel=(1, 7, 7),\n", + " inflate=(0, 0, 1, 1),\n", + " with_pool2=False),\n", + " pose_pathway=dict(\n", + " num_stages=3,\n", + " stage_blocks=(4, 6, 3),\n", + " lateral=True,\n", + " lateral_inv=True,\n", + " lateral_infl=16,\n", + " lateral_activate=(0, 1, 1),\n", + " fusion_kernel=7,\n", + " in_channels=17,\n", + " base_channels=32,\n", + " out_indices=(2, ),\n", + " conv1_kernel=(1, 7, 7),\n", + " conv1_stride_s=1,\n", + " conv1_stride_t=1,\n", + " pool1_stride_s=1,\n", + " pool1_stride_t=1,\n", + " inflate=(0, 1, 1),\n", + " spatial_strides=(2, 2, 2),\n", + " temporal_strides=(1, 1, 1),\n", + " dilations=(1, 1, 1),\n", + " with_pool2=False))\n", + "head_cfg = dict(\n", + " type='RGBPoseHead',\n", + " num_classes=60,\n", + " in_channels=[2048, 512],\n", + " average_clips='prob')\n", + "model_cfg = dict(\n", + " type='Recognizer3D',\n", + " backbone=backbone_cfg,\n", + " cls_head=head_cfg)\n", + "\n", + "register_all_modules()\n", + "model = MODELS.build(model_cfg)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [], + "source": [ + "# set your paths of the pretrained weights here\n", + "rgb_filepath = 'https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230228-576b9f86.pth'\n", + "pose_filepath = 'https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230228-fa40054e.pth'" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230226-8bd9d8df.pth\n" + ] 
+ }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230226-8bd9d8df.pth\" to C:\\Users\\wxDai/.cache\\torch\\hub\\checkpoints\\rgb_only_20230226-8bd9d8df.pth\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230226-fa40054e.pth\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230226-fa40054e.pth\" to C:\\Users\\wxDai/.cache\\torch\\hub\\checkpoints\\pose_only_20230226-fa40054e.pth\n" + ] + } + ], + "source": [ + "rgb_ckpt = _load_checkpoint(rgb_filepath, map_location='cpu')['state_dict']\n", + "pose_ckpt = _load_checkpoint(pose_filepath, map_location='cpu')['state_dict']" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [], + "source": [ + "rgb_ckpt = {k.replace('backbone', 'backbone.rgb_path').replace('fc_cls', 'fc_rgb'): v for k, v in rgb_ckpt.items()}\n", + "pose_ckpt = {k.replace('backbone', 'backbone.pose_path').replace('fc_cls', 'fc_pose'): v for k, v in pose_ckpt.items()}" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [], + "source": [ + "old_ckpt = {}\n", + "old_ckpt.update(rgb_ckpt)\n", + "old_ckpt.update(pose_ckpt)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 7, + "outputs": [], + "source": [ + "# The difference is in dim-1\n", + "def padding(weight, new_shape):\n", + " new_weight = weight.new_zeros(new_shape)\n", + " new_weight[:, :weight.shape[1]] = weight\n", + " return new_weight" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 8, + "outputs": [], + "source": [ + "ckpt = cp.deepcopy(old_ckpt)\n", + "name = 'backbone.rgb_path.layer3.0.conv1.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (256, 640, 3, 1, 1))\n", + "name = 'backbone.rgb_path.layer3.0.downsample.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (1024, 640, 1, 1, 1))\n", + "name = 'backbone.rgb_path.layer4.0.conv1.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (512, 1280, 3, 1, 1))\n", + "name = 'backbone.rgb_path.layer4.0.downsample.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (2048, 1280, 1, 1, 1))\n", + "name = 'backbone.pose_path.layer2.0.conv1.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (64, 160, 3, 1, 1))\n", + "name = 'backbone.pose_path.layer2.0.downsample.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (256, 160, 1, 1, 1))\n", + "name = 'backbone.pose_path.layer3.0.conv1.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (128, 320, 3, 1, 1))\n", + "name = 'backbone.pose_path.layer3.0.downsample.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (512, 320, 1, 1, 1))\n", + "ckpt = OrderedDict(ckpt)\n", + "torch.save({'state_dict': ckpt}, 'rgbpose_conv3d_init.pth')" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 9, + "outputs": [ + { + "data": { + "text/plain": 
"_IncompatibleKeys(missing_keys=['backbone.rgb_path.layer2_lateral.conv.weight', 'backbone.rgb_path.layer3_lateral.conv.weight', 'backbone.pose_path.layer1_lateral.conv.weight', 'backbone.pose_path.layer1_lateral.bn.weight', 'backbone.pose_path.layer1_lateral.bn.bias', 'backbone.pose_path.layer1_lateral.bn.running_mean', 'backbone.pose_path.layer1_lateral.bn.running_var', 'backbone.pose_path.layer2_lateral.conv.weight', 'backbone.pose_path.layer2_lateral.bn.weight', 'backbone.pose_path.layer2_lateral.bn.bias', 'backbone.pose_path.layer2_lateral.bn.running_mean', 'backbone.pose_path.layer2_lateral.bn.running_var'], unexpected_keys=[])" + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.load_state_dict(ckpt, strict=False)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py b/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py new file mode 100644 index 0000000000..ad413da6a6 --- /dev/null +++ b/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py @@ -0,0 +1,127 @@ +_base_ = '../../../_base_/default_runtime.py' + +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + in_channels=17, + base_channels=32, + num_stages=3, + out_indices=(2, ), + stage_blocks=(4, 6, 3), + conv1_stride_s=1, + pool1_stride_s=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 1), + dilations=(1, 1, 1)), + cls_head=dict( + type='I3DHead', + in_channels=512, + num_classes=60, + dropout_ratio=0.5, + average_clips='prob')) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +left_kp = [1, 3, 5, 7, 9, 11, 13, 15] +right_kp = [2, 4, 6, 8, 10, 12, 14, 16] +train_pipeline = [ + dict(type='UniformSampleFrames', clip_len=32), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(64, 64), keep_ratio=False), + dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), + dict(type='Resize', scale=(56, 56), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp), + dict(type='GeneratePoseTarget', with_kp=True, with_limb=False), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='UniformSampleFrames', clip_len=32, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(64, 64), keep_ratio=False), + dict(type='GeneratePoseTarget', with_kp=True, with_limb=False), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + type='UniformSampleFrames', clip_len=32, num_clips=10, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(64, 64), keep_ratio=False), + dict( + type='GeneratePoseTarget', + with_kp=True, + with_limb=False, + left_kp=left_kp, + right_kp=right_kp), + dict(type='FormatShape', 
input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_train', + pipeline=train_pipeline))) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_val', + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_val', + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=18, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=18, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py b/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py new file mode 100644 index 0000000000..331badaf8d --- /dev/null +++ b/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py @@ -0,0 +1,126 @@ +_base_ = '../../../_base_/default_runtime.py' + +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + depth=50, + conv1_kernel=(1, 7, 7), + inflate=(0, 0, 1, 1)), + cls_head=dict( + type='I3DHead', + in_channels=2048, + num_classes=60, + dropout_ratio=0.5, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW')) + +dataset_type = 'PoseDataset' +data_root = 'data/nturgbd_videos/' +ann_file = 'data/skeleton/ntu60_2d.pkl' + +train_pipeline = [ + dict(type='MMUniformSampleFrames', clip_len=dict(RGB=8), num_clips=1), + dict(type='MMDecode'), + dict(type='MMCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(256, 256), keep_ratio=False), + dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict( + type='MMUniformSampleFrames', + clip_len=dict(RGB=8), + num_clips=1, + test_mode=True), + dict(type='MMDecode'), + dict(type='MMCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + type='MMUniformSampleFrames', + clip_len=dict(RGB=8), + num_clips=10, + test_mode=True), + dict(type='MMDecode'), + dict(type='MMCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + 
dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=12, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + data_prefix=dict(video=data_root), + split='xsub_train', + pipeline=train_pipeline))) +val_dataloader = dict( + batch_size=12, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + data_prefix=dict(video=data_root), + split='xsub_val', + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + data_prefix=dict(video=data_root), + split='xsub_val', + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=18, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=18, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.15, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (12 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=96) diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py b/configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py new file mode 100644 index 0000000000..d303699f90 --- /dev/null +++ b/configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py @@ -0,0 +1,190 @@ +_base_ = '../../../_base_/default_runtime.py' + +# model_cfg +backbone_cfg = dict( + type='RGBPoseConv3D', + speed_ratio=4, + channel_ratio=4, + rgb_pathway=dict( + num_stages=4, + lateral=True, + lateral_infl=1, + lateral_activate=[0, 0, 1, 1], + fusion_kernel=7, + base_channels=64, + conv1_kernel=(1, 7, 7), + inflate=(0, 0, 1, 1), + with_pool2=False), + pose_pathway=dict( + num_stages=3, + stage_blocks=(4, 6, 3), + lateral=True, + lateral_inv=True, + lateral_infl=16, + lateral_activate=(0, 1, 1), + fusion_kernel=7, + in_channels=17, + base_channels=32, + out_indices=(2, ), + conv1_kernel=(1, 7, 7), + conv1_stride_s=1, + conv1_stride_t=1, + pool1_stride_s=1, + pool1_stride_t=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 1), + dilations=(1, 1, 1), + with_pool2=False)) +head_cfg = dict( + type='RGBPoseHead', + num_classes=60, + in_channels=[2048, 512], + loss_components=['rgb', 'pose'], + loss_weights=[1., 1.], + average_clips='prob') +data_preprocessor = dict( + type='MultiModalDataPreprocessor', + preprocessors=dict( + imgs=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + heatmap_imgs=dict(type='ActionDataPreprocessor'))) +model = dict( + type='MMRecognizer3D', + backbone=backbone_cfg, + cls_head=head_cfg, + data_preprocessor=data_preprocessor) + +dataset_type = 'PoseDataset' +data_root = 'data/nturgbd_videos/' +ann_file = 'data/skeleton/ntu60_2d.pkl' +left_kp = [1, 
3, 5, 7, 9, 11, 13, 15] +right_kp = [2, 4, 6, 8, 10, 12, 14, 16] +train_pipeline = [ + dict( + type='MMUniformSampleFrames', + clip_len=dict(RGB=8, Pose=32), + num_clips=1), + dict(type='MMDecode'), + dict(type='MMCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(256, 256), keep_ratio=False), + dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp), + dict( + type='GeneratePoseTarget', + sigma=0.7, + use_score=True, + with_kp=True, + with_limb=False, + scaling=0.25), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs', collect_keys=('imgs', 'heatmap_imgs')) +] +val_pipeline = [ + dict( + type='MMUniformSampleFrames', + clip_len=dict(RGB=8, Pose=32), + num_clips=1, + test_mode=True), + dict(type='MMDecode'), + dict(type='MMCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(256, 256), keep_ratio=False), + dict( + type='GeneratePoseTarget', + sigma=0.7, + use_score=True, + with_kp=True, + with_limb=False, + scaling=0.25), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs', collect_keys=('imgs', 'heatmap_imgs')) +] +test_pipeline = [ + dict( + type='MMUniformSampleFrames', + clip_len=dict(RGB=8, Pose=32), + num_clips=10, + test_mode=True), + dict(type='MMDecode'), + dict(type='MMCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(256, 256), keep_ratio=False), + dict( + type='GeneratePoseTarget', + sigma=0.7, + use_score=True, + with_kp=True, + with_limb=False, + scaling=0.25), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs', collect_keys=('imgs', 'heatmap_imgs')) +] + +train_dataloader = dict( + batch_size=6, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + data_prefix=dict(video=data_root), + split='xsub_train', + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_val', + data_prefix=dict(video=data_root), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_val', + data_prefix=dict(video=data_root), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.0075, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[12, 16], + gamma=0.1) +] + +load_from = 'https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d_init_20230228-09b7684b.pth' # noqa: E501 + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (6 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=48) diff --git a/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py b/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py index 123db1ee1f..e213e3319c 100644 --- a/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py +++ b/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py @@ -28,7 +28,7 @@ test_cfg=None) dataset_type = 'PoseDataset' -ann_file = 'data/posec3d/hmdb51.pkl' +ann_file = 'data/skeleton/hmdb51_2d.pkl' left_kp = [1, 3, 5, 7, 9, 11, 13, 15] right_kp = [2, 4, 6, 8, 10, 12, 14, 16] train_pipeline = [ @@ -45,7 +45,7 @@ use_score=True, with_kp=True, with_limb=False), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] val_pipeline = [ @@ -60,7 +60,7 @@ use_score=True, with_kp=True, with_limb=False), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] test_pipeline = [ @@ -79,7 +79,7 @@ double=True, left_kp=left_kp, right_kp=right_kp), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] diff --git a/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py b/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py index 547f57c052..c100754fa5 100644 --- a/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py +++ b/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py @@ -28,7 +28,7 @@ test_cfg=None) dataset_type = 'PoseDataset' -ann_file = 'data/posec3d/ucf101.pkl' +ann_file = 'data/skeleton/ucf101_2d.pkl' left_kp = [1, 3, 5, 7, 9, 11, 13, 15] right_kp = [2, 4, 6, 8, 10, 12, 14, 16] train_pipeline = [ @@ -45,7 +45,7 @@ use_score=True, with_kp=True, with_limb=False), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] val_pipeline = [ @@ -60,7 +60,7 @@ use_score=True, with_kp=True, with_limb=False), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] test_pipeline = [ @@ -79,7 +79,7 @@ double=True, left_kp=left_kp, right_kp=right_kp), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] diff --git a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py index c893f69df3..8517870d1c 100644 --- a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py +++ b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py @@ -23,13 +23,10 @@ num_classes=99, spatial_type='avg', dropout_ratio=0.5, - average_clips='prob'), - train_cfg=None, - test_cfg=None) + average_clips='prob')) dataset_type = 'PoseDataset' -ann_file_train = 'data/posec3d/gym_train.pkl' -ann_file_val = 'data/posec3d/gym_val.pkl' +ann_file = 'data/skeleton/gym_2d.pkl' left_kp = [1, 3, 5, 7, 9, 11, 13, 15] right_kp = [2, 4, 6, 8, 10, 12, 14, 16] train_pipeline = [ @@ -46,7 +43,7 
@@ use_score=True, with_kp=True, with_limb=False), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] val_pipeline = [ @@ -61,7 +58,7 @@ use_score=True, with_kp=True, with_limb=False), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] test_pipeline = [ @@ -80,7 +77,7 @@ double=True, left_kp=left_kp, right_kp=right_kp), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] @@ -90,7 +87,13 @@ persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( - type=dataset_type, ann_file=ann_file_train, pipeline=train_pipeline)) + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='train', + pipeline=train_pipeline))) val_dataloader = dict( batch_size=16, num_workers=8, @@ -98,7 +101,8 @@ sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, - ann_file=ann_file_val, + ann_file=ann_file, + split='val', pipeline=val_pipeline, test_mode=True)) test_dataloader = dict( @@ -108,7 +112,8 @@ sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, - ann_file=ann_file_val, + ann_file=ann_file, + split='val', pipeline=test_pipeline, test_mode=True)) @@ -116,7 +121,7 @@ test_evaluator = val_evaluator train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=240, val_begin=1, val_interval=10) + type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1) val_cfg = dict(type='ValLoop') test_cfg = dict(type='TestLoop') @@ -124,7 +129,7 @@ dict( type='CosineAnnealingLR', eta_min=0, - T_max=240, + T_max=24, by_epoch=True, convert_to_iter_based=True) ] @@ -132,5 +137,3 @@ optim_wrapper = dict( optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003), clip_grad=dict(max_norm=40, norm_type=2)) - -default_hooks = dict(checkpoint=dict(interval=10, max_keep_ckpts=3)) diff --git a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py index 34764a726e..0ab9263951 100644 --- a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py +++ b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py @@ -23,18 +23,17 @@ num_classes=99, spatial_type='avg', dropout_ratio=0.5, - average_clips='prob'), - train_cfg=None, - test_cfg=None) + average_clips='prob')) dataset_type = 'PoseDataset' -ann_file_train = 'data/posec3d/gym_train.pkl' -ann_file_val = 'data/posec3d/gym_val.pkl' +ann_file = 'data/skeleton/gym_2d.pkl' left_kp = [1, 3, 5, 7, 9, 11, 13, 15] right_kp = [2, 4, 6, 8, 10, 12, 14, 16] skeletons = [[0, 5], [0, 6], [5, 7], [7, 9], [6, 8], [8, 10], [5, 11], [11, 13], [13, 15], [6, 12], [12, 14], [14, 16], [0, 1], [0, 2], [1, 3], [2, 4], [11, 12]] +left_limb = [0, 2, 3, 6, 7, 8, 12, 14] +right_limb = [1, 4, 5, 9, 10, 11, 13, 15] train_pipeline = [ dict(type='UniformSampleFrames', clip_len=48), dict(type='PoseDecode'), @@ -50,7 +49,7 @@ with_kp=False, with_limb=True, skeletons=skeletons), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] val_pipeline = [ @@ -66,7 +65,7 @@ with_kp=False, with_limb=True, skeletons=skeletons), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), 
dict(type='PackActionInputs') ] test_pipeline = [ @@ -85,8 +84,10 @@ skeletons=skeletons, double=True, left_kp=left_kp, - right_kp=right_kp), - dict(type='FormatShape', input_format='NCTHW'), + right_kp=right_kp, + left_limb=left_limb, + right_limb=right_limb), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] @@ -96,7 +97,13 @@ persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( - type=dataset_type, ann_file=ann_file_train, pipeline=train_pipeline)) + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='train', + pipeline=train_pipeline))), val_dataloader = dict( batch_size=16, num_workers=8, @@ -104,7 +111,8 @@ sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, - ann_file=ann_file_val, + ann_file=ann_file, + split='val', pipeline=val_pipeline, test_mode=True)) test_dataloader = dict( @@ -114,7 +122,8 @@ sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, - ann_file=ann_file_val, + ann_file=ann_file, + split='val', pipeline=test_pipeline, test_mode=True)) @@ -122,7 +131,7 @@ test_evaluator = val_evaluator train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=240, val_begin=1, val_interval=10) + type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1) val_cfg = dict(type='ValLoop') test_cfg = dict(type='TestLoop') @@ -130,7 +139,7 @@ dict( type='CosineAnnealingLR', eta_min=0, - T_max=240, + T_max=24, by_epoch=True, convert_to_iter_based=True) ] @@ -138,5 +147,3 @@ optim_wrapper = dict( optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003), clip_grad=dict(max_norm=40, norm_type=2)) - -default_hooks = dict(checkpoint=dict(interval=10, max_keep_ckpts=3)) diff --git a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py index 2194139f5e..c4915d4d2e 100644 --- a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py +++ b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py @@ -21,15 +21,11 @@ type='I3DHead', in_channels=512, num_classes=60, - spatial_type='avg', dropout_ratio=0.5, - average_clips='prob'), - train_cfg=None, - test_cfg=None) + average_clips='prob')) dataset_type = 'PoseDataset' -ann_file_train = 'data/posec3d/ntu60_xsub_train.pkl' -ann_file_val = 'data/posec3d/ntu60_xsub_val.pkl' +ann_file = 'data/skeleton/ntu60_2d.pkl' left_kp = [1, 3, 5, 7, 9, 11, 13, 15] right_kp = [2, 4, 6, 8, 10, 12, 14, 16] train_pipeline = [ @@ -46,7 +42,7 @@ use_score=True, with_kp=True, with_limb=False), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] val_pipeline = [ @@ -61,7 +57,7 @@ use_score=True, with_kp=True, with_limb=False), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] test_pipeline = [ @@ -80,7 +76,7 @@ double=True, left_kp=left_kp, right_kp=right_kp), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] @@ -90,7 +86,13 @@ persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( - type=dataset_type, ann_file=ann_file_train, pipeline=train_pipeline)) + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + 
ann_file=ann_file, + split='xsub_train', + pipeline=train_pipeline))) val_dataloader = dict( batch_size=16, num_workers=8, @@ -98,7 +100,8 @@ sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, - ann_file=ann_file_val, + ann_file=ann_file, + split='xsub_val', pipeline=val_pipeline, test_mode=True)) test_dataloader = dict( @@ -108,15 +111,16 @@ sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, - ann_file=ann_file_val, + ann_file=ann_file, + split='xsub_val', pipeline=test_pipeline, test_mode=True)) -val_evaluator = dict(type='AccMetric') +val_evaluator = [dict(type='AccMetric')] test_evaluator = val_evaluator train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=240, val_begin=1, val_interval=10) + type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1) val_cfg = dict(type='ValLoop') test_cfg = dict(type='TestLoop') @@ -124,7 +128,7 @@ dict( type='CosineAnnealingLR', eta_min=0, - T_max=240, + T_max=24, by_epoch=True, convert_to_iter_based=True) ] @@ -132,5 +136,3 @@ optim_wrapper = dict( optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003), clip_grad=dict(max_norm=40, norm_type=2)) - -default_hooks = dict(checkpoint=dict(max_keep_ckpts=3)) diff --git a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.py b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.py index 7eca1463ee..0f4f11f3a0 100644 --- a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.py +++ b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.py @@ -21,20 +21,18 @@ type='I3DHead', in_channels=512, num_classes=60, - spatial_type='avg', dropout_ratio=0.5, - average_clips='prob'), - train_cfg=None, - test_cfg=None) + average_clips='prob')) dataset_type = 'PoseDataset' -ann_file_train = 'data/posec3d/ntu60_xsub_train.pkl' -ann_file_val = 'data/posec3d/ntu60_xsub_val.pkl' +ann_file = 'data/skeleton/ntu60_2d.pkl' left_kp = [1, 3, 5, 7, 9, 11, 13, 15] right_kp = [2, 4, 6, 8, 10, 12, 14, 16] skeletons = [[0, 5], [0, 6], [5, 7], [7, 9], [6, 8], [8, 10], [5, 11], [11, 13], [13, 15], [6, 12], [12, 14], [14, 16], [0, 1], [0, 2], [1, 3], [2, 4], [11, 12]] +left_limb = [0, 2, 3, 6, 7, 8, 12, 14] +right_limb = [1, 4, 5, 9, 10, 11, 13, 15] train_pipeline = [ dict(type='UniformSampleFrames', clip_len=48), dict(type='PoseDecode'), @@ -50,7 +48,7 @@ with_kp=False, with_limb=True, skeletons=skeletons), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] val_pipeline = [ @@ -66,7 +64,7 @@ with_kp=False, with_limb=True, skeletons=skeletons), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] test_pipeline = [ @@ -84,9 +82,9 @@ with_limb=True, skeletons=skeletons, double=True, - left_kp=left_kp, - right_kp=right_kp), - dict(type='FormatShape', input_format='NCTHW'), + left_limb=left_limb, + right_limb=right_limb), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] @@ -96,7 +94,13 @@ persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( - type=dataset_type, ann_file=ann_file_train, pipeline=train_pipeline)) + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_train', + pipeline=train_pipeline))) val_dataloader = dict( batch_size=16, num_workers=8, @@ -104,7 +108,8 @@ 
sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, - ann_file=ann_file_val, + ann_file=ann_file, + split='xsub_val', pipeline=val_pipeline, test_mode=True)) test_dataloader = dict( @@ -114,15 +119,16 @@ sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, - ann_file=ann_file_val, + ann_file=ann_file, + split='xsub_val', pipeline=test_pipeline, test_mode=True)) -val_evaluator = dict(type='AccMetric') +val_evaluator = [dict(type='AccMetric')] test_evaluator = val_evaluator train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=240, val_begin=1, val_interval=10) + type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1) val_cfg = dict(type='ValLoop') test_cfg = dict(type='TestLoop') @@ -130,7 +136,7 @@ dict( type='CosineAnnealingLR', eta_min=0, - T_max=240, + T_max=24, by_epoch=True, convert_to_iter_based=True) ] @@ -138,5 +144,3 @@ optim_wrapper = dict( optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003), clip_grad=dict(max_norm=40, norm_type=2)) - -default_hooks = dict(checkpoint=dict(max_keep_ckpts=3)) diff --git a/mmaction/datasets/pose_dataset.py b/mmaction/datasets/pose_dataset.py index 52c2c0b668..a06a7f7c0d 100644 --- a/mmaction/datasets/pose_dataset.py +++ b/mmaction/datasets/pose_dataset.py @@ -1,10 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Callable, List, Optional, Union +import os.path as osp +from typing import Callable, Dict, List, Optional, Union -from mmengine.fileio import exists, load +import mmengine from mmaction.registry import DATASETS -from mmaction.utils import ConfigType from .base import BaseActionDataset @@ -21,38 +21,44 @@ class PoseDataset(BaseActionDataset): Args: ann_file (str): Path to the annotation file. - pipeline (list): A sequence of data transforms. - split (str, optional): The dataset split used. Only applicable to - ``UCF`` or ``HMDB``. Allowed choices are ``train1``, ``test1``, - ``train2``, ``test2``, ``train3``, ``test3``. Defaults to None. - start_index (int): Specify a start index for frames in consideration of - different filename format. Defaults to 0. + pipeline (list[dict | callable]): A sequence of data transforms. + split (str, optional): The dataset split used. For UCF101 and + HMDB51, allowed choices are 'train1', 'test1', 'train2', + 'test2', 'train3', 'test3'. For NTURGB+D, allowed choices + are 'xsub_train', 'xsub_val', 'xview_train', 'xview_val'. + For NTURGB+D 120, allowed choices are 'xsub_train', + 'xsub_val', 'xset_train', 'xset_val'. For FineGYM, + allowed choices are 'train', 'val'. Defaults to None. 
""" def __init__(self, ann_file: str, - pipeline: List[Union[ConfigType, Callable]], + pipeline: List[Union[Dict, Callable]], split: Optional[str] = None, - start_index: int = 0, **kwargs) -> None: - # split, applicable to ``ucf101`` or ``hmdb51`` self.split = split super().__init__( - ann_file, - pipeline=pipeline, - start_index=start_index, - modality='Pose', - **kwargs) + ann_file, pipeline=pipeline, modality='Pose', **kwargs) - def load_data_list(self) -> List[dict]: + def load_data_list(self) -> List[Dict]: """Load annotation file to get skeleton information.""" assert self.ann_file.endswith('.pkl') - exists(self.ann_file) - data_list = load(self.ann_file) + mmengine.exists(self.ann_file) + data_list = mmengine.load(self.ann_file) if self.split is not None: - split, data = data_list['split'], data_list['annotations'] - identifier = 'filename' if 'filename' in data[0] else 'frame_dir' - data_list = [x for x in data if x[identifier] in split[self.split]] + split, annos = data_list['split'], data_list['annotations'] + identifier = 'filename' if 'filename' in annos[0] else 'frame_dir' + split = set(split[self.split]) + data_list = [x for x in annos if x[identifier] in split] + # Sometimes we may need to load video from the file + if 'video' in self.data_prefix: + for item in data_list: + if 'filename' in item: + item['filename'] = osp.join(self.data_prefix['video'], + item['filename']) + if 'frame_dir' in item: + item['frame_dir'] = osp.join(self.data_prefix['video'], + item['frame_dir']) return data_list diff --git a/mmaction/datasets/transforms/__init__.py b/mmaction/datasets/transforms/__init__.py index 198bd8c781..2b83c415f5 100644 --- a/mmaction/datasets/transforms/__init__.py +++ b/mmaction/datasets/transforms/__init__.py @@ -11,7 +11,8 @@ PyAVInit, RawFrameDecode, SampleAVAFrames, SampleFrames, UniformSample, UntrimmedSampleFrames) from .pose_transforms import (GeneratePoseTarget, GenSkeFeat, JointToBone, - LoadKineticsPose, MergeSkeFeat, PadTo, + LoadKineticsPose, MergeSkeFeat, MMCompact, + MMDecode, MMUniformSampleFrames, PadTo, PoseCompact, PoseDecode, PreNormalize2D, PreNormalize3D, ToMotion, UniformSampleFrames) from .processing import (AudioAmplify, CenterCrop, ColorJitter, Flip, Fuse, @@ -36,5 +37,6 @@ 'RandomCrop', 'RandomRescale', 'RandomResizedCrop', 'RawFrameDecode', 'Resize', 'SampleAVAFrames', 'SampleFrames', 'TenCrop', 'ThreeCrop', 'ToMotion', 'TorchVisionWrapper', 'Transpose', 'UniformSample', - 'UniformSampleFrames', 'UntrimmedSampleFrames' + 'UniformSampleFrames', 'UntrimmedSampleFrames', 'MMUniformSampleFrames', + 'MMDecode', 'MMCompact' ] diff --git a/mmaction/datasets/transforms/formatting.py b/mmaction/datasets/transforms/formatting.py index 7477f51080..bdcc75ffb5 100644 --- a/mmaction/datasets/transforms/formatting.py +++ b/mmaction/datasets/transforms/formatting.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, Sequence +from typing import Dict, Optional, Sequence, Tuple import numpy as np import torch @@ -38,9 +38,11 @@ class PackActionInputs(BaseTransform): def __init__( self, + collect_keys: Optional[Tuple[str]] = None, meta_keys: Sequence[str] = ('img_shape', 'img_key', 'video_id', 'timestamp') ) -> None: + self.collect_keys = collect_keys self.meta_keys = meta_keys def transform(self, results: Dict) -> Dict: @@ -53,19 +55,27 @@ def transform(self, results: Dict) -> Dict: dict: The result dict. 
""" packed_results = dict() - if 'imgs' in results: - imgs = results['imgs'] - packed_results['inputs'] = to_tensor(imgs) - elif 'keypoint' in results: - keypoint = results['keypoint'] - packed_results['inputs'] = to_tensor(keypoint) - elif 'audios' in results: - audios = results['audios'] - packed_results['inputs'] = to_tensor(audios) + if self.collect_keys is not None: + packed_results['inputs'] = dict() + for key in self.collect_keys: + packed_results['inputs'][key] = to_tensor(results[key]) else: - raise ValueError( - 'Cannot get `imgs`, `keypoint` or `audios` in the input dict ' - 'of `PackActionInputs`.') + if 'imgs' in results: + imgs = results['imgs'] + packed_results['inputs'] = to_tensor(imgs) + elif 'heatmap_imgs' in results: + heatmap_imgs = results['heatmap_imgs'] + packed_results['inputs'] = to_tensor(heatmap_imgs) + elif 'keypoint' in results: + keypoint = results['keypoint'] + packed_results['inputs'] = to_tensor(keypoint) + elif 'audios' in results: + audios = results['audios'] + packed_results['inputs'] = to_tensor(audios) + else: + raise ValueError( + 'Cannot get `imgs`, `keypoint`, `heatmap_imgs` ' + 'or `audios` in the input dict of `PackActionInputs`.') data_sample = ActionDataSample() @@ -91,7 +101,8 @@ def transform(self, results: Dict) -> Dict: def __repr__(self) -> str: repr_str = self.__class__.__name__ - repr_str += f'(meta_keys={self.meta_keys})' + repr_str += f'(collect_keys={self.collect_keys}, ' + repr_str += f'meta_keys={self.meta_keys})' return repr_str @@ -178,16 +189,20 @@ class FormatShape(BaseTransform): """Format final imgs shape to the given input_format. Required keys: - - imgs + - imgs (optional) + - heatmap_imgs (optional) - num_clips - clip_len Modified Keys: - - img - - input_shape + - imgs (optional) + - input_shape (optional) + + Added Keys: + - heatmap_input_shape (optional) Args: - input_format (str): Define the final imgs format. + input_format (str): Define the final data format. collapse (bool): To collapse input_format N... to ... (NCTHW to CTHW, etc.) if N is 1. Should be set as True when training and testing detectors. Defaults to False. @@ -196,11 +211,13 @@ class FormatShape(BaseTransform): def __init__(self, input_format: str, collapse: bool = False) -> None: self.input_format = input_format self.collapse = collapse - if self.input_format not in ['NCTHW', 'NCHW', 'NCHW_Flow', 'NPTCHW']: + if self.input_format not in [ + 'NCTHW', 'NCHW', 'NCHW_Flow', 'NCTHW_Heatmap', 'NPTCHW' + ]: raise ValueError( f'The input format {self.input_format} is invalid.') - def transform(self, results: dict) -> dict: + def transform(self, results: Dict) -> Dict: """Performs the FormatShape formatting. 
Args: @@ -209,26 +226,69 @@ def transform(self, results: dict) -> dict: """ if not isinstance(results['imgs'], np.ndarray): results['imgs'] = np.array(results['imgs']) - imgs = results['imgs'] + # [M x H x W x C] # M = 1 * N_crops * N_clips * T if self.collapse: assert results['num_clips'] == 1 if self.input_format == 'NCTHW': + if 'imgs' in results: + imgs = results['imgs'] + num_clips = results['num_clips'] + clip_len = results['clip_len'] + if isinstance(clip_len, dict): + clip_len = clip_len['RGB'] + + imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:]) + # N_crops x N_clips x T x H x W x C + imgs = np.transpose(imgs, (0, 1, 5, 2, 3, 4)) + # N_crops x N_clips x C x T x H x W + imgs = imgs.reshape((-1, ) + imgs.shape[2:]) + # M' x C x T x H x W + # M' = N_crops x N_clips + results['imgs'] = imgs + results['input_shape'] = imgs.shape + + if 'heatmap_imgs' in results: + imgs = results['heatmap_imgs'] + num_clips = results['num_clips'] + clip_len = results['clip_len'] + # clip_len must be a dict + clip_len = clip_len['Pose'] + + imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:]) + # N_crops x N_clips x T x C x H x W + imgs = np.transpose(imgs, (0, 1, 3, 2, 4, 5)) + # N_crops x N_clips x C x T x H x W + imgs = imgs.reshape((-1, ) + imgs.shape[2:]) + # M' x C x T x H x W + # M' = N_crops x N_clips + results['heatmap_imgs'] = imgs + results['heatmap_input_shape'] = imgs.shape + + elif self.input_format == 'NCTHW_Heatmap': num_clips = results['num_clips'] clip_len = results['clip_len'] + imgs = results['imgs'] imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:]) - # N_crops x N_clips x T x H x W x C - imgs = np.transpose(imgs, (0, 1, 5, 2, 3, 4)) + # N_crops x N_clips x T x C x H x W + imgs = np.transpose(imgs, (0, 1, 3, 2, 4, 5)) # N_crops x N_clips x C x T x H x W imgs = imgs.reshape((-1, ) + imgs.shape[2:]) # M' x C x T x H x W # M' = N_crops x N_clips + results['imgs'] = imgs + results['input_shape'] = imgs.shape + elif self.input_format == 'NCHW': + imgs = results['imgs'] imgs = np.transpose(imgs, (0, 3, 1, 2)) # M x C x H x W + results['imgs'] = imgs + results['input_shape'] = imgs.shape + elif self.input_format == 'NCHW_Flow': num_imgs = len(results['imgs']) assert num_imgs % 2 == 0 @@ -252,26 +312,31 @@ def transform(self, results: dict) -> dict: # M' x C' x H x W # M' = N_crops x N_clips # C' = T x C + results['imgs'] = imgs + results['input_shape'] = imgs.shape + elif self.input_format == 'NPTCHW': num_proposals = results['num_proposals'] num_clips = results['num_clips'] clip_len = results['clip_len'] + imgs = results['imgs'] imgs = imgs.reshape((num_proposals, num_clips * clip_len) + imgs.shape[1:]) # P x M x H x W x C # M = N_clips x T imgs = np.transpose(imgs, (0, 1, 4, 2, 3)) # P x M x C x H x W + results['imgs'] = imgs + results['input_shape'] = imgs.shape if self.collapse: - assert imgs.shape[0] == 1 - imgs = imgs.squeeze(0) + assert results['imgs'].shape[0] == 1 + results['imgs'] = results['imgs'].squeeze(0) + results['input_shape'] = results['imgs'].shape - results['imgs'] = imgs - results['input_shape'] = imgs.shape return results - def __repr__(self): + def __repr__(self) -> str: repr_str = self.__class__.__name__ repr_str += f"(input_format='{self.input_format}')" return repr_str diff --git a/mmaction/datasets/transforms/loading.py b/mmaction/datasets/transforms/loading.py index 9e66cd7f3f..10309b2516 100644 --- a/mmaction/datasets/transforms/loading.py +++ b/mmaction/datasets/transforms/loading.py @@ -4,7 +4,7 @@ import os import 
os.path as osp import shutil -from typing import Optional, Union +from typing import Dict, List, Optional, Union import mmcv import numpy as np @@ -1077,29 +1077,35 @@ class DecordInit(BaseTransform): Decord: https://github.com/dmlc/decord - Required keys are "filename", - added or modified keys are "video_reader" and "total_frames". + Required Keys: + + - filename + + Added Keys: + + - video_reader + - total_frames + - fps Args: io_backend (str): io backend where frames are store. - Default: 'disk'. - num_threads (int): Number of thread to decode the video. Default: 1. + Defaults to ``'disk'``. + num_threads (int): Number of thread to decode the video. Defaults to 1. kwargs (dict): Args for file client. """ - def __init__(self, io_backend='disk', num_threads=1, **kwargs): + def __init__(self, + io_backend: str = 'disk', + num_threads: int = 1, + **kwargs) -> None: self.io_backend = io_backend self.num_threads = num_threads self.kwargs = kwargs self.file_client = None - def transform(self, results): - """Perform the Decord initialization. - - Args: - results (dict): The resulting dict to be modified and passed - to the next transform in pipeline. - """ + def _get_video_reader(self, filename: str) -> object: + if osp.splitext(filename)[0] == filename: + filename = filename + '.mp4' try: import decord except ImportError: @@ -1108,15 +1114,27 @@ def transform(self, results): if self.file_client is None: self.file_client = FileClient(self.io_backend, **self.kwargs) - - file_obj = io.BytesIO(self.file_client.get(results['filename'])) + file_obj = io.BytesIO(self.file_client.get(filename)) container = decord.VideoReader(file_obj, num_threads=self.num_threads) - results['avg_fps'] = container.get_avg_fps() - results['video_reader'] = container + return container + + def transform(self, results: Dict) -> Dict: + """Perform the Decord initialization. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + container = self._get_video_reader(results['filename']) results['total_frames'] = len(container) + + results['video_reader'] = container + results['avg_fps'] = container.get_avg_fps() return results - def __repr__(self): + def __repr__(self) -> str: repr_str = (f'{self.__class__.__name__}(' f'io_backend={self.io_backend}, ' f'num_threads={self.num_threads})') @@ -1129,35 +1147,32 @@ class DecordDecode(BaseTransform): Decord: https://github.com/dmlc/decord - Required keys are "video_reader", "filename" and "frame_inds", - added or modified keys are "imgs" and "original_shape". + Required Keys: + + - video_reader + - frame_inds + + Added Keys: + + - imgs + - original_shape + - img_shape Args: mode (str): Decoding mode. Options are 'accurate' and 'efficient'. If set to 'accurate', it will decode videos into accurate frames. If set to 'efficient', it will adopt fast seeking but only return key frames, which may be duplicated and inaccurate, and more - suitable for large scene-based video datasets. Default: 'accurate'. + suitable for large scene-based video datasets. + Defaults to ``'accurate'``. """ - def __init__(self, mode='accurate'): + def __init__(self, mode: str = 'accurate') -> None: self.mode = mode assert mode in ['accurate', 'efficient'] - def transform(self, results): - """Perform the Decord decoding. - - Args: - results (dict): The resulting dict to be modified and passed - to the next transform in pipeline. 
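Editor's note: a hedged usage sketch of the reader helper factored out above. It assumes decord is installed and that the placeholder path points at a readable mp4; the bytes-in-memory route mirrors what `DecordInit` does through its file client.

```python
import io

import decord

video_path = 'demo/demo.mp4'  # placeholder path
with open(video_path, 'rb') as f:
    file_obj = io.BytesIO(f.read())

container = decord.VideoReader(file_obj, num_threads=1)
results = dict(
    video_reader=container,
    total_frames=len(container),
    avg_fps=container.get_avg_fps(),
)
print(results['total_frames'], results['avg_fps'])
```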
- """ - container = results['video_reader'] - - if results['frame_inds'].ndim != 1: - results['frame_inds'] = np.squeeze(results['frame_inds']) - - frame_inds = results['frame_inds'] - + def _decord_load_frames(self, container: object, + frame_inds: np.ndarray) -> List[np.ndarray]: if self.mode == 'accurate': imgs = container.get_batch(frame_inds).asnumpy() imgs = list(imgs) @@ -1169,6 +1184,24 @@ def transform(self, results): container.seek(idx) frame = container.next() imgs.append(frame.asnumpy()) + return imgs + + def transform(self, results: Dict) -> Dict: + """Perform the Decord decoding. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + container = results['video_reader'] + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + frame_inds = results['frame_inds'] + imgs = self._decord_load_frames(container, frame_inds) results['video_reader'] = None del container @@ -1179,7 +1212,7 @@ def transform(self, results): return results - def __repr__(self): + def __repr__(self) -> str: repr_str = f'{self.__class__.__name__}(mode={self.mode})' return repr_str diff --git a/mmaction/datasets/transforms/pose_transforms.py b/mmaction/datasets/transforms/pose_transforms.py index cff9f90112..76e09dacd8 100644 --- a/mmaction/datasets/transforms/pose_transforms.py +++ b/mmaction/datasets/transforms/pose_transforms.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy as cp import pickle -from typing import Dict, List, Tuple +from typing import Dict, List, Optional, Tuple, Union import numpy as np from mmcv.transforms import BaseTransform, KeyMapper @@ -11,7 +11,8 @@ from torch.nn.modules.utils import _pair from mmaction.registry import TRANSFORMS -from .processing import Flip, _combine_quadruple +from .loading import DecordDecode, DecordInit +from .processing import _combine_quadruple @TRANSFORMS.register_module() @@ -172,42 +173,65 @@ def __repr__(self): class GeneratePoseTarget(BaseTransform): """Generate pseudo heatmaps based on joint coordinates and confidence. - Required keys are "keypoint", "img_shape", "keypoint_score" (optional), - added or modified keys are "imgs". + Required Keys: + + - keypoint + - keypoint_score (optional) + - img_shape + + Added Keys: + + - imgs (optional) + - heatmap_imgs (optional) Args: - sigma (float): The sigma of the generated gaussian map. Default: 0.6. + sigma (float): The sigma of the generated gaussian map. + Defaults to 0.6. use_score (bool): Use the confidence score of keypoints as the maximum - of the gaussian maps. Default: True. - with_kp (bool): Generate pseudo heatmaps for keypoints. Default: True. + of the gaussian maps. Defaults to True. + with_kp (bool): Generate pseudo heatmaps for keypoints. + Defaults to True. with_limb (bool): Generate pseudo heatmaps for limbs. At least one of - 'with_kp' and 'with_limb' should be True. Default: False. + 'with_kp' and 'with_limb' should be True. Defaults to False. skeletons (tuple[tuple]): The definition of human skeletons. - Default: ((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7), (7, 9), - (0, 6), (6, 8), (8, 10), (5, 11), (11, 13), (13, 15), - (6, 12), (12, 14), (14, 16), (11, 12)), + Defaults to ``((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7), + (7, 9), (0, 6), (6, 8), (8, 10), (5, 11), (11, 13), + (13, 15), (6, 12), (12, 14), (14, 16), (11, 12))``, which is the definition of COCO-17p skeletons. double (bool): Output both original heatmaps and flipped heatmaps. - Default: False. 
+ Defaults to False. left_kp (tuple[int]): Indexes of left keypoints, which is used when - flipping heatmaps. Default: (1, 3, 5, 7, 9, 11, 13, 15), + flipping heatmaps. Defaults to (1, 3, 5, 7, 9, 11, 13, 15), which is left keypoints in COCO-17p. right_kp (tuple[int]): Indexes of right keypoints, which is used when - flipping heatmaps. Default: (2, 4, 6, 8, 10, 12, 14, 16), + flipping heatmaps. Defaults to (2, 4, 6, 8, 10, 12, 14, 16), which is right keypoints in COCO-17p. + left_limb (tuple[int]): Indexes of left limbs, which is used when + flipping heatmaps. Defaults to (0, 2, 4, 5, 6, 10, 11, 12), + which is left limbs of skeletons we defined for COCO-17p. + right_limb (tuple[int]): Indexes of right limbs, which is used when + flipping heatmaps. Defaults to (1, 3, 7, 8, 9, 13, 14, 15), + which is right limbs of skeletons we defined for COCO-17p. + scaling (float): The ratio to scale the heatmaps. Defaults to 1. """ def __init__(self, - sigma=0.6, - use_score=True, - with_kp=True, - with_limb=False, - skeletons=((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7), - (7, 9), (0, 6), (6, 8), (8, 10), (5, 11), (11, 13), - (13, 15), (6, 12), (12, 14), (14, 16), (11, 12)), - double=False, - left_kp=(1, 3, 5, 7, 9, 11, 13, 15), - right_kp=(2, 4, 6, 8, 10, 12, 14, 16)): + sigma: float = 0.6, + use_score: bool = True, + with_kp: bool = True, + with_limb: bool = False, + skeletons: Tuple[Tuple[int]] = ((0, 1), (0, 2), (1, 3), + (2, 4), (0, 5), (5, 7), + (7, 9), (0, 6), (6, 8), + (8, 10), (5, 11), (11, 13), + (13, 15), (6, 12), (12, 14), + (14, 16), (11, 12)), + double: bool = False, + left_kp: Tuple[int] = (1, 3, 5, 7, 9, 11, 13, 15), + right_kp: Tuple[int] = (2, 4, 6, 8, 10, 12, 14, 16), + left_limb: Tuple[int] = (0, 2, 4, 5, 6, 10, 11, 12), + right_limb: Tuple[int] = (1, 3, 7, 8, 9, 13, 14, 15), + scaling: float = 1.) -> None: self.sigma = sigma self.use_score = use_score @@ -224,29 +248,30 @@ def __init__(self, self.left_kp = left_kp self.right_kp = right_kp self.skeletons = skeletons + self.left_limb = left_limb + self.right_limb = right_limb + self.scaling = scaling - def generate_a_heatmap(self, img_h, img_w, centers, sigma, max_values): + def generate_a_heatmap(self, arr: np.ndarray, centers: np.ndarray, + max_values: np.ndarray) -> None: """Generate pseudo heatmap for one keypoint in one frame. Args: - img_h (int): The height of the heatmap. - img_w (int): The width of the heatmap. + arr (np.ndarray): The array to store the generated heatmaps. + Shape: img_h * img_w. centers (np.ndarray): The coordinates of corresponding keypoints - (of multiple persons). - sigma (float): The sigma of generated gaussian. - max_values (np.ndarray): The max values of each keypoint. - - Returns: - np.ndarray: The generated pseudo heatmap. + (of multiple persons). Shape: M * 2. + max_values (np.ndarray): The max values of each keypoint. Shape: M. 
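Editor's note: for readers wiring this into a pipeline, a hedged config sketch showing where the newly documented arguments (`left_limb`, `right_limb`, `scaling`) sit. The values simply echo the documented defaults and are not a tuned recipe.

```python
gen_pose_target = dict(
    type='GeneratePoseTarget',
    sigma=0.6,
    use_score=True,
    with_kp=True,
    with_limb=False,
    double=False,
    left_kp=(1, 3, 5, 7, 9, 11, 13, 15),
    right_kp=(2, 4, 6, 8, 10, 12, 14, 16),
    left_limb=(0, 2, 4, 5, 6, 10, 11, 12),
    right_limb=(1, 3, 7, 8, 9, 13, 14, 15),
    scaling=1.0)
```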
""" - heatmap = np.zeros([img_h, img_w], dtype=np.float32) + sigma = self.sigma + img_h, img_w = arr.shape for center, max_value in zip(centers, max_values): - mu_x, mu_y = center[0], center[1] if max_value < self.eps: continue + mu_x, mu_y = center[0], center[1] st_x = max(int(mu_x - 3 * sigma), 0) ed_x = min(int(mu_x + 3 * sigma) + 1, img_w) st_y = max(int(mu_y - 3 * sigma), 0) @@ -261,34 +286,29 @@ def generate_a_heatmap(self, img_h, img_w, centers, sigma, max_values): patch = np.exp(-((x - mu_x)**2 + (y - mu_y)**2) / 2 / sigma**2) patch = patch * max_value - heatmap[st_y:ed_y, - st_x:ed_x] = np.maximum(heatmap[st_y:ed_y, st_x:ed_x], - patch) - - return heatmap + arr[st_y:ed_y, st_x:ed_x] = \ + np.maximum(arr[st_y:ed_y, st_x:ed_x], patch) - def generate_a_limb_heatmap(self, img_h, img_w, starts, ends, sigma, - start_values, end_values): + def generate_a_limb_heatmap(self, arr: np.ndarray, starts: np.ndarray, + ends: np.ndarray, start_values: np.ndarray, + end_values: np.ndarray) -> None: """Generate pseudo heatmap for one limb in one frame. Args: - img_h (int): The height of the heatmap. - img_w (int): The width of the heatmap. + arr (np.ndarray): The array to store the generated heatmaps. + Shape: img_h * img_w. starts (np.ndarray): The coordinates of one keypoint in the - corresponding limbs (of multiple persons). + corresponding limbs. Shape: M * 2. ends (np.ndarray): The coordinates of the other keypoint in the - corresponding limbs (of multiple persons). - sigma (float): The sigma of generated gaussian. + corresponding limbs. Shape: M * 2. start_values (np.ndarray): The max values of one keypoint in the - corresponding limbs. - end_values (np.ndarray): The max values of the other keypoint in - the corresponding limbs. - - Returns: - np.ndarray: The generated pseudo heatmap. + corresponding limbs. Shape: M. + end_values (np.ndarray): The max values of the other keypoint + in the corresponding limbs. Shape: M. """ - heatmap = np.zeros([img_h, img_w], dtype=np.float32) + sigma = self.sigma + img_h, img_w = arr.shape for start, end, start_value, end_value in zip(starts, ends, start_values, @@ -325,9 +345,7 @@ def generate_a_limb_heatmap(self, img_h, img_w, starts, ends, sigma, d2_ab = ((start[0] - end[0])**2 + (start[1] - end[1])**2) if d2_ab < 1: - full_map = self.generate_a_heatmap(img_h, img_w, [start], - sigma, [start_value]) - heatmap = np.maximum(heatmap, full_map) + self.generate_a_heatmap(arr, start[None], start_value[None]) continue coeff = (d2_start - d2_end + d2_ab) / 2. / d2_ab @@ -348,61 +366,50 @@ def generate_a_limb_heatmap(self, img_h, img_w, starts, ends, sigma, patch = np.exp(-d2_seg / 2. / sigma**2) patch = patch * value_coeff - heatmap[min_y:max_y, min_x:max_x] = np.maximum( - heatmap[min_y:max_y, min_x:max_x], patch) - - return heatmap + arr[min_y:max_y, min_x:max_x] = \ + np.maximum(arr[min_y:max_y, min_x:max_x], patch) - def generate_heatmap(self, img_h, img_w, kps, sigma, max_values): + def generate_heatmap(self, arr: np.ndarray, kps: np.ndarray, + max_values: np.ndarray) -> None: """Generate pseudo heatmap for all keypoints and limbs in one frame (if needed). Args: - img_h (int): The height of the heatmap. - img_w (int): The width of the heatmap. + arr (np.ndarray): The array to store the generated heatmaps. + Shape: V * img_h * img_w. kps (np.ndarray): The coordinates of keypoints in this frame. - sigma (float): The sigma of generated gaussian. + Shape: M * V * 2. max_values (np.ndarray): The confidence score of each keypoint. 
- - Returns: - np.ndarray: The generated pseudo heatmap. + Shape: M * V. """ - heatmaps = [] if self.with_kp: num_kp = kps.shape[1] for i in range(num_kp): - heatmap = self.generate_a_heatmap(img_h, img_w, kps[:, i], - sigma, max_values[:, i]) - heatmaps.append(heatmap) + self.generate_a_heatmap(arr[i], kps[:, i], max_values[:, i]) if self.with_limb: - for limb in self.skeletons: + for i, limb in enumerate(self.skeletons): start_idx, end_idx = limb starts = kps[:, start_idx] ends = kps[:, end_idx] start_values = max_values[:, start_idx] end_values = max_values[:, end_idx] - heatmap = self.generate_a_limb_heatmap(img_h, img_w, starts, - ends, sigma, - start_values, - end_values) - heatmaps.append(heatmap) - - return np.stack(heatmaps, axis=-1) + self.generate_a_limb_heatmap(arr[i], starts, ends, + start_values, end_values) - def gen_an_aug(self, results): + def gen_an_aug(self, results: Dict) -> np.ndarray: """Generate pseudo heatmaps for all frames. Args: results (dict): The dictionary that contains all info of a sample. Returns: - list[np.ndarray]: The generated pseudo heatmaps. + np.ndarray: The generated pseudo heatmaps. """ - all_kps = results['keypoint'] + all_kps = results['keypoint'].astype(np.float32) kp_shape = all_kps.shape if 'keypoint_score' in results: @@ -411,43 +418,54 @@ def gen_an_aug(self, results): all_kpscores = np.ones(kp_shape[:-1], dtype=np.float32) img_h, img_w = results['img_shape'] + + # scale img_h, img_w and kps + img_h = int(img_h * self.scaling + 0.5) + img_w = int(img_w * self.scaling + 0.5) + all_kps[..., :2] *= self.scaling + num_frame = kp_shape[1] + num_c = 0 + if self.with_kp: + num_c += all_kps.shape[2] + if self.with_limb: + num_c += len(self.skeletons) + + ret = np.zeros([num_frame, num_c, img_h, img_w], dtype=np.float32) - imgs = [] for i in range(num_frame): - sigma = self.sigma + # M, V, C kps = all_kps[:, i] - kpscores = all_kpscores[:, i] - - max_values = np.ones(kpscores.shape, dtype=np.float32) - if self.use_score: - max_values = kpscores - - hmap = self.generate_heatmap(img_h, img_w, kps, sigma, max_values) - imgs.append(hmap) + # M, C + kpscores = all_kpscores[:, i] if self.use_score else \ + np.ones_like(all_kpscores[:, i]) - return imgs + self.generate_heatmap(ret[i], kps, kpscores) + return ret - def transform(self, results): + def transform(self, results: Dict) -> Dict: """Generate pseudo heatmaps based on joint coordinates and confidence. Args: results (dict): The resulting dict to be modified and passed to the next transform in pipeline. 
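Editor's note: `gen_an_aug` now pre-allocates a single `(T, C, H, W)` volume and lets the per-frame generators write into it, after scaling both the canvas and the keypoint coordinates. The sketch below only reproduces that bookkeeping; all sizes and the tiny skeleton subset are illustrative.

```python
import numpy as np

with_kp, with_limb = True, False
scaling = 0.5
skeletons = ((0, 1), (0, 2))          # illustrative subset of COCO-17p limbs
num_person, num_frame, num_kp = 2, 8, 17

all_kps = np.random.rand(num_person, num_frame, num_kp, 2).astype(np.float32) * 64
all_kpscores = np.ones((num_person, num_frame, num_kp), dtype=np.float32)
img_h, img_w = 64, 64

# Scale the canvas and the coordinates together so the Gaussians stay aligned.
img_h, img_w = int(img_h * scaling + 0.5), int(img_w * scaling + 0.5)
all_kps[..., :2] *= scaling

num_c = (num_kp if with_kp else 0) + (len(skeletons) if with_limb else 0)
ret = np.zeros((num_frame, num_c, img_h, img_w), dtype=np.float32)
print(ret.shape)  # (8, 17, 32, 32)
```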
""" - if not self.double: - results['imgs'] = np.stack(self.gen_an_aug(results)) - else: - results_ = cp.deepcopy(results) - flip = Flip( - flip_ratio=1, left_kp=self.left_kp, right_kp=self.right_kp) - results_ = flip(results_) - results['imgs'] = np.concatenate( - [self.gen_an_aug(results), - self.gen_an_aug(results_)]) + heatmap = self.gen_an_aug(results) + key = 'heatmap_imgs' if 'imgs' in results else 'imgs' + + if self.double: + indices = np.arange(heatmap.shape[1], dtype=np.int64) + left, right = (self.left_kp, self.right_kp) if self.with_kp else ( + self.left_limb, self.right_limb) + for l, r in zip(left, right): # noqa: E741 + indices[l] = r + indices[r] = l + heatmap_flip = heatmap[..., ::-1][:, indices] + heatmap = np.concatenate([heatmap, heatmap_flip]) + results[key] = heatmap return results - def __repr__(self): + def __repr__(self) -> str: repr_str = (f'{self.__class__.__name__}(' f'sigma={self.sigma}, ' f'use_score={self.use_score}, ' @@ -456,7 +474,10 @@ def __repr__(self): f'skeletons={self.skeletons}, ' f'double={self.double}, ' f'left_kp={self.left_kp}, ' - f'right_kp={self.right_kp})') + f'right_kp={self.right_kp}, ' + f'left_limb={self.left_limb}, ' + f'right_limb={self.right_limb}, ' + f'scaling={self.scaling})') return repr_str @@ -468,30 +489,38 @@ class PoseCompact(BaseTransform): example, if 'padding == 0.25', then the expanded box has unchanged center, and 1.25x width and height. - Required keys in results are "img_shape", "keypoint", add or modified keys - are "img_shape", "keypoint", "crop_quadruple". + Required Keys: + + - keypoint + - img_shape + + Modified Keys: + + - img_shape + - keypoint + + Added Keys: + + - crop_quadruple Args: - padding (float): The padding size. Default: 0.25. + padding (float): The padding size. Defaults to 0.25. threshold (int): The threshold for the tight bounding box. If the width or height of the tight bounding box is smaller than the threshold, - we do not perform the compact operation. Default: 10. + we do not perform the compact operation. Defaults to 10. hw_ratio (float | tuple[float] | None): The hw_ratio of the expanded box. Float indicates the specific ratio and tuple indicates a ratio range. If set as None, it means there is no requirement on - hw_ratio. Default: None. + hw_ratio. Defaults to None. allow_imgpad (bool): Whether to allow expanding the box outside the - image to meet the hw_ratio requirement. Default: True. - - Returns: - type: Description of returned object. + image to meet the hw_ratio requirement. Defaults to True. """ def __init__(self, - padding=0.25, - threshold=10, - hw_ratio=None, - allow_imgpad=True): + padding: float = 0.25, + threshold: int = 10, + hw_ratio: Optional[Union[float, Tuple[float]]] = None, + allow_imgpad: bool = True) -> None: self.padding = padding self.threshold = threshold @@ -503,7 +532,7 @@ def __init__(self, self.allow_imgpad = allow_imgpad assert self.padding >= 0 - def transform(self, results): + def transform(self, results: Dict) -> Dict: """Convert the coordinates of keypoints to make it more compact. 
Args: @@ -561,7 +590,7 @@ def transform(self, results): results['crop_quadruple'] = crop_quadruple return results - def __repr__(self): + def __repr__(self) -> str: repr_str = (f'{self.__class__.__name__}(padding={self.padding}, ' f'threshold={self.threshold}, ' f'hw_ratio={self.hw_ratio}, ' @@ -1167,7 +1196,7 @@ def transform(self, results: Dict) -> Dict: results['num_clips'] = self.num_clips return results - def __repr__(self): + def __repr__(self) -> str: repr_str = (f'{self.__class__.__name__}(' f'clip_len={self.clip_len}, ' f'num_clips={self.num_clips}, ' @@ -1253,6 +1282,17 @@ class PoseDecode(BaseTransform): - keypoint_score (optional) """ + @staticmethod + def _load_kp(kp: np.ndarray, frame_inds: np.ndarray) -> np.ndarray: + """Load keypoints according to sampled indexes.""" + return kp[:, frame_inds].astype(np.float32) + + @staticmethod + def _load_kpscore(kpscore: np.ndarray, + frame_inds: np.ndarray) -> np.ndarray: + """Load keypoint scores according to sampled indexes.""" + return kpscore[:, frame_inds].astype(np.float32) + def transform(self, results: Dict) -> Dict: """The transform function of :class:`PoseDecode`. @@ -1274,16 +1314,256 @@ def transform(self, results: Dict) -> Dict: offset = results.get('offset', 0) frame_inds = results['frame_inds'] + offset - results['keypoint'] = results['keypoint'][:, frame_inds].astype( - np.float32) - if 'keypoint_score' in results: - kpscore = results['keypoint_score'] - results['keypoint_score'] = kpscore[:, - frame_inds].astype(np.float32) + results['keypoint_score'] = self._load_kpscore( + results['keypoint_score'], frame_inds) + + results['keypoint'] = self._load_kp(results['keypoint'], frame_inds) return results def __repr__(self) -> str: repr_str = f'{self.__class__.__name__}()' return repr_str + + +@TRANSFORMS.register_module() +class MMUniformSampleFrames(UniformSampleFrames): + """Uniformly sample frames from the multi-modal data.""" + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`MMUniformSampleFrames`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + num_frames = results['total_frames'] + modalities = [] + for modality, clip_len in self.clip_len.items(): + if self.test_mode: + inds = self._get_test_clips(num_frames, clip_len) + else: + inds = self._get_train_clips(num_frames, clip_len) + inds = np.mod(inds, num_frames) + results[f'{modality}_inds'] = inds.astype(np.int) + modalities.append(modality) + results['clip_len'] = self.clip_len + results['frame_interval'] = None + results['num_clips'] = self.num_clips + if not isinstance(results['modality'], list): + # should override + results['modality'] = modalities + return results + + +@TRANSFORMS.register_module() +class MMDecode(DecordInit, DecordDecode, PoseDecode): + """Decode RGB videos and skeletons.""" + + def __init__(self, io_backend: str = 'disk', **kwargs) -> None: + DecordInit.__init__(self, io_backend=io_backend, **kwargs) + DecordDecode.__init__(self) + self.io_backend = io_backend + self.kwargs = kwargs + self.file_client = None + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`MMDecode`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. 
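Editor's note: `MMUniformSampleFrames` gives each modality its own clip length and its own `<modality>_inds`. The sketch below is a hedged stand-in; the even-spacing sampler is a simplification of `UniformSampleFrames._get_test_clips`, which is not shown in this hunk.

```python
import numpy as np


def uniform_sample(total_frames: int, clip_len: int) -> np.ndarray:
    """Pick `clip_len` roughly evenly spaced frame indices (simplified)."""
    inds = np.linspace(0, total_frames, clip_len, endpoint=False)
    return np.mod(inds.astype(np.int64), total_frames)


results = dict(total_frames=120)
clip_len = dict(RGB=8, Pose=32)

for modality, length in clip_len.items():
    results[f'{modality}_inds'] = uniform_sample(results['total_frames'], length)

results.update(clip_len=clip_len, num_clips=1, modality=list(clip_len))
print(results['RGB_inds'], results['Pose_inds'][:8])
```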
+ """ + for mod in results['modality']: + if results[f'{mod}_inds'].ndim != 1: + results[f'{mod}_inds'] = np.squeeze(results[f'{mod}_inds']) + frame_inds = results[f'{mod}_inds'] + if mod == 'RGB': + if 'filename' not in results: + results['filename'] = results['frame_dir'] + '.mp4' + video_reader = self._get_video_reader(results['filename']) + imgs = self._decord_load_frames(video_reader, frame_inds) + del video_reader + results['imgs'] = imgs + elif mod == 'Pose': + assert 'keypoint' in results + if 'keypoint_score' not in results: + keypoint_score = [ + np.ones(keypoint.shape[:-1], dtype=np.float32) + for keypoint in results['keypoint'] + ] + results['keypoint_score'] = np.stack(keypoint_score) + results['keypoint'] = self._load_kp(results['keypoint'], + frame_inds) + results['keypoint_score'] = self._load_kpscore( + results['keypoint_score'], frame_inds) + else: + raise NotImplementedError( + f'MMDecode: Modality {mod} not supported') + + # We need to scale human keypoints to the new image size + if 'imgs' in results and 'keypoint' in results: + real_img_shape = results['imgs'][0].shape[:2] + if real_img_shape != results['img_shape']: + oh, ow = results['img_shape'] + nh, nw = real_img_shape + + assert results['keypoint'].shape[-1] in [2, 3] + results['keypoint'][..., 0] *= (nw / ow) + results['keypoint'][..., 1] *= (nh / oh) + results['img_shape'] = real_img_shape + results['original_shape'] = real_img_shape + + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'io_backend={self.io_backend})') + return repr_str + + +@TRANSFORMS.register_module() +class MMCompact(BaseTransform): + """Convert the coordinates of keypoints and crop the images to make them + more compact. + + Required Keys: + + - imgs + - keypoint + - img_shape + + Modified Keys: + + - imgs + - keypoint + - img_shape + + Args: + padding (float): The padding size. Defaults to 0.25. + threshold (int): The threshold for the tight bounding box. If the width + or height of the tight bounding box is smaller than the threshold, + we do not perform the compact operation. Defaults to 10. + hw_ratio (float | tuple[float]): The hw_ratio of the expanded + box. Float indicates the specific ratio and tuple indicates a + ratio range. If set as None, it means there is no requirement on + hw_ratio. Defaults to 1. + allow_imgpad (bool): Whether to allow expanding the box outside the + image to meet the hw_ratio requirement. Defaults to True. 
+ """ + + def __init__(self, + padding: float = 0.25, + threshold: int = 10, + hw_ratio: Union[float, Tuple[float]] = 1, + allow_imgpad: bool = True) -> None: + + self.padding = padding + self.threshold = threshold + if hw_ratio is not None: + hw_ratio = _pair(hw_ratio) + self.hw_ratio = hw_ratio + self.allow_imgpad = allow_imgpad + assert self.padding >= 0 + + def _get_box(self, keypoint: np.ndarray, img_shape: Tuple[int]) -> Tuple: + """Calculate the bounding box surrounding all joints in the frames.""" + h, w = img_shape + + kp_x = keypoint[..., 0] + kp_y = keypoint[..., 1] + + min_x = np.min(kp_x[kp_x != 0], initial=np.Inf) + min_y = np.min(kp_y[kp_y != 0], initial=np.Inf) + max_x = np.max(kp_x[kp_x != 0], initial=-np.Inf) + max_y = np.max(kp_y[kp_y != 0], initial=-np.Inf) + + # The compact area is too small + if max_x - min_x < self.threshold or max_y - min_y < self.threshold: + return 0, 0, w, h + + center = ((max_x + min_x) / 2, (max_y + min_y) / 2) + half_width = (max_x - min_x) / 2 * (1 + self.padding) + half_height = (max_y - min_y) / 2 * (1 + self.padding) + + if self.hw_ratio is not None: + half_height = max(self.hw_ratio[0] * half_width, half_height) + half_width = max(1 / self.hw_ratio[1] * half_height, half_width) + + min_x, max_x = center[0] - half_width, center[0] + half_width + min_y, max_y = center[1] - half_height, center[1] + half_height + + # hot update + if not self.allow_imgpad: + min_x, min_y = int(max(0, min_x)), int(max(0, min_y)) + max_x, max_y = int(min(w, max_x)), int(min(h, max_y)) + else: + min_x, min_y = int(min_x), int(min_y) + max_x, max_y = int(max_x), int(max_y) + return min_x, min_y, max_x, max_y + + def _compact_images(self, imgs: List[np.ndarray], img_shape: Tuple[int], + box: Tuple[int]) -> List: + """Crop the images acoordding the bounding box.""" + h, w = img_shape + min_x, min_y, max_x, max_y = box + pad_l, pad_u, pad_r, pad_d = 0, 0, 0, 0 + if min_x < 0: + pad_l = -min_x + min_x, max_x = 0, max_x + pad_l + w += pad_l + if min_y < 0: + pad_u = -min_y + min_y, max_y = 0, max_y + pad_u + h += pad_u + if max_x > w: + pad_r = max_x - w + w = max_x + if max_y > h: + pad_d = max_y - h + h = max_y + + if pad_l > 0 or pad_r > 0 or pad_u > 0 or pad_d > 0: + imgs = [ + np.pad(img, ((pad_u, pad_d), (pad_l, pad_r), (0, 0))) + for img in imgs + ] + imgs = [img[min_y:max_y, min_x:max_x] for img in imgs] + return imgs + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`MMCompact`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + img_shape = results['img_shape'] + kp = results['keypoint'] + # Make NaN zero + kp[np.isnan(kp)] = 0. + min_x, min_y, max_x, max_y = self._get_box(kp, img_shape) + + kp_x, kp_y = kp[..., 0], kp[..., 1] + kp_x[kp_x != 0] -= min_x + kp_y[kp_y != 0] -= min_y + + new_shape = (max_y - min_y, max_x - min_x) + results['img_shape'] = new_shape + results['imgs'] = self._compact_images(results['imgs'], img_shape, + (min_x, min_y, max_x, max_y)) + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(padding={self.padding}, ' + f'threshold={self.threshold}, ' + f'hw_ratio={self.hw_ratio}, ' + f'allow_imgpad={self.allow_imgpad})') + return repr_str diff --git a/mmaction/evaluation/metrics/acc_metric.py b/mmaction/evaluation/metrics/acc_metric.py index 488e28aa14..ca6b4623f8 100644 --- a/mmaction/evaluation/metrics/acc_metric.py +++ b/mmaction/evaluation/metrics/acc_metric.py @@ -1,12 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import copy from collections import OrderedDict -from typing import Any, Optional, Sequence, Tuple, Union +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union import numpy as np from mmengine.evaluator import BaseMetric -from mmaction.evaluation import (mean_average_precision, mean_class_accuracy, +from mmaction.evaluation import (get_weighted_score, mean_average_precision, + mean_class_accuracy, mmit_mean_average_precision, top_k_accuracy) from mmaction.registry import METRICS @@ -22,7 +23,7 @@ def __init__( Tuple[str]]] = ('top_k_accuracy', 'mean_class_accuracy'), collect_device: str = 'cpu', - metric_options: Optional[dict] = dict( + metric_options: Optional[Dict] = dict( top_k_accuracy=dict(topk=(1, 5))), prefix: Optional[str] = None, num_classes: Optional[int] = None): @@ -56,38 +57,84 @@ def __init__( self.metric_options = metric_options self.num_classes = num_classes - def process(self, data_batch: Sequence[Tuple[Any, dict]], - data_samples: Sequence[dict]) -> None: + def process(self, data_batch: Sequence[Tuple[Any, Dict]], + data_samples: Sequence[Dict]) -> None: """Process one batch of data samples and data_samples. The processed results should be stored in ``self.results``, which will be used to compute the metrics when all batches have been processed. Args: - data_batch (Sequence[Tuple[Any, dict]]): A batch of data - from the dataloader. - data_samples (Sequence[dict]): A batch of outputs from - the model. + data_batch (Sequence[dict]): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. """ + data_samples = copy.deepcopy(data_samples) for data_sample in data_samples: result = dict() pred = data_sample['pred_scores'] label = data_sample['gt_labels'] - result['pred'] = pred['item'].cpu().numpy() + for item_name, score in pred.items(): + pred[item_name] = score.cpu().numpy() + result['pred'] = pred result['label'] = label['item'].item() self.results.append(result) - def compute_metrics(self, results: list) -> dict: + def compute_metrics(self, results: List) -> Dict: """Compute the metrics from processed results. Args: results (list): The processed results of each batch. + Returns: dict: The computed metrics. The keys are the names of the metrics, and the values are corresponding results. """ - preds = [x['pred'] for x in results] labels = [x['label'] for x in results] + if len(results[0]['pred']) == 1: + preds = [x['pred']['item'] for x in results] + return self.calculate(preds, labels) + + eval_results = dict() + for item_name in results[0]['pred'].keys(): + preds = [x['pred'][item_name] for x in results] + eval_result = self.calculate(preds, labels) + eval_results.update( + {f'{item_name}_{k}': v + for k, v in eval_result.items()}) + + # Ad-hoc for RGBPoseConv3D + if len(results[0]['pred']) == 2 and \ + 'rgb' in results[0]['pred'] and \ + 'pose' in results[0]['pred']: + + rgb = [x['pred']['rgb'] for x in results] + pose = [x['pred']['pose'] for x in results] + + preds = { + '1:1': get_weighted_score([rgb, pose], [1, 1]), + '2:1': get_weighted_score([rgb, pose], [2, 1]), + '1:2': get_weighted_score([rgb, pose], [1, 2]) + } + for k in preds: + eval_result = self.calculate(preds[k], labels) + eval_results.update({ + f'RGBPose_{k}_{key}': v + for key, v in eval_result.items() + }) + + return eval_results + + def calculate(self, preds: List[np.ndarray], labels: List[int]) -> Dict: + """Compute the metrics from processed results. + + Args: + preds (list[np.ndarray]): List of the prediction scores. 
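Editor's note: the RGB/Pose branch of `compute_metrics` fuses the two heads with `get_weighted_score` before accuracy is computed. The stand-in below reproduces that weighted late fusion so the 1:1 / 2:1 / 1:2 entries are easy to interpret; it is a sketch, not the mmaction implementation.

```python
from typing import List

import numpy as np


def weighted_score(score_lists: List[List[np.ndarray]],
                   coefficients: List[float]) -> List[np.ndarray]:
    """Fuse several lists of per-sample class scores with scalar weights."""
    assert len(score_lists) == len(coefficients)
    num_samples = len(score_lists[0])
    return [
        sum(c * scores[i] for scores, c in zip(score_lists, coefficients))
        for i in range(num_samples)
    ]


rgb = [np.array([0.7, 0.2, 0.1]), np.array([0.1, 0.8, 0.1])]
pose = [np.array([0.3, 0.4, 0.3]), np.array([0.2, 0.6, 0.2])]
for name, w in {'1:1': (1, 1), '2:1': (2, 1), '1:2': (1, 2)}.items():
    print(name, [int(s.argmax()) for s in weighted_score([rgb, pose], list(w))])
```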
+ labels (list[int]): List of the labels. + + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ eval_results = OrderedDict() metric_options = copy.deepcopy(self.metric_options) for metric in self.metrics: diff --git a/mmaction/models/backbones/__init__.py b/mmaction/models/backbones/__init__.py index 066ba18535..2f4eb4a7e3 100644 --- a/mmaction/models/backbones/__init__.py +++ b/mmaction/models/backbones/__init__.py @@ -15,6 +15,7 @@ from .resnet_omni import OmniResNet from .resnet_tin import ResNetTIN from .resnet_tsm import ResNetTSM +from .rgbposeconv3d import RGBPoseConv3D from .stgcn import STGCN from .swin import SwinTransformer3D from .tanet import TANet @@ -29,5 +30,6 @@ 'OmniResNet', 'ResNet', 'ResNet2Plus1d', 'ResNet3d', 'ResNet3dCSN', 'ResNet3dLayer', 'ResNet3dSlowFast', 'ResNet3dSlowOnly', 'ResNetAudio', 'ResNetTIN', 'ResNetTSM', 'STGCN', 'SwinTransformer3D', 'TANet', - 'TimeSformer', 'UniFormer', 'UniFormerV2', 'VisionTransformer', 'X3D' + 'TimeSformer', 'UniFormer', 'UniFormerV2', 'VisionTransformer', 'X3D', + 'RGBPoseConv3D' ] diff --git a/mmaction/models/backbones/resnet3d.py b/mmaction/models/backbones/resnet3d.py index 50435c3064..cbaa4e18ca 100644 --- a/mmaction/models/backbones/resnet3d.py +++ b/mmaction/models/backbones/resnet3d.py @@ -1,22 +1,23 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings from collections import OrderedDict -from typing import List, Optional, Sequence, Tuple, Union +from typing import Dict, List, Optional, Sequence, Tuple, Union +import torch +import torch.nn as nn import torch.utils.checkpoint as cp from mmcv.cnn import ConvModule, NonLocal3d, build_activation_layer from mmengine.logging import MMLogger +from mmengine.model import BaseModule, Sequential from mmengine.model.weight_init import constant_init, kaiming_init from mmengine.runner.checkpoint import _load_checkpoint, load_checkpoint from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm -from torch import Tensor, nn from torch.nn.modules.utils import _ntuple, _triple from mmaction.registry import MODELS -from mmaction.utils import ConfigType, OptConfigType -class BasicBlock3d(nn.Module): +class BasicBlock3d(BaseModule): """BasicBlock 3d block for ResNet3D. Args: @@ -28,22 +29,24 @@ class BasicBlock3d(nn.Module): Defaults to 1. dilation (int): Spacing between kernel elements. Defaults to 1. downsample (nn.Module or None): Downsample layer. Defaults to None. - style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, the + style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the stride-two layer is the 3x3 conv layer, otherwise the stride-two - layer is the first 1x1 conv layer. Defaults to ``pytorch``. + layer is the first 1x1 conv layer. Defaults to ``'pytorch'``. inflate (bool): Whether to inflate kernel. Defaults to True. non_local (bool): Determine whether to apply non-local module in this block. Defaults to False. - non_local_cfg (dict or ConfigDict): Config for non-local module. + non_local_cfg (dict): Config for non-local module. Defaults to ``dict()``. - conv_cfg (dict or ConfigDict): Config dict for convolution layer. + conv_cfg (dict): Config dict for convolution layer. Defaults to ``dict(type='Conv3d')``. - norm_cfg (dict or ConfigDict): Config for norm layers. + norm_cfg (dict): Config for norm layers. Required keys are ``type``. Defaults to ``dict(type='BN3d')``. - act_cfg (dict or ConfigDict): Config dict for activation layer. 
+ act_cfg (dict): Config dict for activation layer. Defaults to ``dict(type='ReLU')``. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Defaults to False. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. """ expansion = 1 @@ -57,13 +60,14 @@ def __init__(self, style: str = 'pytorch', inflate: bool = True, non_local: bool = False, - non_local_cfg: ConfigType = dict(), - conv_cfg: ConfigType = dict(type='Conv3d'), - norm_cfg: ConfigType = dict(type='BN3d'), - act_cfg: ConfigType = dict(type='ReLU'), + non_local_cfg: Dict = dict(), + conv_cfg: Dict = dict(type='Conv3d'), + norm_cfg: Dict = dict(type='BN3d'), + act_cfg: Dict = dict(type='ReLU'), with_cp: bool = False, + init_cfg: Optional[Union[Dict, List[Dict]]] = None, **kwargs) -> None: - super().__init__() + super().__init__(init_cfg=init_cfg) assert style in ['pytorch', 'caffe'] # make sure that only ``inflate_style`` is passed into kwargs assert set(kwargs).issubset(['inflate_style']) @@ -130,7 +134,7 @@ def __init__(self, self.non_local_block = NonLocal3d(self.conv2.norm.num_features, **self.non_local_cfg) - def forward(self, x: Tensor) -> Tensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: """Defines the computation performed at every call.""" def _inner_forward(x): @@ -158,7 +162,7 @@ def _inner_forward(x): return out -class Bottleneck3d(nn.Module): +class Bottleneck3d(BaseModule): """Bottleneck 3d block for ResNet3D. Args: @@ -170,25 +174,27 @@ class Bottleneck3d(nn.Module): Defaults to 1. dilation (int): Spacing between kernel elements. Defaults to 1. downsample (nn.Module, optional): Downsample layer. Defaults to None. - style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, the + style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the stride-two layer is the 3x3 conv layer, otherwise the stride-two - layer is the first 1x1 conv layer. Defaults to ``pytorch``. + layer is the first 1x1 conv layer. Defaults to ``'pytorch'``. inflate (bool): Whether to inflate kernel. Defaults to True. - inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the + inflate_style (str): '3x1x1' or '3x3x3'. which determines the kernel sizes and padding strides for conv1 and conv2 in each block. - Defaults to ``3x1x1``. + Defaults to ``'3x1x1'``. non_local (bool): Determine whether to apply non-local module in this block. Defaults to False. - non_local_cfg (dict or ConfigDict): Config for non-local module. + non_local_cfg (dict): Config for non-local module. Defaults to ``dict()``. - conv_cfg (dict or ConfigDict): Config dict for convolution layer. + conv_cfg (dict): Config dict for convolution layer. Defaults to ``dict(type='Conv3d')``. - norm_cfg (dict or ConfigDict): Config for norm layers. required + norm_cfg (dict): Config for norm layers. required keys are ``type``. Defaults to ``dict(type='BN3d')``. - act_cfg (dict or ConfigDict): Config dict for activation layer. + act_cfg (dict): Config dict for activation layer. Defaults to ``dict(type='ReLU')``. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Defaults to False. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
""" expansion = 4 @@ -203,12 +209,13 @@ def __init__(self, inflate: bool = True, inflate_style: str = '3x1x1', non_local: bool = False, - non_local_cfg: ConfigType = dict(), - conv_cfg: ConfigType = dict(type='Conv3d'), - norm_cfg: ConfigType = dict(type='BN3d'), - act_cfg: ConfigType = dict(type='ReLU'), - with_cp: bool = False) -> None: - super().__init__() + non_local_cfg: Dict = dict(), + conv_cfg: Dict = dict(type='Conv3d'), + norm_cfg: Dict = dict(type='BN3d'), + act_cfg: Dict = dict(type='ReLU'), + with_cp: bool = False, + init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None: + super().__init__(init_cfg=init_cfg) assert style in ['pytorch', 'caffe'] assert inflate_style in ['3x1x1', '3x3x3'] @@ -297,7 +304,7 @@ def __init__(self, self.non_local_block = NonLocal3d(self.conv3.norm.num_features, **self.non_local_cfg) - def forward(self, x: Tensor) -> Tensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: """Defines the computation performed at every call.""" def _inner_forward(x): @@ -327,23 +334,23 @@ def _inner_forward(x): @MODELS.register_module() -class ResNet3d(nn.Module): +class ResNet3d(BaseModule): """ResNet 3d backbone. Args: - depth (int): Depth of resnet, from - {``18``, ``34``, ``50``, ``101``, ``152``}. + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + Defaults to 50. pretrained (str, optional): Name of pretrained model. Defaults to None. stage_blocks (tuple, optional): Set number of stages for each res layer. Defaults to None. pretrained2d (bool): Whether to load pretrained 2D model. Defaults to True. in_channels (int): Channel num of input features. Defaults to 3. + num_stages (int): Resnet stages. Defaults to 4. base_channels (int): Channel num of stem output features. Defaults to 64. out_indices (Sequence[int]): Indices of output feature. - Defaults to ```(3, )``. - num_stages (int): Resnet stages. Defaults to 4. + Defaults to ``(3, )``. spatial_strides (Sequence[int]): Spatial strides of residual blocks of each stage. Defaults to ``(1, 2, 2, 2)``. @@ -363,9 +370,9 @@ class ResNet3d(nn.Module): pool1_stride_t (int): Temporal stride of the first pooling layer. Defaults to 1. with_pool2 (bool): Whether to use pool2. Defaults to True. - style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, the + style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the stride-two layer is the 3x3 conv layer, otherwise the stride-two - layer is the first 1x1 conv layer. Defaults to ``pytorch``. + layer is the first 1x1 conv layer. Defaults to ``'pytorch'``. frozen_stages (int): Stages to be frozen (all param fixed). -1 means not freezing any parameters. Defaults to -1. inflate (Sequence[int]): Inflate Dims of each block. @@ -373,12 +380,12 @@ class ResNet3d(nn.Module): inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the kernel sizes and padding strides for conv1 and conv2 in each block. Defaults to ``3x1x1``. - conv_cfg (dict or ConfigDict): Config for conv layers. + conv_cfg (dict): Config for conv layers. Required keys are ``type``. Defaults to ``dict(type='Conv3d')``. - norm_cfg (dict or ConfigDict): Config for norm layers. + norm_cfg (dict): Config for norm layers. Required keys are ``type`` and ``requires_grad``. Defaults to ``dict(type='BN3d', requires_grad=True)``. - act_cfg (dict or ConfigDict): Config dict for activation layer. + act_cfg (dict): Config dict for activation layer. Defaults to ``dict(type='ReLU', inplace=True)``. 
norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze running stats (``mean`` and ``var``). Defaults to False. @@ -387,11 +394,13 @@ class ResNet3d(nn.Module): non_local (Sequence[int]): Determine whether to apply non-local module in the corresponding block of each stages. Defaults to ``(0, 0, 0, 0)``. - non_local_cfg (dict or ConfigDict): Config for non-local module. + non_local_cfg (dict): Config for non-local module. Defaults to ``dict()``. zero_init_residual (bool): Whether to use zero initialization for residual block, Defaults to True. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. """ arch_settings = { @@ -403,7 +412,7 @@ class ResNet3d(nn.Module): } def __init__(self, - depth: int, + depth: int = 50, pretrained: Optional[str] = None, stage_blocks: Optional[Tuple] = None, pretrained2d: bool = True, @@ -425,16 +434,17 @@ def __init__(self, frozen_stages: int = -1, inflate: Sequence[int] = (1, 1, 1, 1), inflate_style: str = '3x1x1', - conv_cfg: ConfigType = dict(type='Conv3d'), - norm_cfg: ConfigType = dict(type='BN3d', requires_grad=True), - act_cfg: ConfigType = dict(type='ReLU', inplace=True), + conv_cfg: Dict = dict(type='Conv3d'), + norm_cfg: Dict = dict(type='BN3d', requires_grad=True), + act_cfg: Dict = dict(type='ReLU', inplace=True), norm_eval: bool = False, with_cp: bool = False, non_local: Sequence[int] = (0, 0, 0, 0), - non_local_cfg: ConfigType = dict(), + non_local_cfg: Dict = dict(), zero_init_residual: bool = True, + init_cfg: Optional[Union[Dict, List[Dict]]] = None, **kwargs) -> None: - super().__init__() + super().__init__(init_cfg=init_cfg) if depth not in self.arch_settings: raise KeyError(f'invalid depth {depth} for resnet') self.depth = depth @@ -486,6 +496,8 @@ def __init__(self, self._make_stem_layer() self.res_layers = [] + lateral_inplanes = getattr(self, 'lateral_inplanes', [0, 0, 0, 0]) + for i, num_blocks in enumerate(self.stage_blocks): spatial_stride = spatial_strides[i] temporal_stride = temporal_strides[i] @@ -493,7 +505,7 @@ def __init__(self, planes = self.base_channels * 2**i res_layer = self.make_res_layer( self.block, - self.inplanes, + self.inplanes + lateral_inplanes[i], planes, num_blocks, spatial_stride=spatial_stride, @@ -514,8 +526,8 @@ def __init__(self, self.add_module(layer_name, res_layer) self.res_layers.append(layer_name) - self.feat_dim = self.block.expansion * self.base_channels * 2**( - len(self.stage_blocks) - 1) + self.feat_dim = self.block.expansion * \ + self.base_channels * 2 ** (len(self.stage_blocks) - 1) @staticmethod def make_res_layer(block: nn.Module, @@ -529,11 +541,11 @@ def make_res_layer(block: nn.Module, inflate: Union[int, Sequence[int]] = 1, inflate_style: str = '3x1x1', non_local: Union[int, Sequence[int]] = 0, - non_local_cfg: ConfigType = dict(), - norm_cfg: OptConfigType = None, - act_cfg: OptConfigType = None, - conv_cfg: OptConfigType = None, - with_cp: Optional[bool] = False, + non_local_cfg: Dict = dict(), + norm_cfg: Optional[Dict] = None, + act_cfg: Optional[Dict] = None, + conv_cfg: Optional[Dict] = None, + with_cp: bool = False, **kwargs) -> nn.Module: """Build residual layer for ResNet3D. @@ -549,25 +561,25 @@ def make_res_layer(block: nn.Module, temporal_stride (int | Sequence[int]): Temporal strides in residual and conv layers. Defaults to 1. dilation (int): Spacing between kernel elements. Defaults to 1. - style (str): ``pytorch`` or ``caffe``. 
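Editor's note: the new `lateral_inplanes` hook widens each residual stage's input by the channels contributed by lateral fusion. The sketch below only traces that channel bookkeeping for a ResNet-50-style pathway; the lateral channel numbers are illustrative, not taken from the patch.

```python
block_expansion = 4
base_channels = 64
stage_blocks = (3, 4, 6, 3)
lateral_inplanes = [8, 32, 64, 128]   # all zeros when there are no lateral connections

inplanes = base_channels
for i, num_blocks in enumerate(stage_blocks):
    planes = base_channels * 2 ** i
    stage_in = inplanes + lateral_inplanes[i]   # fused channels widen the stage input
    print(f'layer{i + 1}: blocks={num_blocks}, in={stage_in}, '
          f'out={planes * block_expansion}')
    inplanes = planes * block_expansion
```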
If set to ``pytorch``, - the stride-two layer is the 3x3 conv layer, otherwise - the stride-two layer is the first 1x1 conv layer. - Default: ``pytorch``. + style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the + stride-two layer is the 3x3 conv layer,otherwise the + stride-two layer is the first 1x1 conv layer. + Defaults to ``'pytorch'``. inflate (int | Sequence[int]): Determine whether to inflate for each block. Defaults to 1. inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the kernel sizes and padding strides for conv1 and conv2 - in each block. Default: ``3x1x1``. + in each block. Default: ``'3x1x1'``. non_local (int | Sequence[int]): Determine whether to apply non-local module in the corresponding block of each stages. Defaults to 0. non_local_cfg (dict): Config for non-local module. Defaults to ``dict()``. - conv_cfg (dict or ConfigDict, optional): Config for conv layers. + conv_cfg (dict, optional): Config for conv layers. Defaults to None. - norm_cfg (dict or ConfigDict, optional): Config for norm layers. + norm_cfg (dict, optional): Config for norm layers. Defaults to None. - act_cfg (dict or ConfigDict, optional): Config for activate layers. + act_cfg (dict, optional): Config for activate layers. Defaults to None. with_cp (bool, optional): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. @@ -576,10 +588,10 @@ def make_res_layer(block: nn.Module, Returns: nn.Module: A residual layer for the given config. """ - inflate = inflate if not isinstance(inflate, - int) else (inflate, ) * blocks - non_local = non_local if not isinstance( - non_local, int) else (non_local, ) * blocks + inflate = inflate if not isinstance(inflate, int) \ + else (inflate,) * blocks + non_local = non_local if not isinstance(non_local, int) \ + else (non_local,) * blocks assert len(inflate) == blocks and len(non_local) == blocks downsample = None if spatial_stride != 1 or inplanes != planes * block.expansion: @@ -632,7 +644,7 @@ def make_res_layer(block: nn.Module, with_cp=with_cp, **kwargs)) - return nn.Sequential(*layers) + return Sequential(*layers) @staticmethod def _inflate_conv_params(conv3d: nn.Module, state_dict_2d: OrderedDict, @@ -645,7 +657,7 @@ def _inflate_conv_params(conv3d: nn.Module, state_dict_2d: OrderedDict, state_dict_2d (OrderedDict): The state dict of pretrained 2d model. module_name_2d (str): The name of corresponding conv module in the 2d model. - inflated_param_names (List[str]): List of parameters that have been + inflated_param_names (list[str]): List of parameters that have been inflated. """ weight_2d_name = module_name_2d + '.weight' @@ -674,7 +686,7 @@ def _inflate_bn_params(bn3d: nn.Module, state_dict_2d: OrderedDict, state_dict_2d (OrderedDict): The state dict of pretrained 2d model. module_name_2d (str): The name of corresponding bn module in the 2d model. - inflated_param_names (List[str]): List of parameters that have been + inflated_param_names (list[str]): List of parameters that have been inflated. """ for param_name, param in bn3d.named_parameters(): @@ -811,7 +823,7 @@ def _init_weights(self, pretrained: Optional[str] = None) -> None: Args: pretrained (str | None): The path of the pretrained weight. Will override the original `pretrained` if set. The arg is added to - be compatible with mmdet. Default: None. + be compatible with mmdet. Defaults to None. 
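Editor's note: `_inflate_conv_params` follows the usual I3D-style inflation, copying a pretrained 2D kernel along the temporal axis and rescaling by the temporal kernel size. The PyTorch sketch below illustrates that idea under those assumptions; it is a simplified stand-in, not the exact mmaction code.

```python
import torch
import torch.nn as nn

conv2d = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
conv3d = nn.Conv3d(3, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2),
                   padding=(2, 3, 3), bias=False)

# Repeat the 2D kernel over time and divide by the temporal size so the
# inflated filter produces activations of comparable magnitude.
kernel_t = conv3d.weight.shape[2]
inflated = conv2d.weight.data.unsqueeze(2).expand_as(conv3d.weight) / kernel_t
conv3d.weight.data.copy_(inflated)

x = torch.randn(1, 3, 8, 56, 56)
print(conv3d(x).shape)  # torch.Size([1, 64, 8, 28, 28])
```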
""" if pretrained: self.pretrained = pretrained @@ -822,7 +834,6 @@ def _init_weights(self, pretrained: Optional[str] = None) -> None: if self.pretrained2d: # Inflate 2D model into 3D model. self.inflate_weights(logger) - else: # Directly load 3D model. load_checkpoint( @@ -848,15 +859,16 @@ def init_weights(self, pretrained: Optional[str] = None) -> None: """Initialize weights.""" self._init_weights(self, pretrained) - def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor]]: + def forward(self, x: torch.Tensor) \ + -> Union[torch.Tensor, Tuple[torch.Tensor]]: """Defines the computation performed at every call. Args: - x (Tensor): The input data. + x (torch.Tensor): The input data. Returns: - Tensor or Tuple[Tensor]: The feature of the input - samples extracted by the backbone. + torch.Tensor or tuple[torch.Tensor]: The feature of the input + samples extracted by the backbone. """ x = self.conv1(x) if self.with_pool1: @@ -885,12 +897,11 @@ def train(self, mode: bool = True) -> None: @MODELS.register_module() -class ResNet3dLayer(nn.Module): +class ResNet3dLayer(BaseModule): """ResNet 3d Layer. Args: - depth (int): Depth of resnet, - from {``18``, ``34``, ``50``, ``101``, ``152``}. + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. pretrained (str, optional): Name of pretrained model. Defaults to None. pretrained2d (bool): Whether to load pretrained 2D model. Defaults to True. @@ -902,20 +913,20 @@ class ResNet3dLayer(nn.Module): temporal_stride (int): The 1st res block's temporal stride. Defaults to 1. dilation (int): The dilation. Defaults to 1. - style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, the + style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the stride-two layer is the 3x3 conv layer, otherwise the stride-two - layer is the first 1x1 conv layer. Defaults to ``pytorch``. + layer is the first 1x1 conv layer. Defaults to ``'pytorch'``. all_frozen (bool): Frozen all modules in the layer. Defaults to False. inflate (int): Inflate dims of each block. Defaults to 1. inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the kernel sizes and padding strides for conv1 and conv2 in each block. - Defaults to ``3x1x1``. - conv_cfg (dict or ConfigDict): Config for conv layers. + Defaults to ``'3x1x1'``. + conv_cfg (dict): Config for conv layers. Required keys are ``type``. Defaults to ``dict(type='Conv3d')``. - norm_cfg (dict or ConfigDict): Config for norm layers. + norm_cfg (dict): Config for norm layers. Required keys are ``type`` and ``requires_grad``. Defaults to ``dict(type='BN3d', requires_grad=True)``. - act_cfg (dict or ConfigDict): Config dict for activation layer. + act_cfg (dict): Config dict for activation layer. Defaults to ``dict(type='ReLU', inplace=True)``. norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze running stats (``mean`` and ``var``). Defaults to False. @@ -924,6 +935,8 @@ class ResNet3dLayer(nn.Module): zero_init_residual (bool): Whether to use zero initialization for residual block, Defaults to True. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
""" def __init__(self, @@ -939,14 +952,15 @@ def __init__(self, all_frozen: bool = False, inflate: int = 1, inflate_style: str = '3x1x1', - conv_cfg: ConfigType = dict(type='Conv3d'), - norm_cfg: ConfigType = dict(type='BN3d', requires_grad=True), - act_cfg: ConfigType = dict(type='ReLU', inplace=True), + conv_cfg: Dict = dict(type='Conv3d'), + norm_cfg: Dict = dict(type='BN3d', requires_grad=True), + act_cfg: Dict = dict(type='ReLU', inplace=True), norm_eval: bool = False, with_cp: bool = False, zero_init_residual: bool = True, + init_cfg: Optional[Union[Dict, List[Dict]]] = None, **kwargs) -> None: - super().__init__() + super().__init__(init_cfg=init_cfg) self.arch_settings = ResNet3d.arch_settings assert depth in self.arch_settings @@ -1022,15 +1036,15 @@ def init_weights(self, pretrained: Optional[str] = None) -> None: """Initialize weights.""" self._init_weights(self, pretrained) - def forward(self, x: Tensor) -> Tensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: """Defines the computation performed at every call. Args: - x (Tensor): The input data. + x (torch.Tensor): The input data. Returns: - Tensor: The feature of the input - samples extracted by the resisual layer. + torch.Tensor: The feature of the input + samples extracted by the residual layer. """ res_layer = getattr(self, self.layer_name) out = res_layer(x) diff --git a/mmaction/models/backbones/resnet3d_slowfast.py b/mmaction/models/backbones/resnet3d_slowfast.py index 4417882c4b..c4ca8b8032 100644 --- a/mmaction/models/backbones/resnet3d_slowfast.py +++ b/mmaction/models/backbones/resnet3d_slowfast.py @@ -1,27 +1,88 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings from collections import OrderedDict -from typing import List, Optional, Sequence, Union +from typing import Dict, List, Optional, Tuple, Union import torch import torch.nn as nn from mmcv.cnn import ConvModule from mmengine.logging import MMLogger, print_log +from mmengine.model import BaseModule from mmengine.model.weight_init import kaiming_init from mmengine.runner.checkpoint import _load_checkpoint, load_checkpoint -from torch import Tensor from mmaction.registry import MODELS -from mmaction.utils import ConfigType, OptConfigType from .resnet3d import ResNet3d +class DeConvModule(BaseModule): + """A deconv module that bundles deconv/norm/activation layers. + + Args: + in_channels (int): Number of channels in the input feature map. + out_channels (int): Number of channels produced by the convolution. + kernel_size (int | tuple[int]): Size of the convolving kernel. + stride (int | tuple[int]): Stride of the convolution. + padding (int | tuple[int]): Zero-padding added to both sides of + the input. + bias (bool): Whether to add a learnable bias to the output. + Defaults to False. + with_bn (bool): Whether to add a BN layer. Defaults to True. + with_relu (bool): Whether to add a ReLU layer. Defaults to True. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: Union[int, Tuple[int]] = (1, 1, 1), + padding: Union[int, Tuple[int]] = 0, + bias: bool = False, + with_bn: bool = True, + with_relu: bool = True) -> None: + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.bias = bias + self.with_bn = with_bn + self.with_relu = with_relu + + self.conv = nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + bias=bias) + self.bn = nn.BatchNorm3d(out_channels) + self.relu = nn.ReLU() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + # x should be a 5-d tensor + assert len(x.shape) == 5 + N, C, T, H, W = x.shape + out_shape = (N, self.out_channels, self.stride[0] * T, + self.stride[1] * H, self.stride[2] * W) + x = self.conv(x, output_size=out_shape) + if self.with_bn: + x = self.bn(x) + if self.with_relu: + x = self.relu(x) + return x + + class ResNet3dPathway(ResNet3d): """A pathway of Slowfast based on ResNet3d. Args: lateral (bool): Determines whether to enable the lateral connection from another pathway. Defaults to False. + lateral_inv (bool): Whether to use deconv to upscale the time + dimension of features from another pathway. Defaults to False. lateral_norm (bool): Determines whether to enable the lateral norm in lateral layers. Defaults to False. speed_ratio (int): Speed ratio indicating the ratio between time @@ -32,181 +93,112 @@ class ResNet3dPathway(ResNet3d): Defaults to 8. fusion_kernel (int): The kernel size of lateral fusion. Defaults to 5. + lateral_infl (int): The ratio of the inflated channels. + Defaults to 2. + lateral_activate (list[int]): Flags for activating the lateral + connection. Defaults to ``[1, 1, 1, 1]``. 
""" def __init__(self, - *args, lateral: bool = False, + lateral_inv: bool = False, lateral_norm: bool = False, speed_ratio: int = 8, channel_ratio: int = 8, fusion_kernel: int = 5, + lateral_infl: int = 2, + lateral_activate: List[int] = [1, 1, 1, 1], **kwargs) -> None: self.lateral = lateral + self.lateral_inv = lateral_inv self.lateral_norm = lateral_norm self.speed_ratio = speed_ratio self.channel_ratio = channel_ratio self.fusion_kernel = fusion_kernel - super().__init__(*args, **kwargs) + self.lateral_infl = lateral_infl + self.lateral_activate = lateral_activate + self._calculate_lateral_inplanes(kwargs) + + super().__init__(**kwargs) self.inplanes = self.base_channels - if self.lateral: - self.conv1_lateral = ConvModule( - self.inplanes // self.channel_ratio, - # https://arxiv.org/abs/1812.03982, the - # third type of lateral connection has out_channel: - # 2 * \beta * C - self.inplanes * 2 // self.channel_ratio, - kernel_size=(fusion_kernel, 1, 1), - stride=(self.speed_ratio, 1, 1), - padding=((fusion_kernel - 1) // 2, 0, 0), - bias=False, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg if self.lateral_norm else None, - act_cfg=self.act_cfg if self.lateral_norm else None) + if self.lateral and self.lateral_activate[0] == 1: + if self.lateral_inv: + self.conv1_lateral = DeConvModule( + self.inplanes * self.channel_ratio, + self.inplanes * self.channel_ratio // lateral_infl, + kernel_size=(fusion_kernel, 1, 1), + stride=(self.speed_ratio, 1, 1), + padding=((fusion_kernel - 1) // 2, 0, 0), + with_bn=True, + with_relu=True) + else: + self.conv1_lateral = ConvModule( + self.inplanes // self.channel_ratio, + self.inplanes * lateral_infl // self.channel_ratio, + kernel_size=(fusion_kernel, 1, 1), + stride=(self.speed_ratio, 1, 1), + padding=((fusion_kernel - 1) // 2, 0, 0), + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg if self.lateral_norm else None, + act_cfg=self.act_cfg if self.lateral_norm else None) self.lateral_connections = [] for i in range(len(self.stage_blocks)): planes = self.base_channels * 2**i self.inplanes = planes * self.block.expansion - if lateral and i != self.num_stages - 1: + if lateral and i != self.num_stages - 1 \ + and self.lateral_activate[i + 1]: # no lateral connection needed in final stage lateral_name = f'layer{(i + 1)}_lateral' - setattr( - self, lateral_name, - ConvModule( + if self.lateral_inv: + conv_module = DeConvModule( + self.inplanes * self.channel_ratio, + self.inplanes * self.channel_ratio // lateral_infl, + kernel_size=(fusion_kernel, 1, 1), + stride=(self.speed_ratio, 1, 1), + padding=((fusion_kernel - 1) // 2, 0, 0), + bias=False, + with_bn=True, + with_relu=True) + else: + conv_module = ConvModule( self.inplanes // self.channel_ratio, - self.inplanes * 2 // self.channel_ratio, + self.inplanes * lateral_infl // self.channel_ratio, kernel_size=(fusion_kernel, 1, 1), stride=(self.speed_ratio, 1, 1), padding=((fusion_kernel - 1) // 2, 0, 0), bias=False, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg if self.lateral_norm else None, - act_cfg=self.act_cfg if self.lateral_norm else None)) + act_cfg=self.act_cfg if self.lateral_norm else None) + setattr(self, lateral_name, conv_module) self.lateral_connections.append(lateral_name) - def make_res_layer(self, - block: nn.Module, - inplanes: int, - planes: int, - blocks: int, - spatial_stride: Union[int, Sequence[int]] = 1, - temporal_stride: Union[int, Sequence[int]] = 1, - dilation: int = 1, - style: str = 'pytorch', - inflate: Union[int, Sequence[int]] = 1, - inflate_style: str = 
'3x1x1', - non_local: Union[int, Sequence[int]] = 0, - non_local_cfg: ConfigType = dict(), - norm_cfg: OptConfigType = None, - act_cfg: OptConfigType = None, - conv_cfg: OptConfigType = None, - with_cp: Optional[bool] = False, - **kwargs) -> nn.Module: - """Build residual layer for SlowFast. - - Args: - block (nn.Module): Residual module to be built. - inplanes (int): Number of channels for the input feature - in each block. - planes (int): Number of channels for the output feature - in each block. - blocks (int): Number of residual blocks. - spatial_stride (int | Sequence[int]): Spatial strides in - residual and conv layers. Defaults to 1. - temporal_stride (int | Sequence[int]): Temporal strides in - residual and conv layers. Defaults to 1. - dilation (int): Spacing between kernel elements. Defaults to 1. - style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, - the stride-two layer is the 3x3 conv layer, otherwise - the stride-two layer is the first 1x1 conv layer. - Default: ``pytorch``. - inflate (int | Sequence[int]): Determine whether to inflate - for each block. Defaults to 1. - inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines - the kernel sizes and padding strides for conv1 and conv2 - in each block. Default: ``3x1x1``. - non_local (int | Sequence[int]): Determine whether to apply - non-local module in the corresponding block of each stages. - Defaults to 0. - non_local_cfg (dict): Config for non-local module. - Defaults to ``dict()``. - conv_cfg (dict or ConfigDict, optional): Config for conv layers. - Defaults to None. - norm_cfg (dict or ConfigDict, optional): Config for norm layers. - Defaults to None. - act_cfg (dict or ConfigDict, optional): Config for activate layers. - Defaults to None. - with_cp (bool, optional): Use checkpoint or not. Using checkpoint - will save some memory while slowing down the training speed. - Defaults to False. - - Returns: - nn.Module: A residual layer for the given config. 
- """ - inflate = inflate if not isinstance(inflate, - int) else (inflate, ) * blocks - non_local = non_local if not isinstance( - non_local, int) else (non_local, ) * blocks - assert len(inflate) == blocks and len(non_local) == blocks - if self.lateral: - lateral_inplanes = inplanes * 2 // self.channel_ratio - else: - lateral_inplanes = 0 - if (spatial_stride != 1 - or (inplanes + lateral_inplanes) != planes * block.expansion): - downsample = ConvModule( - inplanes + lateral_inplanes, - planes * block.expansion, - kernel_size=1, - stride=(temporal_stride, spatial_stride, spatial_stride), - bias=False, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None) - else: - downsample = None - - layers = [] - layers.append( - block( - inplanes + lateral_inplanes, - planes, - spatial_stride, - temporal_stride, - dilation, - downsample, - style=style, - inflate=(inflate[0] == 1), - inflate_style=inflate_style, - non_local=(non_local[0] == 1), - non_local_cfg=non_local_cfg, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - with_cp=with_cp)) - inplanes = planes * block.expansion - - for i in range(1, blocks): - layers.append( - block( - inplanes, - planes, - 1, - 1, - dilation, - style=style, - inflate=(inflate[i] == 1), - inflate_style=inflate_style, - non_local=(non_local[i] == 1), - non_local_cfg=non_local_cfg, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - with_cp=with_cp)) - - return nn.Sequential(*layers) + def _calculate_lateral_inplanes(self, kwargs): + """Calculate inplanes for lateral connection.""" + depth = kwargs.get('depth', 50) + expansion = 1 if depth < 50 else 4 + base_channels = kwargs.get('base_channels', 64) + lateral_inplanes = [] + for i in range(kwargs.get('num_stages', 4)): + if expansion % 2 == 0: + planes = base_channels * (2 ** i) * \ + ((expansion // 2) ** (i > 0)) + else: + planes = base_channels * (2**i) // (2**(i > 0)) + if self.lateral and self.lateral_activate[i]: + if self.lateral_inv: + lateral_inplane = planes * \ + self.channel_ratio // self.lateral_infl + else: + lateral_inplane = planes * \ + self.lateral_infl // self.channel_ratio + else: + lateral_inplane = 0 + lateral_inplanes.append(lateral_inplane) + self.lateral_inplanes = lateral_inplanes def inflate_weights(self, logger: MMLogger) -> None: """Inflate the resnet2d parameters to resnet3d pathway. @@ -280,7 +272,7 @@ def _inflate_conv_params(self, conv3d: nn.Module, state_dict_2d (OrderedDict): The state dict of pretrained 2d model. module_name_2d (str): The name of corresponding conv module in the 2d model. - inflated_param_names (List[str]): List of parameters that have been + inflated_param_names (list[str]): List of parameters that have been inflated. """ weight_2d_name = module_name_2d + '.weight' @@ -358,11 +350,11 @@ def init_weights(self, pretrained: Optional[str] = None) -> None: } -def build_pathway(cfg: ConfigType, *args, **kwargs) -> nn.Module: +def build_pathway(cfg: Dict, *args, **kwargs) -> nn.Module: """Build pathway. Args: - cfg (dict or ConfigDict): cfg should contain: + cfg (dict): cfg should contain: - type (str): identify backbone type. Returns: @@ -383,7 +375,7 @@ def build_pathway(cfg: ConfigType, *args, **kwargs) -> nn.Module: @MODELS.register_module() -class ResNet3dSlowFast(nn.Module): +class ResNet3dSlowFast(BaseModule): """Slowfast backbone. 
This module is proposed in `SlowFast Networks for Video Recognition @@ -403,57 +395,43 @@ class ResNet3dSlowFast(nn.Module): channel_ratio (int): Reduce the channel number of fast pathway by ``channel_ratio``, corresponding to :math:`\\beta` in the paper. Defaults to 8. - slow_pathway (dict or ConfigDict): Configuration of slow branch, should - contain necessary arguments for building the specific type of - pathway and: - type (str): type of backbone the pathway bases on. - lateral (bool): determine whether to build lateral connection - for the pathway. Defaults to - - .. code-block:: Python - - dict(type='ResNetPathway', - lateral=True, depth=50, pretrained=None, - conv1_kernel=(1, 7, 7), dilations=(1, 1, 1, 1), - conv1_stride_t=1, pool1_stride_t=1, inflate=(0, 0, 1, 1)) - - fast_pathway (dict or ConfigDict): Configuration of fast branch, - similar to ``slow_pathway``. Defaults to - - .. code-block:: Python - - dict(type='ResNetPathway', - lateral=False, depth=50, pretrained=None, base_channels=8, - conv1_kernel=(5, 7, 7), conv1_stride_t=1, pool1_stride_t=1) + slow_pathway (dict): Configuration of slow branch. Defaults to + ``dict(type='resnet3d', depth=50, pretrained=None, lateral=True, + conv1_kernel=(1, 7, 7), conv1_stride_t=1, pool1_stride_t=1, + inflate=(0, 0, 1, 1))``. + fast_pathway (dict): Configuration of fast branch. Defaults to + ``dict(type='resnet3d', depth=50, pretrained=None, lateral=False, + base_channels=8, conv1_kernel=(5, 7, 7), conv1_stride_t=1, + pool1_stride_t=1)``. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. """ - def __init__( - self, - pretrained, - resample_rate: int = 8, - speed_ratio: int = 8, - channel_ratio: int = 8, - slow_pathway: ConfigType = dict( - type='resnet3d', - depth=50, - pretrained=None, - lateral=True, - conv1_kernel=(1, 7, 7), - dilations=(1, 1, 1, 1), - conv1_stride_t=1, - pool1_stride_t=1, - inflate=(0, 0, 1, 1)), - fast_pathway: ConfigType = dict( - type='resnet3d', - depth=50, - pretrained=None, - lateral=False, - base_channels=8, - conv1_kernel=(5, 7, 7), - conv1_stride_t=1, - pool1_stride_t=1) - ) -> None: - super().__init__() + def __init__(self, + pretrained: Optional[str] = None, + resample_rate: int = 8, + speed_ratio: int = 8, + channel_ratio: int = 8, + slow_pathway: Dict = dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=True, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1)), + fast_pathway: Dict = dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1), + init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None: + super().__init__(init_cfg=init_cfg) self.pretrained = pretrained self.resample_rate = resample_rate self.speed_ratio = speed_ratio @@ -485,15 +463,15 @@ def init_weights(self, pretrained: Optional[str] = None) -> None: else: raise TypeError('pretrained must be a str or None') - def forward(self, x: Tensor) -> tuple: + def forward(self, x: torch.Tensor) -> tuple: """Defines the computation performed at every call. Args: - x (Tensor): The input data. + x (torch.Tensor): The input data. Returns: - Tuple[Tensor]: The feature of the input samples extracted - by the backbone. + tuple[torch.Tensor]: The feature of the input samples + extracted by the backbone. 
""" x_slow = nn.functional.interpolate( x, diff --git a/mmaction/models/backbones/resnet3d_slowonly.py b/mmaction/models/backbones/resnet3d_slowonly.py index 819063c0cd..3a2a3a3ac0 100644 --- a/mmaction/models/backbones/resnet3d_slowonly.py +++ b/mmaction/models/backbones/resnet3d_slowonly.py @@ -16,8 +16,6 @@ class ResNet3dSlowOnly(ResNet3dPathway): """SlowOnly backbone based on ResNet3dPathway. Args: - lateral (bool): Determines whether to enable the lateral connection - from another pathway. Defaults to False. conv1_kernel (Sequence[int]): Kernel size of the first conv layer. Defaults to ``(1, 7, 7)``. conv1_stride_t (int): Temporal stride of the first conv layer. @@ -30,8 +28,6 @@ class ResNet3dSlowOnly(ResNet3dPathway): """ def __init__(self, - *args, - lateral: bool = False, conv1_kernel: Sequence[int] = (1, 7, 7), conv1_stride_t: int = 1, pool1_stride_t: int = 1, @@ -39,8 +35,6 @@ def __init__(self, with_pool2: bool = False, **kwargs) -> None: super().__init__( - *args, - lateral=lateral, conv1_kernel=conv1_kernel, conv1_stride_t=conv1_stride_t, pool1_stride_t=pool1_stride_t, diff --git a/mmaction/models/backbones/rgbposeconv3d.py b/mmaction/models/backbones/rgbposeconv3d.py new file mode 100644 index 0000000000..6f54e3b6b5 --- /dev/null +++ b/mmaction/models/backbones/rgbposeconv3d.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Union + +import torch +import torch.nn as nn +from mmengine.logging import MMLogger, print_log +from mmengine.model import BaseModule +from mmengine.model.weight_init import constant_init, kaiming_init +from mmengine.runner.checkpoint import load_checkpoint +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmaction.registry import MODELS +from .resnet3d_slowfast import ResNet3dPathway + + +@MODELS.register_module() +class RGBPoseConv3D(BaseModule): + """RGBPoseConv3D backbone. + + Args: + pretrained (str): The file path to a pretrained model. + Defaults to None. + speed_ratio (int): Speed ratio indicating the ratio between time + dimension of the fast and slow pathway, corresponding to the + :math:`\\alpha` in the paper. Defaults to 4. + channel_ratio (int): Reduce the channel number of fast pathway + by ``channel_ratio``, corresponding to :math:`\\beta` in the paper. + Defaults to 4. + rgb_detach (bool): Whether to detach the gradients from the pose path. + Defaults to False. + pose_detach (bool): Whether to detach the gradients from the rgb path. + Defaults to False. + rgb_drop_path (float): The drop rate for dropping the features from + the pose path. Defaults to 0. + pose_drop_path (float): The drop rate for dropping the features from + the rgb path. Defaults to 0. + rgb_pathway (dict): Configuration of rgb branch. Defaults to + ``dict(num_stages=4, lateral=True, lateral_infl=1, + lateral_activate=(0, 0, 1, 1), fusion_kernel=7, base_channels=64, + conv1_kernel=(1, 7, 7), inflate=(0, 0, 1, 1), with_pool2=False)``. + pose_pathway (dict): Configuration of pose branch. Defaults to + ``dict(num_stages=3, stage_blocks=(4, 6, 3), lateral=True, + lateral_inv=True, lateral_infl=16, lateral_activate=(0, 1, 1), + fusion_kernel=7, in_channels=17, base_channels=32, + out_indices=(2, ), conv1_kernel=(1, 7, 7), conv1_stride_s=1, + conv1_stride_t=1, pool1_stride_s=1, pool1_stride_t=1, + inflate=(0, 1, 1), spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 1), with_pool2=False)``. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + pretrained: Optional[str] = None, + speed_ratio: int = 4, + channel_ratio: int = 4, + rgb_detach: bool = False, + pose_detach: bool = False, + rgb_drop_path: float = 0, + pose_drop_path: float = 0, + rgb_pathway: Dict = dict( + num_stages=4, + lateral=True, + lateral_infl=1, + lateral_activate=(0, 0, 1, 1), + fusion_kernel=7, + base_channels=64, + conv1_kernel=(1, 7, 7), + inflate=(0, 0, 1, 1), + with_pool2=False), + pose_pathway: Dict = dict( + num_stages=3, + stage_blocks=(4, 6, 3), + lateral=True, + lateral_inv=True, + lateral_infl=16, + lateral_activate=(0, 1, 1), + fusion_kernel=7, + in_channels=17, + base_channels=32, + out_indices=(2, ), + conv1_kernel=(1, 7, 7), + conv1_stride_s=1, + conv1_stride_t=1, + pool1_stride_s=1, + pool1_stride_t=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 1), + dilations=(1, 1, 1), + with_pool2=False), + init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.pretrained = pretrained + self.speed_ratio = speed_ratio + self.channel_ratio = channel_ratio + + if rgb_pathway['lateral']: + rgb_pathway['speed_ratio'] = speed_ratio + rgb_pathway['channel_ratio'] = channel_ratio + + if pose_pathway['lateral']: + pose_pathway['speed_ratio'] = speed_ratio + pose_pathway['channel_ratio'] = channel_ratio + + self.rgb_path = ResNet3dPathway(**rgb_pathway) + self.pose_path = ResNet3dPathway(**pose_pathway) + self.rgb_detach = rgb_detach + self.pose_detach = pose_detach + assert 0 <= rgb_drop_path <= 1 + assert 0 <= pose_drop_path <= 1 + self.rgb_drop_path = rgb_drop_path + self.pose_drop_path = pose_drop_path + + def init_weights(self) -> None: + """Initiate the parameters either from existing checkpoint or from + scratch.""" + for m in self.modules(): + if isinstance(m, nn.Conv3d): + kaiming_init(m) + elif isinstance(m, _BatchNorm): + constant_init(m, 1) + + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + msg = f'load model from: {self.pretrained}' + print_log(msg, logger=logger) + load_checkpoint(self, self.pretrained, strict=True, logger=logger) + elif self.pretrained is None: + # Init two branch separately. + self.rgb_path.init_weights() + self.pose_path.init_weights() + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, imgs: torch.Tensor, heatmap_imgs: torch.Tensor) -> tuple: + """Defines the computation performed at every call. + + Args: + imgs (torch.Tensor): The input data. + heatmap_imgs (torch.Tensor): The input data. + + Returns: + tuple[torch.Tensor]: The feature of the input + samples extracted by the backbone. + """ + if self.training: + rgb_drop_path = torch.rand(1) < self.rgb_drop_path + pose_drop_path = torch.rand(1) < self.pose_drop_path + else: + rgb_drop_path, pose_drop_path = False, False + # We assume base_channel for RGB and Pose are 64 and 32. 
+ x_rgb = self.rgb_path.conv1(imgs) + x_rgb = self.rgb_path.maxpool(x_rgb) + # N x 64 x 8 x 56 x 56 + x_pose = self.pose_path.conv1(heatmap_imgs) + x_pose = self.pose_path.maxpool(x_pose) + + x_rgb = self.rgb_path.layer1(x_rgb) + x_rgb = self.rgb_path.layer2(x_rgb) + x_pose = self.pose_path.layer1(x_pose) + + if hasattr(self.rgb_path, 'layer2_lateral'): + feat = x_pose.detach() if self.rgb_detach else x_pose + x_pose_lateral = self.rgb_path.layer2_lateral(feat) + if rgb_drop_path: + x_pose_lateral = x_pose_lateral.new_zeros(x_pose_lateral.shape) + + if hasattr(self.pose_path, 'layer1_lateral'): + feat = x_rgb.detach() if self.pose_detach else x_rgb + x_rgb_lateral = self.pose_path.layer1_lateral(feat) + if pose_drop_path: + x_rgb_lateral = x_rgb_lateral.new_zeros(x_rgb_lateral.shape) + + if hasattr(self.rgb_path, 'layer2_lateral'): + x_rgb = torch.cat((x_rgb, x_pose_lateral), dim=1) + + if hasattr(self.pose_path, 'layer1_lateral'): + x_pose = torch.cat((x_pose, x_rgb_lateral), dim=1) + + x_rgb = self.rgb_path.layer3(x_rgb) + x_pose = self.pose_path.layer2(x_pose) + + if hasattr(self.rgb_path, 'layer3_lateral'): + feat = x_pose.detach() if self.rgb_detach else x_pose + x_pose_lateral = self.rgb_path.layer3_lateral(feat) + if rgb_drop_path: + x_pose_lateral = x_pose_lateral.new_zeros(x_pose_lateral.shape) + + if hasattr(self.pose_path, 'layer2_lateral'): + feat = x_rgb.detach() if self.pose_detach else x_rgb + x_rgb_lateral = self.pose_path.layer2_lateral(feat) + if pose_drop_path: + x_rgb_lateral = x_rgb_lateral.new_zeros(x_rgb_lateral.shape) + + if hasattr(self.rgb_path, 'layer3_lateral'): + x_rgb = torch.cat((x_rgb, x_pose_lateral), dim=1) + + if hasattr(self.pose_path, 'layer2_lateral'): + x_pose = torch.cat((x_pose, x_rgb_lateral), dim=1) + + x_rgb = self.rgb_path.layer4(x_rgb) + x_pose = self.pose_path.layer3(x_pose) + + return x_rgb, x_pose diff --git a/mmaction/models/data_preprocessors/__init__.py b/mmaction/models/data_preprocessors/__init__.py index feccb87e2b..241f9b901a 100644 --- a/mmaction/models/data_preprocessors/__init__.py +++ b/mmaction/models/data_preprocessors/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from .data_preprocessor import ActionDataPreprocessor +from .multimodal_data_preprocessor import MultiModalDataPreprocessor -__all__ = ['ActionDataPreprocessor'] +__all__ = ['ActionDataPreprocessor', 'MultiModalDataPreprocessor'] diff --git a/mmaction/models/data_preprocessors/data_preprocessor.py b/mmaction/models/data_preprocessors/data_preprocessor.py index d2641bb6ab..5a11eefd3b 100644 --- a/mmaction/models/data_preprocessors/data_preprocessor.py +++ b/mmaction/models/data_preprocessors/data_preprocessor.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional, Sequence, Tuple, Union +from typing import List, Optional, Sequence, Tuple, Union import torch from mmengine.model import BaseDataPreprocessor, stack_batch from mmaction.registry import MODELS +from mmaction.utils.typing import SampleList @MODELS.register_module() @@ -12,13 +13,10 @@ class ActionDataPreprocessor(BaseDataPreprocessor): """Data pre-processor for action recognition tasks. Args: - mean (Sequence[float or int, optional): The pixel mean of channels + mean (Sequence[float or int], optional): The pixel mean of channels of images or stacked optical flow. Defaults to None. std (Sequence[float or int], optional): The pixel standard deviation of channels of images or stacked optical flow. Defaults to None. 
- pad_size_divisor (int): The size of padded image should be - divisible by ``pad_size_divisor``. Defaults to 1. - pad_value (float or int): The padded pixel value. Defaults to 0. to_rgb (bool): Whether to convert image from BGR to RGB. Defaults to False. blending (dict, optional): Config for batch blending. @@ -30,14 +28,10 @@ class ActionDataPreprocessor(BaseDataPreprocessor): def __init__(self, mean: Optional[Sequence[Union[float, int]]] = None, std: Optional[Sequence[Union[float, int]]] = None, - pad_size_divisor: int = 1, - pad_value: Union[float, int] = 0, to_rgb: bool = False, blending: Optional[dict] = None, format_shape: str = 'NCHW') -> None: super().__init__() - self.pad_size_divisor = pad_size_divisor - self.pad_value = pad_value self.to_rgb = to_rgb self.format_shape = format_shape @@ -49,7 +43,7 @@ def __init__(self, self._enable_normalize = True if self.format_shape == 'NCHW': normalizer_shape = (-1, 1, 1) - elif self.format_shape in ['NCTHW', 'NCTVM', 'MIX2d3d']: + elif self.format_shape in ['NCTHW', 'MIX2d3d']: normalizer_shape = (-1, 1, 1, 1) else: raise ValueError(f'Invalid format shape: {format_shape}') @@ -81,21 +75,21 @@ def forward(self, training (bool): Whether to enable training time augmentation. Returns: - dict or Tuple[dict]: Data in the same format as the model - input. + dict or Tuple[dict]: Data in the same format as the model input. """ + data = self.cast_data(data) if isinstance(data, dict): - return self.forward_onesample(data, training) + return self.forward_onesample(data, training=training) elif isinstance(data, tuple): outputs = [] for data_sample in data: - output = self.forward_onesample(data_sample, training) + output = self.forward_onesample(data_sample, training=training) outputs.append(output) return tuple(outputs) else: - raise TypeError('Unsupported data type for `data`!') + raise TypeError(f'Unsupported data type: {type(data)}!') - def forward_onesample(self, data: dict, training: bool = False) -> dict: + def forward_onesample(self, data, training: bool = False) -> dict: """Perform normalization, padding, bgr2rgb conversion and batch augmentation on one data sample. @@ -107,12 +101,18 @@ def forward_onesample(self, data: dict, training: bool = False) -> dict: dict: Data in the same format as the model input. """ - data = self.cast_data(data) inputs, data_samples = data['inputs'], data['data_samples'] + inputs, data_samples = self.preprocess(inputs, data_samples, training) + data['inputs'] = inputs + data['data_samples'] = data_samples + return data + def preprocess(self, + inputs: List[torch.Tensor], + data_samples: SampleList, + training: bool = False) -> Tuple: # --- Pad and stack -- - batch_inputs = stack_batch(inputs, self.pad_size_divisor, - self.pad_value) + batch_inputs = stack_batch(inputs) if self.format_shape == 'MIX2d3d': if batch_inputs.ndim == 4: @@ -147,5 +147,4 @@ def forward_onesample(self, data: dict, training: bool = False) -> dict: batch_inputs, data_samples = self.blending(batch_inputs, data_samples) - data['inputs'] = batch_inputs - return data + return batch_inputs, data_samples diff --git a/mmaction/models/data_preprocessors/multimodal_data_preprocessor.py b/mmaction/models/data_preprocessors/multimodal_data_preprocessor.py new file mode 100644 index 0000000000..1353c811d4 --- /dev/null +++ b/mmaction/models/data_preprocessors/multimodal_data_preprocessor.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
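# ---- Illustrative sketch (not part of this patch) -------------------------
# The MultiModalDataPreprocessor defined below dispatches every modality in
# ``inputs`` to its own preprocessor through the ``preprocess`` hook that
# ActionDataPreprocessor now exposes. A hypothetical config; the modality keys
# mirror the ``imgs``/``heatmap_imgs`` inputs used elsewhere in this patch and
# the normalization values are the usual ImageNet statistics, not values taken
# from a released config:
data_preprocessor = dict(
    type='MultiModalDataPreprocessor',
    preprocessors=dict(
        imgs=dict(
            type='ActionDataPreprocessor',
            mean=[123.675, 116.28, 103.53],
            std=[58.395, 57.12, 57.375],
            format_shape='NCTHW'),
        heatmap_imgs=dict(
            type='ActionDataPreprocessor',
            format_shape='NCTHW')))
# ----------------------------------------------------------------------------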
+from typing import Dict + +from mmengine.model import BaseDataPreprocessor, ModuleDict + +from mmaction.registry import MODELS + + +@MODELS.register_module() +class MultiModalDataPreprocessor(BaseDataPreprocessor): + """Multi-Modal data pre-processor for action recognition tasks.""" + + def __init__(self, preprocessors: Dict) -> None: + super().__init__() + self.preprocessors = ModuleDict() + for name, pre_cfg in preprocessors.items(): + assert 'type' in pre_cfg, ( + 'Each data preprocessor should contain the key type, ' + f'but got {pre_cfg}') + self.preprocessors[name] = MODELS.build(pre_cfg) + + def forward(self, data: Dict, training: bool = False) -> Dict: + """Preprocesses the data into the model input format. + + Args: + data (dict): Data returned by dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. + """ + data = self.cast_data(data) + inputs, data_samples = data['inputs'], data['data_samples'] + for modality, modality_data in inputs.items(): + preprocessor = self.preprocessors[modality] + modality_data, data_samples = preprocessor.preprocess( + modality_data, data_samples, training) + inputs[modality] = modality_data + + data['inputs'] = inputs + data['data_samples'] = data_samples + return data diff --git a/mmaction/models/heads/__init__.py b/mmaction/models/heads/__init__.py index 964f7b45e4..4cc8d20a4d 100644 --- a/mmaction/models/heads/__init__.py +++ b/mmaction/models/heads/__init__.py @@ -4,6 +4,7 @@ from .i3d_head import I3DHead from .mvit_head import MViTHead from .omni_head import OmniHead +from .rgbpose_head import RGBPoseHead from .slowfast_head import SlowFastHead from .timesformer_head import TimeSformerHead from .tpn_head import TPNHead @@ -16,5 +17,5 @@ __all__ = [ 'BaseHead', 'GCNHead', 'I3DHead', 'MViTHead', 'OmniHead', 'SlowFastHead', 'TPNHead', 'TRNHead', 'TSMHead', 'TSNAudioHead', 'TSNHead', - 'TimeSformerHead', 'X3DHead' + 'TimeSformerHead', 'X3DHead', 'RGBPoseHead' ] diff --git a/mmaction/models/heads/base.py b/mmaction/models/heads/base.py index 10ceae3dbb..8eafdc2cf2 100644 --- a/mmaction/models/heads/base.py +++ b/mmaction/models/heads/base.py @@ -1,18 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta, abstractmethod -from typing import Tuple, Union +from typing import Dict, Optional, Tuple, Union import torch import torch.nn as nn import torch.nn.functional as F from mmengine.model import BaseModule from mmengine.structures import LabelData -from torch import Tensor from mmaction.evaluation import top_k_accuracy from mmaction.registry import MODELS -from mmaction.utils import (ConfigType, LabelList, OptConfigType, - OptMultiConfig, SampleList) +from mmaction.utils import ForwardResults, SampleList class AvgConsensus(nn.Module): @@ -20,14 +18,14 @@ class AvgConsensus(nn.Module): Args: dim (int): Decide which dim consensus function to apply. - Default: 1. + Defaults to 1. """ def __init__(self, dim: int = 1) -> None: super().__init__() self.dim = dim - def forward(self, x: Tensor) -> Tensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: """Defines the computation performed at every call.""" return x.mean(dim=self.dim, keepdim=True) @@ -37,35 +35,34 @@ class BaseHead(BaseModule, metaclass=ABCMeta): All Head should subclass it. All subclass should overwrite: - - :meth:`init_weights`, initializing weights in some modules. - :meth:`forward`, supporting to forward both for training and testing. 
Args: num_classes (int): Number of classes to be classified. in_channels (int): Number of channels in input feature. - loss_cls (dict or ConfigDict): Config for building loss. - Default: dict(type='CrossEntropyLoss', loss_weight=1.0). + loss_cls (dict): Config for building loss. + Defaults to ``dict(type='CrossEntropyLoss', loss_weight=1.0)``. multi_class (bool): Determines whether it is a multi-class - recognition task. Default: False. + recognition task. Defaults to False. label_smooth_eps (float): Epsilon used in label smooth. - Reference: arxiv.org/abs/1906.02629. Default: 0. - topk (int or tuple): Top-k accuracy. Default: (1, 5). - average_clips (dict or ConfigDict, optional): Config for - averaging class scores over multiple clips. Default: None. - init_cfg (dict or ConfigDict, optional): Config to control the - initialization. Defaults to None. + Reference: arxiv.org/abs/1906.02629. Defaults to 0. + topk (int or tuple): Top-k accuracy. Defaults to ``(1, 5)``. + average_clips (dict, optional): Config for averaging class + scores over multiple clips. Defaults to None. + init_cfg (dict, optional): Config to control the initialization. + Defaults to None. """ def __init__(self, num_classes: int, in_channels: int, - loss_cls: ConfigType = dict( + loss_cls: Dict = dict( type='CrossEntropyLoss', loss_weight=1.0), multi_class: bool = False, label_smooth_eps: float = 0.0, topk: Union[int, Tuple[int]] = (1, 5), - average_clips: OptConfigType = None, - init_cfg: OptMultiConfig = None) -> None: + average_clips: Optional[Dict] = None, + init_cfg: Optional[Dict] = None) -> None: super(BaseHead, self).__init__(init_cfg=init_cfg) self.num_classes = num_classes self.in_channels = in_channels @@ -81,18 +78,19 @@ def __init__(self, self.topk = topk @abstractmethod - def forward(self, x, **kwargs) -> Tensor: + def forward(self, x, **kwargs) -> ForwardResults: """Defines the computation performed at every call.""" raise NotImplementedError - def loss(self, feats: Union[Tensor, Tuple[Tensor]], - data_samples: SampleList, **kwargs) -> dict: + def loss(self, feats: Union[torch.Tensor, Tuple[torch.Tensor]], + data_samples: SampleList, **kwargs) -> Dict: """Perform forward propagation of head and loss calculation on the features of the upstream network. Args: - feats (Tensor or Tuple[Tensor]): Features from upstream network. - data_samples (List[:obj:`ActionDataSample`]): The batch + feats (torch.Tensor | tuple[torch.Tensor]): Features from + upstream network. + data_samples (list[:obj:`ActionDataSample`]): The batch data samples. Returns: @@ -101,14 +99,14 @@ def loss(self, feats: Union[Tensor, Tuple[Tensor]], cls_scores = self(feats, **kwargs) return self.loss_by_feat(cls_scores, data_samples) - def loss_by_feat(self, cls_scores: Union[Tensor, Tuple[Tensor]], - data_samples: SampleList) -> dict: + def loss_by_feat(self, cls_scores: torch.Tensor, + data_samples: SampleList) -> Dict: """Calculate the loss based on the features extracted by the head. Args: - cls_scores (Tensor): Classification prediction results of + cls_scores (torch.Tensor): Classification prediction results of all class, has shape (batch_size, num_classes). - data_samples (List[:obj:`ActionDataSample`]): The batch + data_samples (list[:obj:`ActionDataSample`]): The batch data samples. 
Returns: @@ -149,32 +147,33 @@ def loss_by_feat(self, cls_scores: Union[Tensor, Tuple[Tensor]], losses['loss_cls'] = loss_cls return losses - def predict(self, feats: Union[Tensor, Tuple[Tensor]], - data_samples: SampleList, **kwargs) -> LabelList: + def predict(self, feats: Union[torch.Tensor, Tuple[torch.Tensor]], + data_samples: SampleList, **kwargs) -> SampleList: """Perform forward propagation of head and predict recognition results on the features of the upstream network. Args: - feats (Tensor or Tuple[Tensor]): Features from upstream network. - data_samples (List[:obj:`ActionDataSample`]): The batch + feats (torch.Tensor | tuple[torch.Tensor]): Features from + upstream network. + data_samples (list[:obj:`ActionDataSample`]): The batch data samples. Returns: - List[:obj:`ActionDataSample`]: Recognition results wrapped + list[:obj:`ActionDataSample`]: Recognition results wrapped by :obj:`ActionDataSample`. """ cls_scores = self(feats, **kwargs) return self.predict_by_feat(cls_scores, data_samples) - def predict_by_feat(self, cls_scores: Tensor, - data_samples: SampleList) -> LabelList: + def predict_by_feat(self, cls_scores: torch.Tensor, + data_samples: SampleList) -> SampleList: """Transform a batch of output features extracted from the head into prediction results. Args: - cls_scores (Tensor): Classification scores, has a shape - (num_classes, ) - data_samples (List[:obj:`ActionDataSample`]): The + cls_scores (torch.Tensor): Classification scores, has a shape + (B*num_segs, num_classes) + data_samples (list[:obj:`ActionDataSample`]): The annotation data of every samples. It usually includes information such as `gt_labels`. @@ -186,15 +185,17 @@ def predict_by_feat(self, cls_scores: Tensor, cls_scores = self.average_clip(cls_scores, num_segs=num_segs) pred_labels = cls_scores.argmax(dim=-1, keepdim=True).detach() - for data_sample, score, pred_lable in zip(data_samples, cls_scores, + for data_sample, score, pred_label in zip(data_samples, cls_scores, pred_labels): prediction = LabelData(item=score) - pred_label = LabelData(item=pred_lable) + pred_label = LabelData(item=pred_label) data_sample.pred_scores = prediction data_sample.pred_labels = pred_label return data_samples - def average_clip(self, cls_scores: Tensor, num_segs: int = 1) -> Tensor: + def average_clip(self, + cls_scores: torch.Tensor, + num_segs: int = 1) -> torch.Tensor: """Averaging class scores over multiple clips. Using different averaging types ('score' or 'prob' or None, @@ -202,11 +203,11 @@ def average_clip(self, cls_scores: Tensor, num_segs: int = 1) -> Tensor: class score. Only called in test mode. Args: - cls_scores (Tensor): Class scores to be averaged. + cls_scores (torch.Tensor): Class scores to be averaged. num_segs (int): Number of clips for each input sample. Returns: - Tensor: Averaged class scores. + torch.Tensor: Averaged class scores. """ if self.average_clips not in ['score', 'prob', None]: diff --git a/mmaction/models/heads/rgbpose_head.py b/mmaction/models/heads/rgbpose_head.py new file mode 100644 index 0000000000..69da4efed9 --- /dev/null +++ b/mmaction/models/heads/rgbpose_head.py @@ -0,0 +1,240 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Dict, List, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model.weight_init import normal_init +from mmengine.structures import LabelData + +from mmaction.evaluation import top_k_accuracy +from mmaction.registry import MODELS +from mmaction.utils import SampleList +from .base import BaseHead + + +@MODELS.register_module() +class RGBPoseHead(BaseHead): + """The classification head for RGBPoseConv3D. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (tuple[int]): Number of channels in input feature. + loss_cls (dict): Config for building loss. + Defaults to ``dict(type='CrossEntropyLoss')``. + loss_components (list[str]): The components of the loss. + Defaults to ``['rgb', 'pose']``. + loss_weights (float or tuple[float]): The weights of the losses. + Defaults to 1. + dropout (float): Probability of dropout layer. Default: 0.5. + init_std (float): Std value for Initiation. Default: 0.01. + """ + + def __init__(self, + num_classes: int, + in_channels: Tuple[int], + loss_cls: Dict = dict(type='CrossEntropyLoss'), + loss_components: List[str] = ['rgb', 'pose'], + loss_weights: Union[float, Tuple[float]] = 1., + dropout: float = 0.5, + init_std: float = 0.01, + **kwargs) -> None: + super().__init__(num_classes, in_channels, loss_cls, **kwargs) + if isinstance(dropout, float): + dropout = {'rgb': dropout, 'pose': dropout} + assert isinstance(dropout, dict) + + if loss_components is not None: + self.loss_components = loss_components + if isinstance(loss_weights, float): + loss_weights = [loss_weights] * len(loss_components) + assert len(loss_weights) == len(loss_components) + self.loss_weights = loss_weights + + self.dropout = dropout + self.init_std = init_std + + self.dropout_rgb = nn.Dropout(p=self.dropout['rgb']) + self.dropout_pose = nn.Dropout(p=self.dropout['pose']) + + self.fc_rgb = nn.Linear(self.in_channels[0], num_classes) + self.fc_pose = nn.Linear(self.in_channels[1], num_classes) + self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + normal_init(self.fc_rgb, std=self.init_std) + normal_init(self.fc_pose, std=self.init_std) + + def forward(self, x: Tuple[torch.Tensor]) -> Dict: + """Defines the computation performed at every call.""" + x_rgb, x_pose = self.avg_pool(x[0]), self.avg_pool(x[1]) + x_rgb = x_rgb.view(x_rgb.size(0), -1) + x_pose = x_pose.view(x_pose.size(0), -1) + + x_rgb = self.dropout_rgb(x_rgb) + x_pose = self.dropout_pose(x_pose) + + cls_scores = dict() + cls_scores['rgb'] = self.fc_rgb(x_rgb) + cls_scores['pose'] = self.fc_pose(x_pose) + + return cls_scores + + def loss(self, feats: Tuple[torch.Tensor], data_samples: SampleList, + **kwargs) -> Dict: + """Perform forward propagation of head and loss calculation on the + features of the upstream network. + + Args: + feats (tuple[torch.Tensor]): Features from upstream network. + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + dict: A dictionary of loss components. + """ + cls_scores = self(feats, **kwargs) + return self.loss_by_feat(cls_scores, data_samples) + + def loss_by_feat(self, cls_scores: Dict[str, torch.Tensor], + data_samples: SampleList) -> Dict: + """Calculate the loss based on the features extracted by the head. + + Args: + cls_scores (dict[str, torch.Tensor]): The dict of + classification scores, + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. 
+ + Returns: + dict: A dictionary of loss components. + """ + labels = torch.stack([x.gt_labels.item for x in data_samples]) + labels = labels.squeeze() + + if labels.shape == torch.Size([]): + labels = labels.unsqueeze(0) + elif labels.dim() == 1 and labels.size()[0] == self.num_classes \ + and cls_scores.size()[0] == 1: + # Fix a bug when training with soft labels and batch size is 1. + # When using soft labels, `labels` and `cls_score` share the same + # shape. + labels = labels.unsqueeze(0) + + losses = dict() + for loss_name, weight in zip(self.loss_components, self.loss_weights): + cls_score = cls_scores[loss_name] + loss_cls = self.loss_by_scores(cls_score, labels) + loss_cls = {loss_name + '_' + k: v for k, v in loss_cls.items()} + loss_cls[f'{loss_name}_loss_cls'] *= weight + losses.update(loss_cls) + return losses + + def loss_by_scores(self, cls_scores: torch.Tensor, + labels: torch.Tensor) -> Dict: + """Calculate the loss based on the features extracted by the head. + + Args: + cls_scores (torch.Tensor): Classification prediction + results of all class, has shape (batch_size, num_classes). + labels (torch.Tensor): The labels used to calculate the loss. + + Returns: + dict: A dictionary of loss components. + """ + losses = dict() + if cls_scores.size() != labels.size(): + top_k_acc = top_k_accuracy(cls_scores.detach().cpu().numpy(), + labels.detach().cpu().numpy(), + self.topk) + for k, a in zip(self.topk, top_k_acc): + losses[f'top{k}_acc'] = torch.tensor( + a, device=cls_scores.device) + if self.label_smooth_eps != 0: + if cls_scores.size() != labels.size(): + labels = F.one_hot(labels, num_classes=self.num_classes) + labels = ((1 - self.label_smooth_eps) * labels + + self.label_smooth_eps / self.num_classes) + + loss_cls = self.loss_cls(cls_scores, labels) + # loss_cls may be dictionary or single tensor + if isinstance(loss_cls, dict): + losses.update(loss_cls) + else: + losses['loss_cls'] = loss_cls + return losses + + def predict(self, feats: Tuple[torch.Tensor], data_samples: SampleList, + **kwargs) -> SampleList: + """Perform forward propagation of head and predict recognition results + on the features of the upstream network. + + Args: + feats (tuple[torch.Tensor]): Features from upstream network. + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + list[:obj:`ActionDataSample`]: Recognition results wrapped + by :obj:`ActionDataSample`. + """ + cls_scores = self(feats, **kwargs) + return self.predict_by_feat(cls_scores, data_samples) + + def predict_by_feat(self, cls_scores: Dict[str, torch.Tensor], + data_samples: SampleList) -> SampleList: + """Transform a batch of output features extracted from the head into + prediction results. + + Args: + cls_scores (dict[str, torch.Tensor]): The dict of + classification scores, + data_samples (list[:obj:`ActionDataSample`]): The + annotation data of every samples. It usually includes + information such as `gt_labels`. + + Returns: + list[:obj:`ActionDataSample`]: Recognition results wrapped + by :obj:`ActionDataSample`. 
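# ---- Illustrative sketch (not part of this patch) -------------------------
# RGBPoseHead keeps one dropout + fc pair per stream and returns a dict of
# class scores, as shown above. A rough shape check, assuming this patch is
# applied; the channel pair (2048, 512) and num_classes=60 are example values:
import torch
from mmaction.models.heads.rgbpose_head import RGBPoseHead

head = RGBPoseHead(num_classes=60, in_channels=(2048, 512))
head.init_weights()
feats = (torch.randn(2, 2048, 8, 7, 7), torch.randn(2, 512, 32, 7, 7))
scores = head(feats)
assert scores['rgb'].shape == (2, 60) and scores['pose'].shape == (2, 60)
# ----------------------------------------------------------------------------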
+ """ + pred_scores = [LabelData() for _ in range(len(data_samples))] + pred_labels = [LabelData() for _ in range(len(data_samples))] + + for name in self.loss_components: + cls_score = cls_scores[name] + cls_score, pred_label = \ + self.predict_by_scores(cls_score, data_samples) + for pred_score, pred_label, score, label in zip( + pred_scores, pred_labels, cls_score, pred_label): + pred_score.set_data({f'{name}': score}) + pred_label.set_data({f'{name}': label}) + + for data_sample, pred_score, pred_label in zip(data_samples, + pred_scores, + pred_labels): + data_sample.pred_scores = pred_score + data_sample.pred_labels = pred_label + + return data_samples + + def predict_by_scores(self, cls_scores: torch.Tensor, + data_samples: SampleList) -> Tuple: + """Transform a batch of output features extracted from the head into + prediction results. + + Args: + cls_scores (torch.Tensor): Classification scores, has a shape + (B*num_segs, num_classes) + data_samples (list[:obj:`ActionDataSample`]): The annotation + data of every samples. + + Returns: + tuple: A tuple of the averaged classification scores and + prediction labels. + """ + + num_segs = cls_scores.shape[0] // len(data_samples) + cls_scores = self.average_clip(cls_scores, num_segs=num_segs) + pred_labels = cls_scores.argmax(dim=-1, keepdim=True).detach() + return cls_scores, pred_labels diff --git a/mmaction/models/recognizers/__init__.py b/mmaction/models/recognizers/__init__.py index 1b7db21451..447f6333dc 100644 --- a/mmaction/models/recognizers/__init__.py +++ b/mmaction/models/recognizers/__init__.py @@ -2,11 +2,12 @@ from .base import BaseRecognizer from .recognizer2d import Recognizer2D from .recognizer3d import Recognizer3D +from .recognizer3d_mm import MMRecognizer3D from .recognizer_audio import RecognizerAudio from .recognizer_gcn import RecognizerGCN from .recognizer_omni import RecognizerOmni __all__ = [ 'BaseRecognizer', 'RecognizerGCN', 'Recognizer2D', 'Recognizer3D', - 'RecognizerAudio', 'RecognizerOmni' + 'RecognizerAudio', 'RecognizerOmni', 'MMRecognizer3D' ] diff --git a/mmaction/models/recognizers/recognizer3d_mm.py b/mmaction/models/recognizers/recognizer3d_mm.py new file mode 100644 index 0000000000..1d7099b3c3 --- /dev/null +++ b/mmaction/models/recognizers/recognizer3d_mm.py @@ -0,0 +1,50 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Tuple + +import torch + +from mmaction.registry import MODELS +from mmaction.utils.typing import OptSampleList +from .base import BaseRecognizer + + +@MODELS.register_module() +class MMRecognizer3D(BaseRecognizer): + """Multi-modal 3D recognizer model framework.""" + + def extract_feat(self, + inputs: Dict[str, torch.Tensor], + stage: str = 'backbone', + data_samples: OptSampleList = None, + test_mode: bool = False) -> Tuple: + """Extract features. + + Args: + inputs (dict[str, torch.Tensor]): The multi-modal input data. + stage (str): Which stage to output the feature. + Defaults to ``'backbone'``. + data_samples (list[:obj:`ActionDataSample`], optional): Action data + samples, which are only needed in training. Defaults to None. + test_mode (bool): Whether in test mode. Defaults to False. + + Returns: + tuple[torch.Tensor]: The extracted features. + dict: A dict recording the kwargs for downstream + pipeline. 
+ """ + # [N, num_views, C, T, H, W] -> + # [N * num_views, C, T, H, W] + for m, m_data in inputs.items(): + m_data = m_data.reshape((-1, ) + m_data.shape[2:]) + inputs[m] = m_data + + # Record the kwargs required by `loss` and `predict` + loss_predict_kwargs = dict() + + x = self.backbone(**inputs) + if stage == 'backbone': + return x, loss_predict_kwargs + + if self.with_cls_head and stage == 'head': + x = self.cls_head(x, **loss_predict_kwargs) + return x, loss_predict_kwargs diff --git a/mmaction/models/utils/blending_utils.py b/mmaction/models/utils/blending_utils.py index 64808d32f7..babea75d05 100644 --- a/mmaction/models/utils/blending_utils.py +++ b/mmaction/models/utils/blending_utils.py @@ -1,11 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta, abstractmethod -from typing import Union +from typing import List, Optional, Tuple, Union import numpy as np import torch import torch.nn.functional as F -from torch import Tensor from torch.distributions.beta import Beta from mmaction.registry import MODELS @@ -25,38 +24,39 @@ def __init__(self, num_classes: int) -> None: self.num_classes = num_classes @abstractmethod - def do_blending(self, imgs: Tensor, label: Tensor, **kwargs) -> tuple: + def do_blending(self, imgs: torch.Tensor, label: torch.Tensor, + **kwargs) -> Tuple: """Blending images process.""" raise NotImplementedError - def __call__(self, imgs: Tensor, batch_data_samples: SampleList, - **kwargs) -> tuple: + def __call__(self, imgs: torch.Tensor, batch_data_samples: SampleList, + **kwargs) -> Tuple: """Blending data in a mini-batch. Images are float tensors with the shape of (B, N, C, H, W) for 2D recognizers or (B, N, C, T, H, W) for 3D recognizers. Besides, labels are converted from hard labels to soft labels. - Hard labels are integer tensors with the shape of (B, 1) and all of the + Hard labels are integer tensors with the shape of (B, ) and all of the elements are in the range [0, num_classes - 1]. - Soft labels (probablity distribution over classes) are float tensors - with the shape of (B, 1, num_classes) and all of the elements are in + Soft labels (probability distribution over classes) are float tensors + with the shape of (B, num_classes) and all of the elements are in the range [0, 1]. Args: - imgs (Tensor): Model input images, float tensor with the + imgs (torch.Tensor): Model input images, float tensor with the shape of (B, N, C, H, W) or (B, N, C, T, H, W). batch_data_samples (List[:obj:`ActionDataSample`]): The batch data samples. It usually includes information such as `gt_labels`. Returns: - mixed_imgs (Tensor): Blending images, float tensor with the + mixed_imgs (torch.Tensor): Blending images, float tensor with the same shape of the input imgs. batch_data_samples (List[:obj:`ActionDataSample`]): The modified batch data samples. ``gt_labels`` in each data sample are converted from a hard label to a blended soft label, float - tensor with the shape of (1, num_classes) and all elements are + tensor with the shape of (num_classes, ) and all elements are in range [0, 1]. """ label = [x.gt_labels.item for x in batch_data_samples] @@ -90,13 +90,14 @@ def __init__(self, num_classes: int, alpha: float = .2) -> None: super().__init__(num_classes=num_classes) self.beta = Beta(alpha, alpha) - def do_blending(self, imgs: Tensor, label: Tensor, **kwargs) -> tuple: + def do_blending(self, imgs: torch.Tensor, label: torch.Tensor, + **kwargs) -> Tuple: """Blending images with mixup. 
Args: - imgs (Tensor): Model input images, float tensor with the + imgs (torch.Tensor): Model input images, float tensor with the shape of (B, N, C, H, W) or (B, N, C, T, H, W). - label (Tensor): One hot labels, integer tensor with the shape + label (torch.Tensor): One hot labels, integer tensor with the shape of (B, num_classes). Returns: @@ -132,7 +133,7 @@ def __init__(self, num_classes: int, alpha: float = .2) -> None: self.beta = Beta(alpha, alpha) @staticmethod - def rand_bbox(img_size: torch.Size, lam: Tensor) -> tuple: + def rand_bbox(img_size: torch.Size, lam: torch.Tensor) -> Tuple: """Generate a random boudning box.""" w = img_size[-1] h = img_size[-2] @@ -151,13 +152,14 @@ def rand_bbox(img_size: torch.Size, lam: Tensor) -> tuple: return bbx1, bby1, bbx2, bby2 - def do_blending(self, imgs: Tensor, label: Tensor, **kwargs) -> tuple: + def do_blending(self, imgs: torch.Tensor, label: torch.Tensor, + **kwargs) -> Tuple: """Blending images with cutmix. Args: - imgs (Tensor): Model input images, float tensor with the + imgs (torch.Tensor): Model input images, float tensor with the shape of (B, N, C, H, W) or (B, N, C, T, H, W). - label (Tensor): One hot labels, integer tensor with the shape + label (torch.Tensor): One hot labels, integer tensor with the shape of (B, num_classes). Returns: @@ -209,7 +211,9 @@ class RandomBatchAugment(BaseMiniBatchBlending): and to do nothing is 0.2. """ - def __init__(self, augments: Union[dict, list], probs=None): + def __init__(self, + augments: Union[dict, list], + probs: Optional[Union[float, List[float]]] = None) -> None: if not isinstance(augments, (tuple, list)): augments = [augments] @@ -235,7 +239,8 @@ def __init__(self, augments: Union[dict, list], probs=None): self.probs = probs - def do_blending(self, imgs: Tensor, label: Tensor, **kwargs) -> tuple: + def do_blending(self, imgs: torch.Tensor, label: torch.Tensor, + **kwargs) -> Tuple: """Randomly apply batch augmentations to the batch inputs and batch data samples.""" aug_index = np.random.choice(len(self.augments), p=self.probs) diff --git a/tests/datasets/transforms/test_formating.py b/tests/datasets/transforms/test_formating.py index 842d2dbf27..8e741c24e5 100644 --- a/tests/datasets/transforms/test_formating.py +++ b/tests/datasets/transforms/test_formating.py @@ -101,8 +101,8 @@ def test_repr(self): type='PackActionInputs', meta_keys=['flip_direction', 'img_shape']) transform = TRANSFORMS.build(cfg) self.assertEqual( - repr(transform), - "PackActionInputs(meta_keys=['flip_direction', 'img_shape'])") + repr(transform), 'PackActionInputs(collect_keys=None, ' + "meta_keys=['flip_direction', 'img_shape'])") class TestPackLocalizationInputs(unittest.TestCase): @@ -184,8 +184,24 @@ def test_format_shape(): target_keys = ['imgs', 'input_shape'] assert assert_dict_has_keys(results, target_keys) - assert repr(format_shape) == format_shape.__class__.__name__ + \ - "(input_format='NCTHW')" + # `NCTHW` input format with imgs and heatmap_imgs + results = dict( + imgs=np.random.randn(6, 224, 224, 3), + heatmap_imgs=np.random.randn(12, 17, 56, 56), + num_clips=2, + clip_len=dict(RGB=3, Pose=6)) + + results = format_shape(results) + assert results['input_shape'] == (2, 3, 3, 224, 224) + assert results['heatmap_input_shape'] == (2, 17, 6, 56, 56) + + assert repr(format_shape) == "FormatShape(input_format='NCTHW')" + + # `NCTHW_Heatmap` input format + results = dict( + imgs=np.random.randn(12, 17, 56, 56), num_clips=2, clip_len=6) + format_shape = FormatShape('NCTHW_Heatmap') + assert 
format_shape(results)['input_shape'] == (2, 17, 6, 56, 56) # `NCHW_Flow` input format results = dict(imgs=np.random.randn(6, 224, 224), num_clips=1, clip_len=3) diff --git a/tests/datasets/transforms/test_loading.py b/tests/datasets/transforms/test_loading.py index 5413475a92..035a2213cc 100644 --- a/tests/datasets/transforms/test_loading.py +++ b/tests/datasets/transforms/test_loading.py @@ -260,21 +260,23 @@ def test_pims_decode(self): video_result['frame_inds']), 256, 340, 3) def test_decord_init(self): - target_keys = ['video_reader', 'total_frames'] + target_keys = ['video_reader', 'total_frames', 'avg_fps'] video_result = copy.deepcopy(self.video_results) decord_init = DecordInit() decord_init_result = decord_init(video_result) assert assert_dict_has_keys(decord_init_result, target_keys) assert decord_init_result['total_frames'] == len( decord_init_result['video_reader']) + assert decord_init_result['avg_fps'] == 30 + assert repr(decord_init) == (f'{decord_init.__class__.__name__}(' f'io_backend=disk, ' - f'num_threads={1})') + f'num_threads=1)') def test_decord_decode(self): target_keys = ['frame_inds', 'imgs', 'original_shape'] - # test Decord with 2 dim input and start_index = 0 + # test Decord with 2 dim input using accurate mode video_result = copy.deepcopy(self.video_results) video_result['frame_inds'] = np.arange(0, self.total_frames, 3)[:, np.newaxis] @@ -289,7 +291,7 @@ def test_decord_decode(self): assert np.shape(decord_decode_result['imgs']) == (len( video_result['frame_inds']), 256, 340, 3) - # test Decord with 1 dim input and start_index = 0 + # test Decord with 1 dim input using accurate mode video_result = copy.deepcopy(self.video_results) video_result['frame_inds'] = np.arange(0, self.total_frames, 3) decord_init = DecordInit() @@ -303,7 +305,7 @@ def test_decord_decode(self): assert np.shape(decord_decode_result['imgs']) == (len( video_result['frame_inds']), 256, 340, 3) - # test Decord with 2 dim input and start_index = 0 + # test Decord with 2 dim input using efficient mode video_result = copy.deepcopy(self.video_results) video_result['frame_inds'] = np.arange(0, self.total_frames, 3)[:, np.newaxis] @@ -311,14 +313,14 @@ def test_decord_decode(self): decord_init_result = decord_init(video_result) video_result['video_reader'] = decord_init_result['video_reader'] - decord_decode = DecordDecode() + decord_decode = DecordDecode(mode='efficient') decord_decode_result = decord_decode(video_result) assert assert_dict_has_keys(decord_decode_result, target_keys) assert decord_decode_result['original_shape'] == (256, 340) assert np.shape(decord_decode_result['imgs']) == (len( video_result['frame_inds']), 256, 340, 3) - # test Decord with 1 dim input + # test Decord with 1 dim input using efficient mode video_result = copy.deepcopy(self.video_results) video_result['frame_inds'] = np.arange(1, self.total_frames, 3) decord_init = DecordInit() diff --git a/tests/datasets/transforms/test_pose_transforms.py b/tests/datasets/transforms/test_pose_transforms.py index d65d450124..913447f938 100644 --- a/tests/datasets/transforms/test_pose_transforms.py +++ b/tests/datasets/transforms/test_pose_transforms.py @@ -13,10 +13,11 @@ from mmaction.datasets.transforms import (GeneratePoseTarget, GenSkeFeat, JointToBone, LoadKineticsPose, - MergeSkeFeat, PadTo, PoseCompact, - PoseDecode, PreNormalize2D, - PreNormalize3D, ToMotion, - UniformSampleFrames) + MergeSkeFeat, MMCompact, MMDecode, + MMUniformSampleFrames, PadTo, + PoseCompact, PoseDecode, + PreNormalize2D, PreNormalize3D, + ToMotion, 
UniformSampleFrames) class TestPoseTransforms: @@ -126,23 +127,29 @@ def test_generate_pose_target(): modality='Pose') generate_pose_target = GeneratePoseTarget( - sigma=1, with_kp=True, left_kp=(0, ), right_kp=(1, ), skeletons=()) + sigma=1, + with_kp=True, + left_kp=(1, ), + right_kp=(2, ), + left_limb=(0, ), + right_limb=(1, ), + skeletons=()) assert str(generate_pose_target) == ('GeneratePoseTarget(sigma=1, ' 'use_score=True, with_kp=True, ' 'with_limb=False, skeletons=(), ' - 'double=False, left_kp=(0,), ' - 'right_kp=(1,))') - return_results = generate_pose_target(results) - assert return_results['imgs'].shape == (8, 64, 64, 3) + 'double=False, left_kp=(1,), ' + 'right_kp=(2,), left_limb=(0,), ' + 'right_limb=(1,), scaling=1.0)') + return_results = generate_pose_target(copy.deepcopy(results)) + assert return_results['imgs'].shape == (8, 3, 64, 64) assert_array_almost_equal(return_results['imgs'][0], return_results['imgs'][1]) results = dict(img_shape=img_shape, keypoint=kp, modality='Pose') - generate_pose_target = GeneratePoseTarget( - sigma=1, with_kp=True, left_kp=(0, ), right_kp=(1, ), skeletons=()) - return_results = generate_pose_target(results) - assert return_results['imgs'].shape == (8, 64, 64, 3) + generate_pose_target = GeneratePoseTarget(sigma=1, with_kp=True) + return_results = generate_pose_target(copy.deepcopy(results)) + assert return_results['imgs'].shape == (8, 3, 64, 64) assert_array_almost_equal(return_results['imgs'][0], return_results['imgs'][1]) @@ -150,37 +157,23 @@ def test_generate_pose_target(): sigma=1, with_kp=False, with_limb=True, - left_kp=(0, ), - right_kp=(1, ), skeletons=((0, 1), (1, 2), (0, 2))) - return_results = generate_pose_target(results) - assert return_results['imgs'].shape == (8, 64, 64, 3) + return_results = generate_pose_target(copy.deepcopy(results)) + assert return_results['imgs'].shape == (8, 3, 64, 64) assert_array_almost_equal(return_results['imgs'][0], return_results['imgs'][1]) generate_pose_target = GeneratePoseTarget( sigma=1, - with_kp=True, - with_limb=True, - left_kp=(0, ), - right_kp=(1, ), - skeletons=((0, 1), (1, 2), (0, 2))) - return_results = generate_pose_target(results) - assert return_results['imgs'].shape == (8, 64, 64, 6) - assert_array_almost_equal(return_results['imgs'][0], - return_results['imgs'][1]) - - generate_pose_target = GeneratePoseTarget( - sigma=1, - with_kp=True, + with_kp=False, with_limb=True, double=True, - left_kp=(0, ), - right_kp=(1, ), + left_limb=(0, ), + right_limb=(1, ), skeletons=((0, 1), (1, 2), (0, 2))) - return_results = generate_pose_target(results) + return_results = generate_pose_target(copy.deepcopy(results)) imgs = return_results['imgs'] - assert imgs.shape == (16, 64, 64, 6) + assert imgs.shape == (16, 3, 64, 64) assert_array_almost_equal(imgs[0], imgs[1]) assert_array_almost_equal(imgs[:8, 2], imgs[8:, 2, :, ::-1]) assert_array_almost_equal(imgs[:8, 0], imgs[8:, 1, :, ::-1]) @@ -197,8 +190,8 @@ def test_generate_pose_target(): keypoint_score=kpscore, modality='Pose') generate_pose_target = GeneratePoseTarget( - sigma=1, with_kp=True, left_kp=(0, ), right_kp=(1, ), skeletons=()) - return_results = generate_pose_target(results) + sigma=1, with_kp=True, skeletons=()) + return_results = generate_pose_target(copy.deepcopy(results)) assert_array_almost_equal(return_results['imgs'], 0) img_shape = (64, 64) @@ -215,10 +208,8 @@ def test_generate_pose_target(): sigma=1, with_kp=False, with_limb=True, - left_kp=(0, ), - right_kp=(1, ), skeletons=((0, 1), (1, 2), (0, 2))) - return_results = 
generate_pose_target(results) + return_results = generate_pose_target(copy.deepcopy(results)) assert_array_almost_equal(return_results['imgs'], 0) img_shape = (64, 64) @@ -231,13 +222,12 @@ def test_generate_pose_target(): keypoint=kp, keypoint_score=kpscore, modality='Pose') - generate_pose_target = GeneratePoseTarget( - sigma=1, with_kp=True, left_kp=(0, ), right_kp=(1, ), skeletons=()) - return_results = generate_pose_target(results) + generate_pose_target = GeneratePoseTarget(sigma=1, with_kp=True) + return_results = generate_pose_target(copy.deepcopy(results)) assert_array_almost_equal(return_results['imgs'], 0) img_shape = (64, 64) - kp = np.array([[[[124, 124], [140, 140], [124, 140]]]]) + kp = np.array([[[[124., 124.], [140., 140.], [124., 140.]]]]) kpscore = np.array([[[0., 0., 0.]]]) kp = np.concatenate([kp] * 8, axis=1) kpscore = np.concatenate([kpscore] * 8, axis=1) @@ -250,8 +240,6 @@ def test_generate_pose_target(): sigma=1, with_kp=False, with_limb=True, - left_kp=(0, ), - right_kp=(1, ), skeletons=((0, 1), (1, 2), (0, 2))) return_results = generate_pose_target(results) assert_array_almost_equal(return_results['imgs'], 0) @@ -587,3 +575,143 @@ def test_pose_decode(): decode_results = pose_decode(results) assert_array_almost_equal(decode_results['keypoint'], kp) assert_array_almost_equal(decode_results['keypoint_score'], kpscore) + + @staticmethod + def test_mm_uniform_sample_frames(): + results = dict(total_frames=64, modality='Pose') + sampling = MMUniformSampleFrames( + clip_len=dict(RGB=8, Pose=32), num_clips=1, test_mode=True, seed=0) + assert repr(sampling) == ('MMUniformSampleFrames(' + "clip_len={'RGB': 8, 'Pose': 32}, " + 'num_clips=1, test_mode=True, seed=0)') + + sampling_results = sampling(results) + assert sampling_results['clip_len'] == dict(RGB=8, Pose=32) + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert sampling_results['modality'] == ['RGB', 'Pose'] + assert_array_equal(sampling_results['RGB_inds'], + np.array([4, 15, 21, 24, 35, 43, 51, 63])) + assert_array_equal( + sampling_results['Pose_inds'], + np.array([ + 0, 3, 5, 6, 9, 11, 13, 15, 17, 19, 21, 22, 24, 27, 28, 30, 32, + 34, 36, 39, 40, 43, 45, 46, 48, 51, 53, 55, 57, 58, 61, 62 + ])) + + results = dict(total_frames=64, modality='Pose') + sampling = MMUniformSampleFrames( + clip_len=dict(RGB=8, Pose=32), + num_clips=10, + test_mode=True, + seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == dict(RGB=8, Pose=32) + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 10 + assert sampling_results['modality'] == ['RGB', 'Pose'] + assert len(sampling_results['RGB_inds']) == 80 + assert len(sampling_results['Pose_inds']) == 320 + + results = dict(total_frames=64, modality='Pose') + sampling = MMUniformSampleFrames( + clip_len=dict(RGB=8, Pose=32), num_clips=1, test_mode=False) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == dict(RGB=8, Pose=32) + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert len(sampling_results['RGB_inds']) == 8 + assert len(sampling_results['Pose_inds']) == 32 + + @staticmethod + def test_mm_decode(): + mm_decode = MMDecode() + + # Pose only test + pose_raw_results = dict( + modality=['Pose'], + Pose_inds=np.array([2, 4, 6, 8, 10]), + keypoint=np.random.random([1, 16, 17, 2]), + img_shape=(1080, 1920)) + rgb_raw_results = dict( + modality=['RGB'], + RGB_inds=np.array([2, 4, 
6, 8, 10]), + frame_dir=osp.join(osp.dirname(__file__), '../../data/test')) + + # test pose w/o `keypoint_score` + mm_decode(copy.deepcopy(pose_raw_results)) + + # test pose with `keypoint_score` + pose_raw_results['keypoint_score'] = np.random.random([1, 16, 17]) + pose_results = mm_decode(copy.deepcopy(pose_raw_results)) + + # test rgb + rgb_results = mm_decode(copy.deepcopy(rgb_raw_results)) + + # test pose and rgb + pose_rgb_raw_results = { + **rgb_raw_results, + **pose_raw_results, 'modality': ['RGB', 'Pose'] + } + pose_rgb_results = mm_decode(copy.deepcopy(pose_rgb_raw_results)) + + assert_array_equal(pose_rgb_results['keypoint_score'], + pose_results['keypoint_score']) + scaled_keypoint = copy.deepcopy(pose_results['keypoint']) + oh, ow = pose_results['img_shape'] + nh, nw = pose_rgb_results['img_shape'] + scaled_keypoint[..., 0] *= (nw / ow) + scaled_keypoint[..., 1] *= (nh / oh) + assert_array_equal(pose_rgb_results['keypoint'], scaled_keypoint) + assert_array_equal(pose_rgb_results['imgs'], rgb_results['imgs']) + assert assert_dict_has_keys( + pose_rgb_results, ['filename', 'img_shape', 'original_shape']) + assert repr(mm_decode) == 'MMDecode(io_backend=disk)' + + @staticmethod + def test_mm_compact(): + results = {} + results['img_shape'] = (100, 100) + fake_kp = np.zeros([1, 4, 2, 2]) + fake_kp[:, :, 0] = [10, 10] + fake_kp[:, :, 1] = [90, 90] + results['keypoint'] = fake_kp + results['imgs'] = list(np.zeros([3, 100, 100, 3])) + + pose_compact = MMCompact( + padding=0, threshold=0, hw_ratio=1, allow_imgpad=False) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (80, 80) + assert ret['imgs'][0].shape[:-1] == (80, 80) + assert str(pose_compact) == ( + 'MMCompact(padding=0, threshold=0, hw_ratio=(1, 1), ' + 'allow_imgpad=False)') + + pose_compact = MMCompact( + padding=0.3, threshold=0, hw_ratio=1, allow_imgpad=False) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (100, 100) + assert ret['imgs'][0].shape[:-1] == (100, 100) + + pose_compact = MMCompact( + padding=0.3, threshold=0, hw_ratio=1, allow_imgpad=True) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (104, 104) + assert ret['imgs'][0].shape[:-1] == (104, 104) + + pose_compact = MMCompact( + padding=0, threshold=100, hw_ratio=1, allow_imgpad=False) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (100, 100) + assert ret['imgs'][0].shape[:-1] == (100, 100) + + pose_compact = MMCompact( + padding=0, threshold=0, hw_ratio=0.75, allow_imgpad=True) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (80, 106) + assert ret['imgs'][0].shape[:-1] == (80, 106) diff --git a/tests/models/backbones/test_resnet3d_slowfast.py b/tests/models/backbones/test_resnet3d_slowfast.py index a3de73a620..d91e183583 100644 --- a/tests/models/backbones/test_resnet3d_slowfast.py +++ b/tests/models/backbones/test_resnet3d_slowfast.py @@ -11,18 +11,13 @@ def test_slowfast_backbone(): """Test SlowFast backbone.""" with pytest.raises(TypeError): # cfg should be a dict - ResNet3dSlowFast(None, slow_pathway=list(['foo', 'bar'])) - with pytest.raises(TypeError): - # pretrained should be a str - sf_50 = ResNet3dSlowFast(dict(foo='bar')) - sf_50.init_weights() + ResNet3dSlowFast(slow_pathway=list(['foo', 'bar'])) with pytest.raises(KeyError): # pathway type should be implemented - ResNet3dSlowFast(None, slow_pathway=dict(type='resnext')) + 
ResNet3dSlowFast(slow_pathway=dict(type='resnext')) # test slowfast with slow inflated sf_50_inflate = ResNet3dSlowFast( - None, slow_pathway=dict( type='resnet3d', depth=50, @@ -56,14 +51,7 @@ def test_slowfast_backbone(): # slowfast w/o lateral connection inference test input_shape = (1, 3, 8, 64, 64) imgs = generate_backbone_demo_inputs(input_shape) - # parrots 3dconv is only implemented on gpu - if torch.__version__ == 'parrots': - if torch.cuda.is_available(): - sf_50_wo_lateral = sf_50_wo_lateral.cuda() - imgs_gpu = imgs.cuda() - feat = sf_50_wo_lateral(imgs_gpu) - else: - feat = sf_50_wo_lateral(imgs) + feat = sf_50_wo_lateral(imgs) assert isinstance(feat, tuple) assert feat[0].shape == torch.Size([1, 2048, 1, 2, 2]) @@ -104,21 +92,14 @@ def test_slowfast_backbone(): assert param.requires_grad is True # test slowfast with normal config - sf_50 = ResNet3dSlowFast(None) + sf_50 = ResNet3dSlowFast() sf_50.init_weights() sf_50.train() # slowfast inference test input_shape = (1, 3, 8, 64, 64) imgs = generate_backbone_demo_inputs(input_shape) - # parrots 3dconv is only implemented on gpu - if torch.__version__ == 'parrots': - if torch.cuda.is_available(): - sf_50 = sf_50.cuda() - imgs_gpu = imgs.cuda() - feat = sf_50(imgs_gpu) - else: - feat = sf_50(imgs) + feat = sf_50(imgs) assert isinstance(feat, tuple) assert feat[0].shape == torch.Size([1, 2048, 1, 2, 2]) diff --git a/tests/models/backbones/test_resnet3d_slowonly.py b/tests/models/backbones/test_resnet3d_slowonly.py index 9603469c37..47c7036451 100644 --- a/tests/models/backbones/test_resnet3d_slowonly.py +++ b/tests/models/backbones/test_resnet3d_slowonly.py @@ -10,7 +10,7 @@ def test_slowonly_backbone(): """Test SlowOnly backbone.""" with pytest.raises(AssertionError): # SlowOnly should contain no lateral connection - ResNet3dSlowOnly(50, None, lateral=True) + ResNet3dSlowOnly(depth=50, pretrained=None, lateral=True) # test SlowOnly for PoseC3D so_50 = ResNet3dSlowOnly( @@ -31,7 +31,7 @@ def test_slowonly_backbone(): so_50.train() # test SlowOnly with normal config - so_50 = ResNet3dSlowOnly(50, None) + so_50 = ResNet3dSlowOnly(depth=50, pretrained=None) so_50.init_weights() so_50.train() diff --git a/tests/models/backbones/test_rgbposeconv3d.py b/tests/models/backbones/test_rgbposeconv3d.py new file mode 100644 index 0000000000..848a73ab45 --- /dev/null +++ b/tests/models/backbones/test_rgbposeconv3d.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch + +from mmaction.models import RGBPoseConv3D +from mmaction.testing import generate_backbone_demo_inputs + + +def test_rgbposeconv3d(): + """Test RGBPoseConv3D backbone.""" + + with pytest.raises(AssertionError): + RGBPoseConv3D(pose_drop_path=1.1, rgb_drop_path=1.1) + + rgbposec3d = RGBPoseConv3D() + rgbposec3d.init_weights() + rgbposec3d.train() + + imgs_shape = (1, 3, 8, 224, 224) + heatmap_imgs_shape = (1, 17, 32, 56, 56) + imgs = generate_backbone_demo_inputs(imgs_shape) + heatmap_imgs = generate_backbone_demo_inputs(heatmap_imgs_shape) + + (x_rgb, x_pose) = rgbposec3d(imgs, heatmap_imgs) + + assert x_rgb.shape == torch.Size([1, 2048, 8, 7, 7]) + assert x_pose.shape == torch.Size([1, 512, 32, 7, 7]) diff --git a/tests/models/data_preprocessors/__init__.py b/tests/models/data_preprocessors/__init__.py new file mode 100644 index 0000000000..ef101fec61 --- /dev/null +++ b/tests/models/data_preprocessors/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
diff --git a/tests/models/data_preprocessors/test_data_preprocessor.py b/tests/models/data_preprocessors/test_data_preprocessor.py new file mode 100644 index 0000000000..a4a3d851d7 --- /dev/null +++ b/tests/models/data_preprocessors/test_data_preprocessor.py @@ -0,0 +1,97 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from copy import deepcopy + +import pytest +import torch +from numpy.testing import assert_array_equal + +from mmaction.models import ActionDataPreprocessor +from mmaction.structures import ActionDataSample +from mmaction.utils import register_all_modules + + +def generate_dummy_data(batch_size, input_shape): + data = { + 'inputs': + [torch.randint(0, 255, input_shape) for _ in range(batch_size)], + 'data_samples': + [ActionDataSample().set_gt_labels(2) for _ in range(batch_size)] + } + return data + + +def test_data_preprocessor(): + with pytest.raises(ValueError): + ActionDataPreprocessor( + mean=[1, 1], std=[0, 0], format_shape='NCTHW_Heatmap') + with pytest.raises(ValueError): + psr = ActionDataPreprocessor(format_shape='NCTHW_Heatmap', to_rgb=True) + psr(generate_dummy_data(1, (3, 224, 224))) + + raw_data = generate_dummy_data(2, (1, 3, 8, 224, 224)) + psr = ActionDataPreprocessor( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW') + data = psr(deepcopy(raw_data)) + assert data['inputs'].shape == (2, 1, 3, 8, 224, 224) + assert_array_equal(data['inputs'][0], + (raw_data['inputs'][0] - psr.mean) / psr.std) + assert_array_equal(data['inputs'][1], + (raw_data['inputs'][1] - psr.mean) / psr.std) + + psr = ActionDataPreprocessor(format_shape='NCTHW', to_rgb=True) + data = psr(deepcopy(raw_data)) + assert data['inputs'].shape == (2, 1, 3, 8, 224, 224) + assert_array_equal(data['inputs'][0], raw_data['inputs'][0][:, [2, 1, 0]]) + assert_array_equal(data['inputs'][1], raw_data['inputs'][1][:, [2, 1, 0]]) + + register_all_modules() + psr = ActionDataPreprocessor( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW', + blending=dict(type='MixupBlending', num_classes=5)) + data = psr(deepcopy(raw_data), training=True) + assert data['data_samples'][0].gt_labels.item.shape == (5, ) + assert data['data_samples'][1].gt_labels.item.shape == (5, ) + + raw_data = generate_dummy_data(2, (1, 3, 224, 224)) + psr = ActionDataPreprocessor( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW', + to_rgb=True) + data = psr(deepcopy(raw_data)) + assert_array_equal(data['inputs'][0], + (raw_data['inputs'][0][:, [2, 1, 0]] - psr.mean) / + psr.std) + assert_array_equal(data['inputs'][1], + (raw_data['inputs'][1][:, [2, 1, 0]] - psr.mean) / + psr.std) + + psr = ActionDataPreprocessor() + data = psr(deepcopy(raw_data)) + assert data['inputs'].shape == (2, 1, 3, 224, 224) + assert_array_equal(data['inputs'][0], raw_data['inputs'][0]) + assert_array_equal(data['inputs'][1], raw_data['inputs'][1]) + + raw_2d_data = generate_dummy_data(2, (3, 224, 224)) + raw_3d_data = generate_dummy_data(2, (1, 3, 8, 224, 224)) + raw_data = (raw_2d_data, raw_3d_data) + + psr = ActionDataPreprocessor( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='MIX2d3d') + data = psr(raw_data) + assert_array_equal(data[0]['inputs'][0], + (raw_2d_data['inputs'][0] - psr.mean.view(-1, 1, 1)) / + psr.std.view(-1, 1, 1)) + assert_array_equal(data[0]['inputs'][1], + (raw_2d_data['inputs'][1] - psr.mean.view(-1, 1, 1)) / + psr.std.view(-1, 1, 1)) + assert_array_equal(data[1]['inputs'][0], + 
(raw_3d_data['inputs'][0] - psr.mean) / psr.std) + assert_array_equal(data[1]['inputs'][1], + (raw_3d_data['inputs'][1] - psr.mean) / psr.std) diff --git a/tests/models/data_preprocessors/test_multimodal_data_preprocessor.py b/tests/models/data_preprocessors/test_multimodal_data_preprocessor.py new file mode 100644 index 0000000000..35483bd5d9 --- /dev/null +++ b/tests/models/data_preprocessors/test_multimodal_data_preprocessor.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import pytest +import torch +from numpy.testing import assert_array_equal + +from mmaction.models import MultiModalDataPreprocessor +from mmaction.structures import ActionDataSample +from mmaction.utils import register_all_modules + + +def generate_dummy_data(batch_size, input_keys, input_shapes): + data = dict() + data['data_samples'] = [ + ActionDataSample().set_gt_labels(2) for _ in range(batch_size) + ] + data['inputs'] = dict() + for key, shape in zip(input_keys, input_shapes): + data['inputs'][key] = [ + torch.randint(0, 255, shape) for _ in range(batch_size) + ] + + return data + + +def test_multimodal_data_preprocessor(): + with pytest.raises(AssertionError): + MultiModalDataPreprocessor( + preprocessors=dict(imgs=dict(format_shape='NCTHW'))) + + register_all_modules() + data_keys = ('imgs', 'heatmap_imgs') + data_shapes = ((1, 3, 8, 224, 224), (1, 17, 32, 64, 64)) + raw_data = generate_dummy_data(2, data_keys, data_shapes) + + psr = MultiModalDataPreprocessor( + preprocessors=dict( + imgs=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + heatmap_imgs=dict(type='ActionDataPreprocessor'))) + + data = psr(copy.deepcopy(raw_data)) + assert data['inputs']['imgs'].shape == (2, 1, 3, 8, 224, 224) + assert data['inputs']['heatmap_imgs'].shape == (2, 1, 17, 32, 64, 64) + psr_imgs = psr.preprocessors['imgs'] + assert_array_equal(data['inputs']['imgs'][0], + (raw_data['inputs']['imgs'][0] - psr_imgs.mean) / + psr_imgs.std) + assert_array_equal(data['inputs']['imgs'][1], + (raw_data['inputs']['imgs'][1] - psr_imgs.mean) / + psr_imgs.std) + assert_array_equal(data['inputs']['heatmap_imgs'][0], + raw_data['inputs']['heatmap_imgs'][0]) + assert_array_equal(data['inputs']['heatmap_imgs'][1], + raw_data['inputs']['heatmap_imgs'][1]) + + data_keys = ('imgs_2D', 'imgs_3D') + data_shapes = ((1, 3, 224, 224), (1, 3, 8, 224, 224)) + raw_data = generate_dummy_data(2, data_keys, data_shapes) + + psr = MultiModalDataPreprocessor( + preprocessors=dict( + imgs_2D=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW'), + imgs_3D=dict( + type='ActionDataPreprocessor', + mean=[127.5, 127.5, 127.5], + std=[57.5, 57.5, 57.5], + format_shape='NCTHW'))) + + data = psr(copy.deepcopy(raw_data)) + assert data['inputs']['imgs_2D'].shape == (2, 1, 3, 224, 224) + assert data['inputs']['imgs_3D'].shape == (2, 1, 3, 8, 224, 224) + psr_imgs2d = psr.preprocessors['imgs_2D'] + psr_imgs3d = psr.preprocessors['imgs_3D'] + assert_array_equal(data['inputs']['imgs_2D'][0], + (raw_data['inputs']['imgs_2D'][0] - psr_imgs2d.mean) / + psr_imgs2d.std) + assert_array_equal(data['inputs']['imgs_2D'][1], + (raw_data['inputs']['imgs_2D'][1] - psr_imgs2d.mean) / + psr_imgs2d.std) + assert_array_equal(data['inputs']['imgs_3D'][0], + (raw_data['inputs']['imgs_3D'][0] - psr_imgs3d.mean) / + psr_imgs3d.std) + assert_array_equal(data['inputs']['imgs_3D'][1], + 
(raw_data['inputs']['imgs_3D'][1] - psr_imgs3d.mean) / + psr_imgs3d.std) diff --git a/tests/models/heads/test_rgbpose_head.py b/tests/models/heads/test_rgbpose_head.py new file mode 100644 index 0000000000..919e02a4bd --- /dev/null +++ b/tests/models/heads/test_rgbpose_head.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from mmaction.models import RGBPoseHead + + +def test_rgbpose_head(): + """Test RGBPoseHead.""" + rgbpose_head = RGBPoseHead( + num_classes=4, + in_channels=[2048, 512], + dropout=dict(rgb=0.51, pose=0.49)) + rgbpose_head.init_weights() + + assert rgbpose_head.num_classes == 4 + assert rgbpose_head.dropout == dict(rgb=0.51, pose=0.49) + assert rgbpose_head.in_channels == [2048, 512] + assert rgbpose_head.init_std == 0.01 + + assert isinstance(rgbpose_head.dropout_rgb, nn.Dropout) + assert isinstance(rgbpose_head.dropout_pose, nn.Dropout) + assert rgbpose_head.dropout_rgb.p == rgbpose_head.dropout['rgb'] + assert rgbpose_head.dropout_pose.p == rgbpose_head.dropout['pose'] + + assert isinstance(rgbpose_head.fc_rgb, nn.Linear) + assert isinstance(rgbpose_head.fc_pose, nn.Linear) + assert rgbpose_head.fc_rgb.in_features == rgbpose_head.in_channels[0] + assert rgbpose_head.fc_rgb.out_features == rgbpose_head.num_classes + assert rgbpose_head.fc_pose.in_features == rgbpose_head.in_channels[1] + assert rgbpose_head.fc_pose.out_features == rgbpose_head.num_classes + + assert isinstance(rgbpose_head.avg_pool, nn.AdaptiveAvgPool3d) + assert rgbpose_head.avg_pool.output_size == (1, 1, 1) + + feat_rgb = torch.rand((2, 2048, 8, 7, 7)) + feat_pose = torch.rand((2, 512, 32, 7, 7)) + + cls_scores = rgbpose_head((feat_rgb, feat_pose)) + assert cls_scores['rgb'].shape == torch.Size([2, 4]) + assert cls_scores['pose'].shape == torch.Size([2, 4]) diff --git a/tools/data/skeleton/compress_nturgbd.py b/tools/data/skeleton/compress_nturgbd.py new file mode 100644 index 0000000000..b8639257c9 --- /dev/null +++ b/tools/data/skeleton/compress_nturgbd.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import multiprocessing as mp +import os +import os.path as osp +import subprocess + + +def get_shape(vid): + cmd = 'ffprobe -v error -select_streams v:0 -show_entries ' \ + 'stream=width,height -of csv=s=x:p=0 \"{}\"'.format(vid) + w, h = subprocess.check_output(cmd, shell=True).decode('utf-8').split('x') + return int(w), int(h) + + +def compress(src, dest, shape=None, target_size=540, fps=-1): + if shape is None: + shape = get_shape(src) + w, h = shape + scale_str = f'-vf scale=-2:{target_size}' if w >= h else \ + f'-vf scale={target_size}:-2' + fps_str = f'-r {fps}' if fps > 0 else '' + quality_str = '-q:v 1' + vcodec_str = '-c:v libx264' + cmd = f'ffmpeg -y -loglevel error -i {src} -threads 1 ' \ + f'{quality_str} {scale_str} {fps_str} {vcodec_str} {dest}' + os.system(cmd) + + +def compress_nturgbd(name): + src = name + dest = src.replace('nturgbd_raw', + 'nturgbd_videos').replace('_rgb.avi', '.mp4') + shape = (1920, 1080) + compress(src, dest, shape) + + +src_dir = 'data/nturgbd_raw' +tgt_dir = 'data/nturgbd_videos' +os.makedirs(tgt_dir, exist_ok=True) +files = [osp.join(src_dir, x) for x in os.listdir(src_dir) if '.avi' in x] +pool = mp.Pool(32) +pool.map(compress_nturgbd, files) From acb79e41c1d9288806a359ded943c949224465c3 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Wed, 1 Mar 2023 20:30:01 +0800 Subject: [PATCH 09/36] [fix] specify map_location to cpu when use _load_checkpoint (#2254) --- mmaction/models/backbones/resnet.py | 3 ++- mmaction/models/backbones/resnet3d.py | 2 +- mmaction/models/backbones/resnet3d_slowfast.py | 2 +- mmaction/models/backbones/timesformer.py | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/mmaction/models/backbones/resnet.py b/mmaction/models/backbones/resnet.py index 0ebf6d61b0..c599bcc311 100644 --- a/mmaction/models/backbones/resnet.py +++ b/mmaction/models/backbones/resnet.py @@ -497,7 +497,8 @@ def _load_bn_params(bn: nn.Module, state_dict_tv: OrderedDict, def _load_torchvision_checkpoint(self, logger: mmengine.MMLogger = None) -> None: """Initiate the parameters from torchvision pretrained checkpoint.""" - state_dict_torchvision = _load_checkpoint(self.pretrained) + state_dict_torchvision = _load_checkpoint( + self.pretrained, map_location='cpu') if 'state_dict' in state_dict_torchvision: state_dict_torchvision = state_dict_torchvision['state_dict'] diff --git a/mmaction/models/backbones/resnet3d.py b/mmaction/models/backbones/resnet3d.py index cbaa4e18ca..63b32fc8cd 100644 --- a/mmaction/models/backbones/resnet3d.py +++ b/mmaction/models/backbones/resnet3d.py @@ -723,7 +723,7 @@ def _inflate_weights(self, logger: MMLogger) -> None: debugging information. """ - state_dict_r2d = _load_checkpoint(self.pretrained) + state_dict_r2d = _load_checkpoint(self.pretrained, map_location='cpu') if 'state_dict' in state_dict_r2d: state_dict_r2d = state_dict_r2d['state_dict'] diff --git a/mmaction/models/backbones/resnet3d_slowfast.py b/mmaction/models/backbones/resnet3d_slowfast.py index c4ca8b8032..3083239ff9 100644 --- a/mmaction/models/backbones/resnet3d_slowfast.py +++ b/mmaction/models/backbones/resnet3d_slowfast.py @@ -214,7 +214,7 @@ def inflate_weights(self, logger: MMLogger) -> None: debugging information. 
""" - state_dict_r2d = _load_checkpoint(self.pretrained) + state_dict_r2d = _load_checkpoint(self.pretrained, map_location='cpu') if 'state_dict' in state_dict_r2d: state_dict_r2d = state_dict_r2d['state_dict'] diff --git a/mmaction/models/backbones/timesformer.py b/mmaction/models/backbones/timesformer.py index 618b381295..af636b5198 100644 --- a/mmaction/models/backbones/timesformer.py +++ b/mmaction/models/backbones/timesformer.py @@ -235,7 +235,7 @@ def init_weights(self, pretrained=None): logger = MMLogger.get_current_instance() logger.info(f'load model from: {self.pretrained}') - state_dict = _load_checkpoint(self.pretrained) + state_dict = _load_checkpoint(self.pretrained, map_location='cpu') if 'state_dict' in state_dict: state_dict = state_dict['state_dict'] From 4e9b7ec3d2c241add5446d394b78e398ef407a76 Mon Sep 17 00:00:00 2001 From: Kai Hu Date: Mon, 6 Mar 2023 05:15:07 -0500 Subject: [PATCH 10/36] [Fix] fix command bugs in localization tasks' README (#2244) --- configs/localization/bmn/README.md | 4 ++-- configs/localization/bsn/README.md | 2 +- .../bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py | 2 ++ 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/configs/localization/bmn/README.md b/configs/localization/bmn/README.md index 834df03ad5..2f49330743 100644 --- a/configs/localization/bmn/README.md +++ b/configs/localization/bmn/README.md @@ -39,7 +39,7 @@ For more details on data preparation, you can refer to [ActivityNet Data Prepara Train BMN model on ActivityNet features dataset. ```shell -bash tools/dist_train.sh configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py 2 +bash tools/dist_train.sh configs/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.py 2 ``` For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). @@ -49,7 +49,7 @@ For more details, you can refer to the **Training** part in the [Training and Te Test BMN on ActivityNet feature dataset. ```shell -python3 tools/test.py configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py CHECKPOINT.PTH +python3 tools/test.py configs/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.py CHECKPOINT.PTH ``` For more details, you can refer to the **Testing** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). 
diff --git a/configs/localization/bsn/README.md b/configs/localization/bsn/README.md index 62c46f6782..efd2d2c0d0 100644 --- a/configs/localization/bsn/README.md +++ b/configs/localization/bsn/README.md @@ -42,7 +42,7 @@ python3 tools/train.py configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activi After training use the TEM module to generate the probabilities sequence (actionness, starting, and ending) for the training and validation dataset: ```shell -python tools/test.py configs/localization/bsn/bsn_tem_400x100_1xb16_20e_activitynet_feature.py \ +python tools/test.py configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py \ work_dirs/bsn_400x100_20e_1xb16_activitynet_feature/tem_epoch_20.pth ``` diff --git a/configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py b/configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py index 28595bb786..285306f976 100644 --- a/configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py +++ b/configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py @@ -89,3 +89,5 @@ metric_type='TEM', dump_config=dict(out=tem_results_dir, output_format='csv')) val_evaluator = test_evaluator + +default_hooks = dict(checkpoint=dict(filename_tmpl='tem_epoch_{}.pth')) From edd7dee3bcb4f604ffd0a38ac98f8c09176dde75 Mon Sep 17 00:00:00 2001 From: wxDai Date: Thu, 9 Mar 2023 02:50:25 +0800 Subject: [PATCH 11/36] [Project] Add Example project (#2265) --- .gitignore | 2 + docs/en/notes/contribution_guide.md | 7 +- projects/README.md | 17 +++ projects/example_project/README.md | 122 ++++++++++++++++++ ...1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py | 11 ++ projects/example_project/models/__init__.py | 3 + .../example_project/models/example_net.py | 21 +++ 7 files changed, 180 insertions(+), 3 deletions(-) create mode 100644 projects/README.md create mode 100644 projects/example_project/README.md create mode 100644 projects/example_project/configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py create mode 100644 projects/example_project/models/__init__.py create mode 100644 projects/example_project/models/example_net.py diff --git a/.gitignore b/.gitignore index b2c1be8fa6..3e40ace4d5 100644 --- a/.gitignore +++ b/.gitignore @@ -113,6 +113,8 @@ venv.bak/ *.log.json benchlist.txt work_dirs/ +/projects/*/work_dirs +/projects/*/data .DS_Store # Pytorch diff --git a/docs/en/notes/contribution_guide.md b/docs/en/notes/contribution_guide.md index 92548868d2..f9d96c75a5 100644 --- a/docs/en/notes/contribution_guide.md +++ b/docs/en/notes/contribution_guide.md @@ -33,10 +33,11 @@ We use the following tools for linting and formatting: - [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files. - [docformatter](https://github.com/myint/docformatter): A formatter to format docstring. -Style configurations of yapf and isort can be found in [setup.cfg](../../../setup.cfg). +Style configurations of yapf and isort can be found in [setup.cfg](https://github.com/open-mmlab/mmaction2/blob/1.x/setup.cfg). -We use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`, fixes `end-of-files`, sorts `requirments.txt` automatically on every commit. -The config for a pre-commit hook is stored in [.pre-commit-config](../../../.pre-commit-config.yaml). 
+We use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`, +fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, sorts `requirments.txt` automatically on every commit. +The config for a pre-commit hook is stored in [.pre-commit-config](https://github.com/open-mmlab/mmaction2/blob/1.x/.pre-commit-config.yaml). After you clone the repository, you will need to install initialize pre-commit hook. diff --git a/projects/README.md b/projects/README.md new file mode 100644 index 0000000000..7e12abee97 --- /dev/null +++ b/projects/README.md @@ -0,0 +1,17 @@ +# Welcome to Projects of MMAction2 + +In this folder, we welcome all contribution of deep-learning video understanding models from community. + +Here, these requirements, e.g., code standards, are not that strict as in the core package. Thus, developers from the community can implement their algorithms much more easily and efficiently in MMAction2. We appreciate all contributions from community to make MMAction2 greater. + +Here is an [example project](./example_project) about how to add your algorithms easily. + +We also provide some documentation listed below: + +- [Contribution Guide](https://mmaction2.readthedocs.io/en/dev-1.x/notes/contribution_guide.html) + + The guides for new contributors about how to add your projects to MMAction2. + +- [Discussions](https://github.com/open-mmlab/mmaction2/discussions) + + Welcome to start discussion! diff --git a/projects/example_project/README.md b/projects/example_project/README.md new file mode 100644 index 0000000000..ef74fe9cbe --- /dev/null +++ b/projects/example_project/README.md @@ -0,0 +1,122 @@ +# Example Project + +This is an example README for community `projects/`. You can write your README in your own project. Here are +some recommended parts of a README for others to understand and use your project, you can copy or modify them +according to your project. + +## Usage + +### Setup Environment + +Please refer to [Get Started](https://mmaction2.readthedocs.io/en/1.x/get_started.html) to install MMAction2. + +At first, add the current folder to `PYTHONPATH`, so that Python can find your code. Run command in the current directory to add it. + +> Please run it every time after you opened a new shell. + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Data Preparation + +Prepare the Kinetics400 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/1.x/tools/data/kinetics/README.md). 
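Before launching training, a quick registry check can confirm that the custom module is importable from the project root. This is a hypothetical sanity check, assuming the `models` package and the `ExampleNet` registration added later in this patch:

```shell
# With the current folder on PYTHONPATH (see Setup Environment above), the
# custom backbone should resolve by name in the MODELS registry.
python -c "import models; from mmaction.registry import MODELS; print(MODELS.get('ExampleNet'))"
```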
+ +### Training commands + +**To train with single GPU:** + +```bash +mim train mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py +``` + +**To train with multiple GPUs:** + +```bash +mim train mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --launcher pytorch --gpus 8 +``` + +**To train with multiple GPUs by slurm:** + +```bash +mim train mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +### Testing commands + +**To test with single GPU:** + +```bash +mim test mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT +``` + +**To test with multiple GPUs:** + +```bash +mim test mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8 +``` + +**To test with multiple GPUs by slurm:** + +```bash +mim test mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +## Results + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :----------------: | :-----------------------------------------------------------------------------: | ----------------: | --------------: | +| 1x1x3 | 224x224 | 8 | ResNet50 | ImageNet | 72.83 | 90.65 | 25 clips x 10 crop | [config](./configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](CKPT-LINK) | [log](LOG-LINK) | + +## Citation + + + +```bibtex +@misc{2020mmaction2, + title={OpenMMLab's Next Generation Video Understanding Toolbox and Benchmark}, + author={MMAction2 Contributors}, + howpublished = {\url{https://github.com/open-mmlab/mmaction2}}, + year={2020} +} +``` + +## Checklist + +Here is a checklist of this project's progress, and you can ignore this part if you don't plan to contribute to MMAction2 projects. + +- [ ] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [ ] Finish the code + + + + - [ ] Basic docstrings & proper citation + + + + - [ ] Converted checkpoint and results (Only for reproduction) + + + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training results + + + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Unit tests + + + + - [ ] Code style + + + + - [ ] `metafile.yml` and `README.md` + + diff --git a/projects/example_project/configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py b/projects/example_project/configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py new file mode 100644 index 0000000000..32ea002651 --- /dev/null +++ b/projects/example_project/configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py @@ -0,0 +1,11 @@ +# Directly inherit the entire recipe you want to use. +_base_ = 'mmaction::recognition/tsn/' \ + 'tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py' + +# This line is to import your own modules. +custom_imports = dict(imports='models') + +# Modify the backbone to use your own backbone. +_base_['model']['backbone'] = dict(type='ExampleNet', depth=50) +# Modify the in_channels of classifier head to fit your backbone. 
+_base_['model']['cls_head']['in_channels'] = 2048 diff --git a/projects/example_project/models/__init__.py b/projects/example_project/models/__init__.py new file mode 100644 index 0000000000..e2d4f2f571 --- /dev/null +++ b/projects/example_project/models/__init__.py @@ -0,0 +1,3 @@ +from .example_net import ExampleNet + +__all__ = ['ExampleNet'] diff --git a/projects/example_project/models/example_net.py b/projects/example_project/models/example_net.py new file mode 100644 index 0000000000..6a3b8bbb06 --- /dev/null +++ b/projects/example_project/models/example_net.py @@ -0,0 +1,21 @@ +from mmaction.models import ResNet +from mmaction.registry import MODELS + + +# Register your model to the `MODELS`. +@MODELS.register_module() +class ExampleNet(ResNet): + """Implements an example backbone. + + Implement the backbone network just like a normal pytorch network. + """ + + def __init__(self, **kwargs) -> None: + print('#############################\n' + '# Hello MMAction2! #\n' + '#############################') + super().__init__(**kwargs) + + def forward(self, x): + """Defines the computation performed at every call.""" + return super().forward(x) From e38a41200e7700c3e57a39f7d1fc00149aff36da Mon Sep 17 00:00:00 2001 From: wxDai Date: Tue, 14 Mar 2023 03:16:36 +0800 Subject: [PATCH 12/36] [Project] Add MSG3D project (#2291) --- projects/msg3d/README.md | 143 ++++++++ ...6-joint-u100-80e_ntu60-xsub-keypoint-2d.py | 104 ++++++ ...6-joint-u100-80e_ntu60-xsub-keypoint-3d.py | 104 ++++++ projects/msg3d/models/__init__.py | 3 + projects/msg3d/models/msg3d.py | 75 ++++ projects/msg3d/models/msg3d_utils.py | 342 ++++++++++++++++++ 6 files changed, 771 insertions(+) create mode 100644 projects/msg3d/README.md create mode 100644 projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py create mode 100644 projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py create mode 100644 projects/msg3d/models/__init__.py create mode 100644 projects/msg3d/models/msg3d.py create mode 100644 projects/msg3d/models/msg3d_utils.py diff --git a/projects/msg3d/README.md b/projects/msg3d/README.md new file mode 100644 index 0000000000..7c784f90aa --- /dev/null +++ b/projects/msg3d/README.md @@ -0,0 +1,143 @@ +# MSG3D Project + +[Disentangling and Unifying Graph Convolutions for Skeleton-Based Action Recognition](https://arxiv.org/abs/2003.14111) + + + +## Abstract + + + +Spatial-temporal graphs have been widely used by skeleton-based action recognition algorithms to model human action dynamics. To capture robust movement patterns from these graphs, long-range and multi-scale context aggregation and spatial-temporal dependency modeling are critical aspects of a powerful feature extractor. However, existing methods have limitations in achieving (1) unbiased long-range joint relationship modeling under multi-scale operators and (2) unobstructed cross-spacetime information flow for capturing complex spatial-temporal dependencies. In this work, we present (1) a simple method to disentangle multi-scale graph convolutions and (2) a unified spatial-temporal graph convolutional operator named G3D. The proposed multi-scale aggregation scheme disentangles the importance of nodes in different neighborhoods for effective long-range modeling. The proposed G3D module leverages dense cross-spacetime edges as skip connections for direct information propagation across the spatial-temporal graph. 
By coupling these proposals, we develop a powerful feature extractor named MS-G3D based on which our model outperforms previous state-of-the-art methods on three large-scale datasets: NTU RGB+D 60, NTU RGB+D 120, and Kinetics Skeleton 400.

[figure: MS-G3D model overview (image omitted)]
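The disentangled multi-scale aggregation described in the abstract builds one adjacency matrix per scale, keeping only joints at shortest-path distance exactly k before degree normalization, so distant neighborhoods are not dominated by closer ones. The code later in this patch does this with the `k_adjacency` and `normalize_digraph` helpers from `mmaction.models.utils.graph`; the following is only a minimal NumPy sketch of the idea, not the shipped implementation:

```python
import numpy as np


def disentangled_scales(A, num_scales):
    """Build one adjacency per scale, keeping joints at distance exactly k."""
    n = A.shape[0]
    eye = np.eye(n, dtype=A.dtype)
    scales = []
    for k in range(num_scales):
        if k == 0:
            A_k = eye.copy()
        else:
            reach_k = np.minimum(np.linalg.matrix_power(A + eye, k), 1)
            reach_km1 = np.minimum(np.linalg.matrix_power(A + eye, k - 1), 1)
            A_k = reach_k - reach_km1 + eye   # exactly-k-hop neighbours + self
        deg = A_k.sum(axis=0, keepdims=True)
        deg[deg == 0] = 1
        scales.append(A_k / deg)              # simple degree normalization
    return np.stack(scales)                   # (num_scales, V, V)


# Toy 4-joint chain 0-1-2-3
A = np.zeros((4, 4), dtype=np.float32)
for i, j in [(0, 1), (1, 2), (2, 3)]:
    A[i, j] = A[j, i] = 1
print(disentangled_scales(A, num_scales=3).shape)  # (3, 4, 4)
```

Stacking these per-scale adjacencies is what lets a single graph convolution aggregate several neighborhood ranges without the bias introduced by repeated powers of one dense adjacency.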
+ +## Usage + +### Setup Environment + +Please refer to [Get Started](https://mmaction2.readthedocs.io/en/1.x/get_started.html) to install MMAction2. + +At first, add the current folder to `PYTHONPATH`, so that Python can find your code. Run command in the current directory to add it. + +> Please run it every time after you opened a new shell. + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Data Preparation + +Prepare the NTU60 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/1.x/tools/data/skeleton/README.md). + +### Training commands + +**To train with single GPU:** + +```bash +mim train mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py +``` + +**To train with multiple GPUs:** + +```bash +mim train mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher pytorch --gpus 8 +``` + +**To train with multiple GPUs by slurm:** + +```bash +mim train mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +### Testing commands + +**To test with single GPU:** + +```bash +mim test mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT +``` + +**To test with multiple GPUs:** + +```bash +mim test mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8 +``` + +**To test with multiple GPUs by slurm:** + +```bash +mim test mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +## Results + +### NTU60_XSub_2D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | +| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: | +| uniform 100 | joint | 8 | MSG3D | 92.3 | 10 clips | [config](./configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20230309-73b97296.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.log) | + +### NTU60_XSub_3D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | +| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: | +| uniform 100 | joint | 8 | MSG3D | 89.6 | 10 clips | [config](./configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d_20230308-c325d222.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.log) | + +## Citation + + + +```bibtex +@inproceedings{liu2020disentangling, + title={Disentangling and unifying graph convolutions for 
skeleton-based action recognition}, + author={Liu, Ziyu and Zhang, Hongwen and Chen, Zhenghao and Wang, Zhiyong and Ouyang, Wanli}, + booktitle={CVPR}, + pages={143--152}, + year={2020} +} +``` + +## Checklist + +Here is a checklist of this project's progress, and you can ignore this part if you don't plan to contribute to MMAction2 projects. + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + + + + - [x] Basic docstrings & proper citation + + + + - [x] Converted checkpoint and results (Only for reproduction) + + + +- [x] Milestone 2: Indicates a successful model implementation. + + - [x] Training results + + + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Unit tests + + + + - [ ] Code style + + + + - [ ] `metafile.yml` and `README.md` + + diff --git a/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 0000000000..ece30dc019 --- /dev/null +++ b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,104 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='MSG3D', graph_cfg=dict(layout='coco', mode='binary_adj')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=384)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + 
convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 100644 index 0000000000..290fda984d --- /dev/null +++ b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,104 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='MSG3D', graph_cfg=dict(layout='nturgb+d', mode='binary_adj')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=384)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. 
+# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/projects/msg3d/models/__init__.py b/projects/msg3d/models/__init__.py new file mode 100644 index 0000000000..82b4a3085c --- /dev/null +++ b/projects/msg3d/models/__init__.py @@ -0,0 +1,3 @@ +from .msg3d import MSG3D + +__all__ = ['MSG3D'] diff --git a/projects/msg3d/models/msg3d.py b/projects/msg3d/models/msg3d.py new file mode 100644 index 0000000000..e4124a3435 --- /dev/null +++ b/projects/msg3d/models/msg3d.py @@ -0,0 +1,75 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModule, Sequential + +from mmaction.models.utils import Graph +from mmaction.registry import MODELS +from .msg3d_utils import MSGCN, MSTCN, MW_MSG3DBlock + + +@MODELS.register_module() +class MSG3D(BaseModule): + + def __init__(self, + graph_cfg, + in_channels=3, + base_channels=96, + num_gcn_scales=13, + num_g3d_scales=6, + num_person=2, + tcn_dropout=0): + super().__init__() + + self.graph = Graph(**graph_cfg) + # Note that A is a 2D tensor + A = torch.tensor( + self.graph.A[0], dtype=torch.float32, requires_grad=False) + self.register_buffer('A', A) + self.num_point = A.shape[-1] + self.in_channels = in_channels + self.base_channels = base_channels + + self.data_bn = nn.BatchNorm1d(self.num_point * in_channels * + num_person) + c1, c2, c3 = base_channels, base_channels * 2, base_channels * 4 + + # r=3 STGC blocks + self.gcn3d1 = MW_MSG3DBlock(3, c1, A, num_g3d_scales, window_stride=1) + self.sgcn1 = Sequential( + MSGCN(num_gcn_scales, 3, c1, A), MSTCN(c1, c1), MSTCN(c1, c1)) + self.sgcn1[-1].act = nn.Identity() + self.tcn1 = MSTCN(c1, c1, tcn_dropout=tcn_dropout) + + self.gcn3d2 = MW_MSG3DBlock(c1, c2, A, num_g3d_scales, window_stride=2) + self.sgcn2 = Sequential( + MSGCN(num_gcn_scales, c1, c1, A), MSTCN(c1, c2, stride=2), + MSTCN(c2, c2)) + self.sgcn2[-1].act = nn.Identity() + self.tcn2 = MSTCN(c2, c2, tcn_dropout=tcn_dropout) + + self.gcn3d3 = MW_MSG3DBlock(c2, c3, A, num_g3d_scales, window_stride=2) + self.sgcn3 = Sequential( + MSGCN(num_gcn_scales, c2, c2, A), MSTCN(c2, c3, stride=2), + MSTCN(c3, c3)) + self.sgcn3[-1].act = nn.Identity() + self.tcn3 = MSTCN(c3, c3, tcn_dropout=tcn_dropout) + + def forward(self, x): + N, M, T, V, C = x.size() + x = x.permute(0, 1, 3, 4, 2).contiguous().reshape(N, M * V * C, T) + x = self.data_bn(x) + x = x.reshape(N * M, V, C, T).permute(0, 2, 3, 1).contiguous() + + # Apply activation to the sum of the pathways + x = F.relu(self.sgcn1(x) + self.gcn3d1(x), inplace=True) + x = self.tcn1(x) + + x = F.relu(self.sgcn2(x) + self.gcn3d2(x), inplace=True) + x = self.tcn2(x) + + x = F.relu(self.sgcn3(x) + self.gcn3d3(x), inplace=True) + x = self.tcn3(x) + + # N * M, C, T, V + return x.reshape((N, M) + x.shape[1:]) diff --git a/projects/msg3d/models/msg3d_utils.py b/projects/msg3d/models/msg3d_utils.py new file mode 100644 index 0000000000..25b4f953b6 --- /dev/null +++ b/projects/msg3d/models/msg3d_utils.py @@ -0,0 +1,342 @@ +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import build_activation_layer +from mmengine.model import BaseModule, ModuleList, Sequential + +from mmaction.models.utils import unit_tcn +from mmaction.models.utils.graph import k_adjacency, normalize_digraph + + +class MLP(BaseModule): + + def __init__(self, + in_channels, + out_channels, + act_cfg=dict(type='ReLU'), + dropout=0): + super().__init__() + channels = [in_channels] + out_channels + self.layers = 
ModuleList() + for i in range(1, len(channels)): + if dropout > 1e-3: + self.layers.append(nn.Dropout(p=dropout)) + self.layers.append( + nn.Conv2d(channels[i - 1], channels[i], kernel_size=1)) + self.layers.append(nn.BatchNorm2d(channels[i])) + if act_cfg: + self.layers.append(build_activation_layer(act_cfg)) + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return x + + +class MSGCN(BaseModule): + + def __init__(self, + num_scales, + in_channels, + out_channels, + A, + dropout=0, + act_cfg=dict(type='ReLU')): + super().__init__() + self.num_scales = num_scales + + A_powers = [ + k_adjacency(A, k, with_self=True) for k in range(num_scales) + ] + A_powers = np.stack([normalize_digraph(g) for g in A_powers]) + + # K, V, V + self.register_buffer('A', torch.Tensor(A_powers)) + self.PA = nn.Parameter(self.A.clone()) + nn.init.uniform_(self.PA, -1e-6, 1e-6) + + self.mlp = MLP( + in_channels * num_scales, [out_channels], + dropout=dropout, + act_cfg=act_cfg) + + def forward(self, x): + N, C, T, V = x.shape + A = self.A + A = A + self.PA + + support = torch.einsum('kvu,nctv->nkctu', A, x) + support = support.reshape(N, self.num_scales * C, T, V) + out = self.mlp(support) + return out + + +# ! Notice: The implementation of MSTCN in +# MS-G3D is not the same as our implementation. +class MSTCN(BaseModule): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilations=[1, 2, 3, 4], + residual=True, + act_cfg=dict(type='ReLU'), + init_cfg=[ + dict(type='Constant', layer='BatchNorm2d', val=1), + dict(type='Kaiming', layer='Conv2d', mode='fan_out') + ], + tcn_dropout=0): + + super().__init__(init_cfg=init_cfg) + # Multiple branches of temporal convolution + self.num_branches = len(dilations) + 2 + branch_channels = out_channels // self.num_branches + branch_channels_rem = out_channels - branch_channels * ( + self.num_branches - 1) + + if type(kernel_size) == list: + assert len(kernel_size) == len(dilations) + else: + kernel_size = [kernel_size] * len(dilations) + + self.branches = ModuleList([ + Sequential( + nn.Conv2d( + in_channels, branch_channels, kernel_size=1, padding=0), + nn.BatchNorm2d(branch_channels), + build_activation_layer(act_cfg), + unit_tcn( + branch_channels, + branch_channels, + kernel_size=ks, + stride=stride, + dilation=dilation), + ) for ks, dilation in zip(kernel_size, dilations) + ]) + + # Additional Max & 1x1 branch + self.branches.append( + Sequential( + nn.Conv2d( + in_channels, branch_channels, kernel_size=1, padding=0), + nn.BatchNorm2d(branch_channels), + build_activation_layer(act_cfg), + nn.MaxPool2d( + kernel_size=(3, 1), stride=(stride, 1), padding=(1, 0)), + nn.BatchNorm2d(branch_channels))) + + self.branches.append( + Sequential( + nn.Conv2d( + in_channels, + branch_channels_rem, + kernel_size=1, + padding=0, + stride=(stride, 1)), nn.BatchNorm2d(branch_channels_rem))) + + # Residual connection + if not residual: + self.residual = lambda x: 0 + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + else: + self.residual = unit_tcn( + in_channels, out_channels, kernel_size=1, stride=stride) + + self.act = build_activation_layer(act_cfg) + self.drop = nn.Dropout(tcn_dropout) + + def forward(self, x): + # Input dim: (N,C,T,V) + res = self.residual(x) + branch_outs = [] + for tempconv in self.branches: + out = tempconv(x) + branch_outs.append(out) + + out = torch.cat(branch_outs, dim=1) + out += res + out = self.act(out) + out = self.drop(out) + return out + + +class 
UnfoldTemporalWindows(BaseModule): + + def __init__(self, window_size, window_stride, window_dilation=1): + super().__init__() + self.window_size = window_size + self.window_stride = window_stride + self.window_dilation = window_dilation + + self.padding = (window_size + (window_size - 1) * + (window_dilation - 1) - 1) // 2 + self.unfold = nn.Unfold( + kernel_size=(self.window_size, 1), + dilation=(self.window_dilation, 1), + stride=(self.window_stride, 1), + padding=(self.padding, 0)) + + def forward(self, x): + # Input shape: (N,C,T,V), out: (N,C,T,V*window_size) + N, C, T, V = x.shape + x = self.unfold(x) + # Permute extra channels from window size to the graph dimension; + # -1 for number of windows + x = x.reshape(N, C, self.window_size, -1, V).permute(0, 1, 3, 2, + 4).contiguous() + x = x.reshape(N, C, -1, self.window_size * V) + return x + + +class ST_MSGCN(BaseModule): + + def __init__(self, + in_channels, + out_channels, + A, + num_scales, + window_size, + residual=False, + dropout=0, + act_cfg=dict(type='ReLU')): + + super().__init__() + self.num_scales = num_scales + self.window_size = window_size + A = self.build_st_graph(A, window_size) + + A_scales = [ + k_adjacency(A, k, with_self=True) for k in range(num_scales) + ] + A_scales = np.stack([normalize_digraph(g) for g in A_scales]) + + self.register_buffer('A', torch.Tensor(A_scales)) + self.V = len(A) + + self.PA = nn.Parameter(self.A.clone()) + nn.init.uniform_(self.PA, -1e-6, 1e-6) + + self.mlp = MLP( + in_channels * num_scales, [out_channels], + dropout=dropout, + act_cfg=act_cfg) + + # Residual connection + if not residual: + self.residual = lambda x: 0 + elif (in_channels == out_channels): + self.residual = lambda x: x + else: + self.residual = MLP(in_channels, [out_channels], act_cfg=None) + + self.act = build_activation_layer(act_cfg) + + def build_st_graph(self, A, window_size): + if not isinstance(A, np.ndarray): + A = A.data.cpu().numpy() + + assert len(A.shape) == 2 and A.shape[0] == A.shape[1] + V = len(A) + A_with_I = A + np.eye(V, dtype=A.dtype) + + A_large = np.tile(A_with_I, (window_size, window_size)).copy() + return A_large + + def forward(self, x): + N, C, T, V = x.shape # T = number of windows, V = self.V * window_size + A = self.A + self.PA + + # Perform Graph Convolution + res = self.residual(x) + agg = torch.einsum('kvu,nctv->nkctu', A, x) + agg = agg.reshape(N, self.num_scales * C, T, V) + out = self.mlp(agg) + if res == 0: + return self.act(out) + else: + return self.act(out + res) + + +class MSG3DBlock(BaseModule): + + def __init__(self, + in_channels, + out_channels, + A, + num_scales, + window_size, + window_stride, + window_dilation, + embed_factor=1, + activation='relu'): + + super().__init__() + self.window_size = window_size + self.out_channels = out_channels + self.embed_channels_in = out_channels // embed_factor + self.embed_channels_out = out_channels // embed_factor + if embed_factor == 1: + self.in1x1 = nn.Identity() + self.embed_channels_in = self.embed_channels_out = in_channels + # The first STGC block changes channels right away; + # others change at collapse + if in_channels == 3: + self.embed_channels_out = out_channels + else: + self.in1x1 = MLP(in_channels, [self.embed_channels_in]) + + self.gcn3d = Sequential( + UnfoldTemporalWindows(window_size, window_stride, window_dilation), + ST_MSGCN( + in_channels=self.embed_channels_in, + out_channels=self.embed_channels_out, + A=A, + num_scales=num_scales, + window_size=window_size)) + + self.out_conv = nn.Conv3d( + 
self.embed_channels_out, + out_channels, + kernel_size=(1, self.window_size, 1)) + self.out_bn = nn.BatchNorm2d(out_channels) + + def forward(self, x): + N, _, T, V = x.shape + x = self.in1x1(x) + # Construct temporal windows and apply MS-GCN + x = self.gcn3d(x) + + # Collapse the window dimension + x = x.reshape(N, self.embed_channels_out, -1, self.window_size, V) + x = self.out_conv(x).squeeze(dim=3) + x = self.out_bn(x) + # no activation + return x + + +class MW_MSG3DBlock(BaseModule): + + def __init__(self, + in_channels, + out_channels, + A, + num_scales, + window_sizes=[3, 5], + window_stride=1, + window_dilations=[1, 1]): + + super().__init__() + self.gcn3d = ModuleList([ + MSG3DBlock(in_channels, out_channels, A, num_scales, window_size, + window_stride, window_dilation) for window_size, + window_dilation in zip(window_sizes, window_dilations) + ]) + + def forward(self, x): + out_sum = 0 + for gcn3d in self.gcn3d: + out_sum += gcn3d(x) + return out_sum From b292e0ddaa16648c1c394d2110968cb8b8d0d405 Mon Sep 17 00:00:00 2001 From: wxDai Date: Tue, 14 Mar 2023 10:40:23 +0800 Subject: [PATCH 13/36] Add CTRGCN project (#2269) --- projects/ctrgcn/README.md | 143 +++++++++++++ ...6-joint-u100-80e_ntu60-xsub-keypoint-2d.py | 104 ++++++++++ ...6-joint-u100-80e_ntu60-xsub-keypoint-3d.py | 104 ++++++++++ projects/ctrgcn/models/__init__.py | 3 + projects/ctrgcn/models/ctrgcn.py | 104 ++++++++++ projects/ctrgcn/models/ctrgcn_utils.py | 192 ++++++++++++++++++ 6 files changed, 650 insertions(+) create mode 100644 projects/ctrgcn/README.md create mode 100644 projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py create mode 100644 projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py create mode 100644 projects/ctrgcn/models/__init__.py create mode 100644 projects/ctrgcn/models/ctrgcn.py create mode 100644 projects/ctrgcn/models/ctrgcn_utils.py diff --git a/projects/ctrgcn/README.md b/projects/ctrgcn/README.md new file mode 100644 index 0000000000..809af449f5 --- /dev/null +++ b/projects/ctrgcn/README.md @@ -0,0 +1,143 @@ +# CTRGCN Project + +[Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition](https://arxiv.org/abs/2107.12213) + + + +## Abstract + + + +Graph convolutional networks (GCNs) have been widely used and achieved remarkable results in skeleton-based action recognition. In GCNs, graph topology dominates feature aggregation and therefore is the key to extracting representative features. In this work, we propose a novel Channel-wise Topology Refinement Graph Convolution (CTR-GC) to dynamically learn different topologies and effectively aggregate joint features in different channels for skeleton-based action recognition. The proposed CTR-GC models channel-wise topologies through learning a shared topology as a generic prior for all channels and refining it with channel-specific correlations for each channel. Our refinement method introduces few extra parameters and significantly reduces the difficulty of modeling channel-wise topologies. Furthermore, via reformulating graph convolutions into a unified form, we find that CTR-GC relaxes strict constraints of graph convolutions, leading to stronger representation capability. Combining CTR-GC with temporal modeling modules, we develop a powerful graph convolutional network named CTR-GCN which notably outperforms state-of-the-art methods on the NTU RGB+D, NTU RGB+D 120, and NW-UCLA datasets. + + + +
+ +
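The `CTRGC` module added by this patch (see `projects/ctrgcn/models/ctrgcn_utils.py` further down) implements this channel-wise topology refinement. As a reading aid, here is a minimal, self-contained sketch of the core idea; the name `TinyCTRGC`, the fixed `alpha`, and the identity topology prior are illustrative assumptions only — the real module learns `alpha`, derives the shared topology from the skeleton graph, and adds proper weight initialization.

```python
import torch
import torch.nn as nn


class TinyCTRGC(nn.Module):
    """Minimal channel-wise topology refinement (illustrative sketch)."""

    def __init__(self, in_channels, out_channels, rel_channels=8):
        super().__init__()
        # Compress features before measuring joint-to-joint correlations.
        self.theta = nn.Conv2d(in_channels, rel_channels, kernel_size=1)
        self.phi = nn.Conv2d(in_channels, rel_channels, kernel_size=1)
        # Value branch and the map from correlation space to output channels.
        self.value = nn.Conv2d(in_channels, out_channels, kernel_size=1)
        self.expand = nn.Conv2d(rel_channels, out_channels, kernel_size=1)
        self.tanh = nn.Tanh()

    def forward(self, x, A, alpha=1.0):
        # x: (N, C, T, V) skeleton features, A: shared (V, V) topology prior.
        q = self.theta(x).mean(dim=-2)  # (N, R, V), temporal average pooling
        k = self.phi(x).mean(dim=-2)    # (N, R, V)
        z = self.value(x)               # (N, C_out, T, V)
        # Channel-specific correlations from pairwise joint differences.
        corr = self.tanh(q.unsqueeze(-1) - k.unsqueeze(-2))  # (N, R, V, V)
        # Refined topology = shared prior + scaled channel-wise refinement.
        refined = A[None, None] + alpha * self.expand(corr)  # (N, C_out, V, V)
        # Aggregate joint features with the refined topologies.
        return torch.einsum('ncuv,nctu->nctv', refined, z)


x = torch.randn(2, 64, 16, 17)  # (batch, channels, frames, joints)
out = TinyCTRGC(64, 64)(x, A=torch.eye(17))
print(out.shape)  # torch.Size([2, 64, 16, 17])
```
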
+ +## Usage + +### Setup Environment + +Please refer to [Get Started](https://mmaction2.readthedocs.io/en/1.x/get_started.html) to install MMAction2. + +At first, add the current folder to `PYTHONPATH`, so that Python can find your code. Run command in the current directory to add it. + +> Please run it every time after you opened a new shell. + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Data Preparation + +Prepare the NTU60 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/1.x/tools/data/skeleton/README.md). + +### Training commands + +**To train with single GPU:** + +```bash +mim train mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py +``` + +**To train with multiple GPUs:** + +```bash +mim train mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher pytorch --gpus 8 +``` + +**To train with multiple GPUs by slurm:** + +```bash +mim train mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +### Testing commands + +**To test with single GPU:** + +```bash +mim test mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT +``` + +**To test with multiple GPUs:** + +```bash +mim test mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8 +``` + +**To test with multiple GPUs by slurm:** + +```bash +mim test mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +## Results + +### NTU60_XSub_2D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | +| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: | +| uniform 100 | joint | 8 | CTRGCN | 89.6 | 10 clips | [config](./configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20230308-7aba454e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.log) | + +### NTU60_XSub_3D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | +| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: | +| uniform 100 | joint | 8 | CTRGCN | 89.0 | 10 clips | [config](./configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d_20230308-950dca0a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.log) | + +## Citation + + + +```bibtex +@inproceedings{chen2021channel, + title={Channel-wise topology refinement 
graph convolution for skeleton-based action recognition}, + author={Chen, Yuxin and Zhang, Ziqi and Yuan, Chunfeng and Li, Bing and Deng, Ying and Hu, Weiming}, + booktitle={CVPR}, + pages={13359--13368}, + year={2021} +} +``` + +## Checklist + +Here is a checklist of this project's progress, and you can ignore this part if you don't plan to contribute to MMAction2 projects. + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + + + + - [x] Basic docstrings & proper citation + + + + - [x] Converted checkpoint and results (Only for reproduction) + + + +- [x] Milestone 2: Indicates a successful model implementation. + + - [x] Training results + + + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Unit tests + + + + - [ ] Code style + + + + - [ ] `metafile.yml` and `README.md` + + diff --git a/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 0000000000..4dd8629837 --- /dev/null +++ b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,104 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='CTRGCN', graph_cfg=dict(layout='coco', mode='spatial')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=256)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + 
by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 100644 index 0000000000..7ae499b4ce --- /dev/null +++ b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,104 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='CTRGCN', graph_cfg=dict(layout='nturgb+d', mode='spatial')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=256)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by 
default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/projects/ctrgcn/models/__init__.py b/projects/ctrgcn/models/__init__.py new file mode 100644 index 0000000000..71958fdd44 --- /dev/null +++ b/projects/ctrgcn/models/__init__.py @@ -0,0 +1,3 @@ +from .ctrgcn import CTRGCN + +__all__ = ['CTRGCN'] diff --git a/projects/ctrgcn/models/ctrgcn.py b/projects/ctrgcn/models/ctrgcn.py new file mode 100644 index 0000000000..c6056071ea --- /dev/null +++ b/projects/ctrgcn/models/ctrgcn.py @@ -0,0 +1,104 @@ +import torch +import torch.nn as nn +from mmengine.model import BaseModule, ModuleList + +from mmaction.models.utils import Graph, unit_tcn +from mmaction.registry import MODELS +from .ctrgcn_utils import MSTCN, unit_ctrgcn + + +class CTRGCNBlock(BaseModule): + + def __init__(self, + in_channels, + out_channels, + A, + stride=1, + residual=True, + kernel_size=5, + dilations=[1, 2], + tcn_dropout=0): + super(CTRGCNBlock, self).__init__() + self.gcn1 = unit_ctrgcn(in_channels, out_channels, A) + self.tcn1 = MSTCN( + out_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + dilations=dilations, + residual=False, + tcn_dropout=tcn_dropout) + self.relu = nn.ReLU(inplace=True) + if not residual: + self.residual = lambda x: 0 + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + else: + self.residual = unit_tcn( + in_channels, out_channels, kernel_size=1, stride=stride) + + def forward(self, x): + y = self.relu(self.tcn1(self.gcn1(x)) + self.residual(x)) + return y + + +@MODELS.register_module() +class CTRGCN(BaseModule): + + def __init__(self, + graph_cfg, + in_channels=3, + base_channels=64, + num_stages=10, + inflate_stages=[5, 8], + down_stages=[5, 8], + pretrained=None, + num_person=2, + **kwargs): + super(CTRGCN, self).__init__() + + self.graph = Graph(**graph_cfg) + A = torch.tensor( + self.graph.A, dtype=torch.float32, requires_grad=False) + self.register_buffer('A', A) + + self.num_person = num_person + self.base_channels = base_channels + + self.data_bn = nn.BatchNorm1d(num_person * in_channels * A.size(1)) + + kwargs0 = {k: v for k, v in kwargs.items() if k != 'tcn_dropout'} + modules = [ + CTRGCNBlock( + in_channels, + base_channels, + A.clone(), + residual=False, + **kwargs0) + ] + for i in range(2, num_stages + 1): + in_channels = base_channels + out_channels = base_channels * (1 + (i in inflate_stages)) + stride = 1 + (i in down_stages) + modules.append( + CTRGCNBlock( + base_channels, + out_channels, + A.clone(), + stride=stride, + **kwargs)) + base_channels = out_channels + self.net = ModuleList(modules) + + def forward(self, x): + N, M, T, V, C = x.size() + x = x.permute(0, 1, 3, 4, 2).contiguous() + x = self.data_bn(x.view(N, M * V * C, T)) + x = x.view(N, M, V, C, T).permute(0, 1, 3, 4, + 2).contiguous().view(N * M, C, T, V) + + for gcn in self.net: + x = gcn(x) + + x = x.reshape((N, M) + x.shape[1:]) + return x diff --git a/projects/ctrgcn/models/ctrgcn_utils.py b/projects/ctrgcn/models/ctrgcn_utils.py new file mode 100644 index 0000000000..52665e8567 --- /dev/null +++ b/projects/ctrgcn/models/ctrgcn_utils.py @@ -0,0 +1,192 @@ +import torch +import torch.nn as nn +from mmcv.cnn import build_activation_layer +from mmengine.model import BaseModule, ModuleList, Sequential + +from mmaction.models.utils import unit_tcn + + +# ! Notice: The implementation of MSTCN in +# MS-G3D is not the same as our implementation. 
+class MSTCN(BaseModule): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilations=[1, 2, 3, 4], + residual=True, + act_cfg=dict(type='ReLU'), + init_cfg=[ + dict(type='Constant', layer='BatchNorm2d', val=1), + dict(type='Kaiming', layer='Conv2d', mode='fan_out') + ], + tcn_dropout=0): + + super().__init__(init_cfg=init_cfg) + # Multiple branches of temporal convolution + self.num_branches = len(dilations) + 2 + branch_channels = out_channels // self.num_branches + branch_channels_rem = out_channels - branch_channels * ( + self.num_branches - 1) + + if type(kernel_size) == list: + assert len(kernel_size) == len(dilations) + else: + kernel_size = [kernel_size] * len(dilations) + + self.branches = ModuleList([ + Sequential( + nn.Conv2d( + in_channels, branch_channels, kernel_size=1, padding=0), + nn.BatchNorm2d(branch_channels), + build_activation_layer(act_cfg), + unit_tcn( + branch_channels, + branch_channels, + kernel_size=ks, + stride=stride, + dilation=dilation), + ) for ks, dilation in zip(kernel_size, dilations) + ]) + + # Additional Max & 1x1 branch + self.branches.append( + Sequential( + nn.Conv2d( + in_channels, branch_channels, kernel_size=1, padding=0), + nn.BatchNorm2d(branch_channels), + build_activation_layer(act_cfg), + nn.MaxPool2d( + kernel_size=(3, 1), stride=(stride, 1), padding=(1, 0)), + nn.BatchNorm2d(branch_channels))) + + self.branches.append( + Sequential( + nn.Conv2d( + in_channels, + branch_channels_rem, + kernel_size=1, + padding=0, + stride=(stride, 1)), nn.BatchNorm2d(branch_channels_rem))) + + # Residual connection + if not residual: + self.residual = lambda x: 0 + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + else: + self.residual = unit_tcn( + in_channels, out_channels, kernel_size=1, stride=stride) + + self.act = build_activation_layer(act_cfg) + self.drop = nn.Dropout(tcn_dropout) + + def forward(self, x): + # Input dim: (N,C,T,V) + res = self.residual(x) + branch_outs = [] + for tempconv in self.branches: + out = tempconv(x) + branch_outs.append(out) + + out = torch.cat(branch_outs, dim=1) + out += res + out = self.act(out) + out = self.drop(out) + return out + + +class CTRGC(BaseModule): + + def __init__(self, + in_channels, + out_channels, + rel_reduction=8, + init_cfg=[ + dict(type='Constant', layer='BatchNorm2d', val=1), + dict(type='Kaiming', layer='Conv2d', mode='fan_out') + ]): + super(CTRGC, self).__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + if in_channels <= 16: + self.rel_channels = 8 + else: + self.rel_channels = in_channels // rel_reduction + self.conv1 = nn.Conv2d( + self.in_channels, self.rel_channels, kernel_size=1) + self.conv2 = nn.Conv2d( + self.in_channels, self.rel_channels, kernel_size=1) + self.conv3 = nn.Conv2d( + self.in_channels, self.out_channels, kernel_size=1) + self.conv4 = nn.Conv2d( + self.rel_channels, self.out_channels, kernel_size=1) + self.tanh = nn.Tanh() + + def forward(self, x, A=None, alpha=1): + # Input: N, C, T, V + x1, x2, x3 = self.conv1(x).mean(-2), self.conv2(x).mean( + -2), self.conv3(x) + # X1, X2: N, R, V + # N, R, V, 1 - N, R, 1, V + x1 = self.tanh(x1.unsqueeze(-1) - x2.unsqueeze(-2)) + # N, R, V, V + x1 = self.conv4(x1) * alpha + (A[None, None] if A is not None else 0 + ) # N,C,V,V + x1 = torch.einsum('ncuv,nctu->nctv', x1, x3) + return x1 + + +class unit_ctrgcn(BaseModule): + + def __init__(self, + in_channels, + out_channels, + A, + init_cfg=[ + dict( + type='Constant', + 
layer='BatchNorm2d', + val=1, + override=dict(type='Constant', name='bn', val=1e-6)), + dict(type='Kaiming', layer='Conv2d', mode='fan_out') + ]): + + super(unit_ctrgcn, self).__init__(init_cfg=init_cfg) + inter_channels = out_channels // 4 + self.inter_c = inter_channels + self.out_c = out_channels + self.in_c = in_channels + + self.num_subset = A.shape[0] + self.convs = ModuleList() + + for i in range(self.num_subset): + self.convs.append(CTRGC(in_channels, out_channels)) + + if in_channels != out_channels: + self.down = Sequential( + nn.Conv2d(in_channels, out_channels, 1), + nn.BatchNorm2d(out_channels)) + else: + self.down = lambda x: x + + self.A = nn.Parameter(A.clone()) + + self.alpha = nn.Parameter(torch.zeros(1)) + self.bn = nn.BatchNorm2d(out_channels) + self.soft = nn.Softmax(-2) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + y = None + + for i in range(self.num_subset): + z = self.convs[i](x, self.A[i], self.alpha) + y = z + y if y is not None else z + + y = self.bn(y) + y += self.down(x) + return self.relu(y) From 8c76fbd6eb275df4595561e288ba491a09a2806d Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Tue, 14 Mar 2023 10:57:41 +0800 Subject: [PATCH 14/36] [fix] fix ntu_pose_extraction (#2246) --- ...er-rcnn_r50-caffe_fpn_ms-1x_coco-person.py | 140 ++++++++++++++++++ mmaction/apis/inference.py | 17 ++- mmaction/utils/misc.py | 2 +- tools/data/skeleton/README.md | 12 +- tools/data/skeleton/README_zh-CN.md | 23 ++- tools/data/skeleton/ntu_pose_extraction.py | 121 ++++----------- 6 files changed, 207 insertions(+), 108 deletions(-) create mode 100644 demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py diff --git a/demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py b/demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py new file mode 100644 index 0000000000..934a3a5bc4 --- /dev/null +++ b/demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py @@ -0,0 +1,140 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+model = dict( + type='FasterRCNN', + _scope_='mmdet', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.53, 116.28, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +file_client_args = dict(backend='disk') + +test_pipeline = [ + dict(type='mmdet.LoadImageFromFile', file_client_args=file_client_args), + dict(type='mmdet.Resize', scale=(1333, 800), keep_ratio=True), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + metainfo=dict(classes=('person', ), palette=[(220, 20, 60)]))) diff --git a/mmaction/apis/inference.py b/mmaction/apis/inference.py 
index ac014d0350..d0a4c01501 100644 --- a/mmaction/apis/inference.py +++ b/mmaction/apis/inference.py @@ -104,7 +104,8 @@ def detection_inference(det_config: Union[str, Path, mmengine.Config], frame_paths: List[str], det_score_thr: float = 0.9, det_cat_id: int = 0, - device: Union[str, torch.device] = 'cuda:0') -> tuple: + device: Union[str, torch.device] = 'cuda:0', + with_score: bool = False) -> tuple: """Detect human boxes given frame paths. Args: @@ -117,6 +118,8 @@ def detection_inference(det_config: Union[str, Path, mmengine.Config], det_cat_id (int): The category id for human detection. Defaults to 0. device (Union[str, torch.device]): The desired device of returned tensor. Defaults to ``'cuda:0'``. + with_score (bool): Whether to append detection score after box. + Defaults to None. Returns: List[np.ndarray]: List of detected human boxes. @@ -141,10 +144,16 @@ def detection_inference(det_config: Union[str, Path, mmengine.Config], det_data_sample: DetDataSample = inference_detector(model, frame_path) pred_instance = det_data_sample.pred_instances.cpu().numpy() bboxes = pred_instance.bboxes + scores = pred_instance.scores # We only keep human detection bboxs with score larger # than `det_score_thr` and category id equal to `det_cat_id`. - bboxes = bboxes[np.logical_and(pred_instance.labels == det_cat_id, - pred_instance.scores > det_score_thr)] + valid_idx = np.logical_and(pred_instance.labels == det_cat_id, + pred_instance.scores > det_score_thr) + bboxes = bboxes[valid_idx] + scores = scores[valid_idx] + + if with_score: + bboxes = np.concatenate((bboxes, scores[:, None]), axis=-1) results.append(bboxes) data_samples.append(det_data_sample) @@ -187,7 +196,7 @@ def pose_inference(pose_config: Union[str, Path, mmengine.Config], print('Performing Human Pose Estimation for each frame') for f, d in track_iter_progress(list(zip(frame_paths, det_results))): pose_data_samples: List[PoseDataSample] \ - = inference_topdown(model, f, d, bbox_format='xyxy') + = inference_topdown(model, f, d[..., :4], bbox_format='xyxy') pose_data_sample = merge_data_samples(pose_data_samples) pose_data_sample.dataset_meta = model.dataset_meta poses = pose_data_sample.pred_instances.to_dict() diff --git a/mmaction/utils/misc.py b/mmaction/utils/misc.py index f14b8a51c2..bf4358a2f4 100644 --- a/mmaction/utils/misc.py +++ b/mmaction/utils/misc.py @@ -42,7 +42,7 @@ def frame_extract(video_path: str, Args: video_path (str): The video path. short_side (int): Target short-side of the output image. - Defaults to None, means keep original shape. + Defaults to None, means keeping original shape. out_dir (str): The output directory. Defaults to ``'./tmp'``. """ # Load the video, extract frames into OUT_DIR/video_name diff --git a/tools/data/skeleton/README.md b/tools/data/skeleton/README.md index 3ada42e8ef..10244d23a1 100644 --- a/tools/data/skeleton/README.md +++ b/tools/data/skeleton/README.md @@ -26,13 +26,21 @@ We provide links to the pre-processed skeleton annotations, you can directly dow - NTURGB+D 120 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_2d.pkl - NTURGB+D 120 \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_3d.pkl - GYM \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/gym_2d.pkl - - GYM 2D skeletons are extracted with ground-truth human bounding boxes, which can be downloaded with link: https://download.openmmlab.com/mmaction/pyskl/data/gym/gym_gt_bboxes.pkl. 
Please cite [PoseConv3D](https://arxiv.org/abs/2104.13586) if you use it in your project.
+  - GYM 2D skeletons are extracted with ground-truth human bounding boxes, which can be downloaded from this [link](https://download.openmmlab.com/mmaction/pyskl/data/gym/gym_gt_bboxes.pkl). Please cite [PoseConv3D](https://arxiv.org/abs/2104.13586) if you use it in your project.
 - UCF101 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ucf101_2d.pkl
 - HMDB51 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/hmdb51_2d.pkl
 - Diving48 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/diving48_2d.pkl
 - Kinetics400 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/k400_2d.pkl (Table of contents only, no skeleton annotations)

-For Kinetics400, since the skeleton annotations are large, we do not provide the direct download links on aliyun. Please use the following link to download the `kpfiles` and extract it under `$MMACTION2/data/k400` for Kinetics-400 training & testing: https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EeyDCVskqLtClMVVwqD53acBF2FEwkctp3vtRbkLfnKSTw?e=B3SZlM
+For Kinetics400, since the skeleton annotations are large, we do not provide direct download links on Aliyun. Please use the following link to download the `kpfiles` and extract it under `$MMACTION2/data/k400` for Kinetics400 training & testing: https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EeyDCVskqLtClMVVwqD53acBF2FEwkctp3vtRbkLfnKSTw?e=B3SZlM
+
+If you want to generate 2D skeleton annotations for a specific video, please install mmdetection and mmpose first, then use the following script to extract skeleton annotations from an NTURGB+D video:
+
+```shell
+python ntu_pose_extraction.py S001C001P001R001A001_rgb.avi S001C001P001R001A001.pkl
+```
+
+Please note that, due to the upgrade of mmpose, the inference results may differ slightly from the provided skeleton annotations.
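For a quick sanity check of the output, the dumped pickle can be inspected as below — a sketch that assumes the annotation keys written by the updated `ntu_pose_extraction.py` later in this patch, with the file name taken from the command above.

```python
import mmengine

# Inspect the annotation produced by the extraction command (illustrative path).
anno = mmengine.load('S001C001P001R001A001.pkl')
print(anno['frame_dir'], anno['label'], anno['total_frames'])
print(anno['keypoint'].shape)        # expected (num_person, num_frames, 17, 2)
print(anno['keypoint_score'].shape)  # expected (num_person, num_frames, 17)
```
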
## The Format of Annotations diff --git a/tools/data/skeleton/README_zh-CN.md b/tools/data/skeleton/README_zh-CN.md index fb6de5925a..3754175908 100644 --- a/tools/data/skeleton/README_zh-CN.md +++ b/tools/data/skeleton/README_zh-CN.md @@ -33,20 +33,27 @@ bash download_annotations.sh ${DATASET} 对于无法进行姿态提取的用户,这里提供了上述流程的输出结果,分别对应 NTURGB-D 数据集的 4 个部分: -- ntu60_xsub_train: https://download.openmmlab.com/mmaction/posec3d/ntu60_xsub_train.pkl -- ntu60_xsub_val: https://download.openmmlab.com/mmaction/posec3d/ntu60_xsub_val.pkl -- ntu120_xsub_train: https://download.openmmlab.com/mmaction/posec3d/ntu120_xsub_train.pkl -- ntu120_xsub_val: https://download.openmmlab.com/mmaction/posec3d/ntu120_xsub_val.pkl -- hmdb51: https://download.openmmlab.com/mmaction/posec3d/hmdb51.pkl -- ucf101: https://download.openmmlab.com/mmaction/posec3d/ucf101.pkl +- NTURGB+D \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_2d.pkl +- NTURGB+D \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_3d.pkl +- NTURGB+D 120 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_2d.pkl +- NTURGB+D 120 \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_3d.pkl +- GYM \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/gym_2d.pkl + - GYM 2D 姿态标注文件是基于运动员的真实标注框生成的,用户可以从这个[链接](https://download.openmmlab.com/mmaction/pyskl/data/gym/gym_gt_bboxes.pkl)下载真实标注框。如果你在项目中使用了该数据,请引用 [PoseConv3D](https://arxiv.org/abs/2104.13586) +- UCF101 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ucf101_2d.pkl +- HMDB51 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/hmdb51_2d.pkl +- Diving48 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/diving48_2d.pkl +- Kinetics400 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/k400_2d.pkl (只包含数据列表,没有姿态标注文件) -若想生成单个视频的 2D 姿态标注文件,首先,用户需要由源码安装 mmdetection 和 mmpose。之后,用户需要在 `ntu_pose_extraction.py` 中指定 `mmdet_root` 和 `mmpose_root` 变量。 -最后,用户可使用以下脚本进行 NTURGB+D 视频的姿态提取: +由于 Kinetics400 数据集姿态标注文件过大,我们不提供阿里云的下载链接,请使用此[链接](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EeyDCVskqLtClMVVwqD53acBF2FEwkctp3vtRbkLfnKSTw?e=B3SZlM)下载 `kpfiles`,解压到 `$MMACTION2/data/k400` 目录下,用于 Kinetics400 的训练和测试。 + +若想生成单个视频的 2D 姿态标注文件,用户在安装 mmdetection 和 mmpose 之后,可使用以下脚本进行 NTURGB+D 视频的姿态提取: ```python python ntu_pose_extraction.py S001C001P001R001A001_rgb.avi S001C001P001R001A001.pkl ``` +请注意,由于 mmpose 算法库升级,此脚本的推理结果与提供的姿态点数据集可能略有差异。 + 在用户获得数据集某部分所有视频的姿态标注文件(如 `ntu60_xsub_val`)后,可以将其集合成一个 list 数据并保存为 `ntu60_xsub_val.pkl`。用户可用这些大型 pickle 文件进行训练和测试。 ## PoseC3D 的标注文件格式 diff --git a/tools/data/skeleton/ntu_pose_extraction.py b/tools/data/skeleton/ntu_pose_extraction.py index 17af16e749..d60fefdd97 100644 --- a/tools/data/skeleton/ntu_pose_extraction.py +++ b/tools/data/skeleton/ntu_pose_extraction.py @@ -1,82 +1,24 @@ # Copyright (c) OpenMMLab. All rights reserved. import abc import argparse -import os import os.path as osp -import random as rd -import shutil -import string from collections import defaultdict +from tempfile import TemporaryDirectory -import cv2 -import mmcv +import mmengine import numpy as np -try: - from mmdet.apis import inference_detector, init_detector -except (ImportError, ModuleNotFoundError): - raise ImportError('Failed to import `inference_detector` and ' - '`init_detector` form `mmdet.apis`. These apis are ' - 'required in this script! 
') - -try: - from mmpose.apis import inference_top_down_pose_model, init_pose_model -except (ImportError, ModuleNotFoundError): - raise ImportError('Failed to import `inference_top_down_pose_model` and ' - '`init_pose_model` form `mmpose.apis`. These apis are ' - 'required in this script! ') - -mmdet_root = '' -mmpose_root = '' +from mmaction.apis import detection_inference, pose_inference +from mmaction.utils import frame_extract args = abc.abstractproperty() -args.det_config = f'{mmdet_root}/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py' # noqa: E501 +args.det_config = 'demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py' # noqa: E501 args.det_checkpoint = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501 args.det_score_thr = 0.5 -args.pose_config = f'{mmpose_root}/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrnet_w32_coco_256x192.py' # noqa: E501 +args.pose_config = 'demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py' # noqa: E501 args.pose_checkpoint = 'https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth' # noqa: E501 -def gen_id(size=8): - chars = string.ascii_uppercase + string.digits - return ''.join(rd.choice(chars) for _ in range(size)) - - -def extract_frame(video_path): - dname = gen_id() - os.makedirs(dname, exist_ok=True) - frame_tmpl = osp.join(dname, 'img_{:05d}.jpg') - vid = cv2.VideoCapture(video_path) - frame_paths = [] - flag, frame = vid.read() - cnt = 0 - while flag: - frame_path = frame_tmpl.format(cnt + 1) - frame_paths.append(frame_path) - - cv2.imwrite(frame_path, frame) - cnt += 1 - flag, frame = vid.read() - - return frame_paths - - -def detection_inference(args, frame_paths): - model = init_detector(args.det_config, args.det_checkpoint, args.device) - assert model.CLASSES[0] == 'person', ('We require you to use a detector ' - 'trained on COCO') - results = [] - print('Performing Human Detection for each frame') - prog_bar = mmcv.ProgressBar(len(frame_paths)) - for frame_path in frame_paths: - result = inference_detector(model, frame_path) - # We only keep human detections with score larger than det_score_thr - result = result[0][result[0][:, 4] >= args.det_score_thr] - results.append(result) - prog_bar.update() - return results - - def intersection(b0, b1): l, r = max(b0[0], b1[0]), min(b0[2], b1[2]) u, d = max(b0[1], b1[1]), min(b0[3], b1[3]) @@ -227,7 +169,7 @@ def tracklets2bbox(tracklet, num_frame): mind = np.abs(k - idx) mink = k bbox[idx] = bboxd[mink] - return bad, bbox + return bad, bbox[:, None, :] def bboxes2bbox(bbox, num_frame): @@ -287,41 +229,34 @@ def ntu_det_postproc(vid, det_results): return bboxes2bbox(det_results, len(det_results)) -def pose_inference(args, frame_paths, det_results): - model = init_pose_model(args.pose_config, args.pose_checkpoint, - args.device) - print('Performing Human Pose Estimation for each frame') - prog_bar = mmcv.ProgressBar(len(frame_paths)) - - num_frame = len(det_results) - num_person = max([len(x) for x in det_results]) - kp = np.zeros((num_person, num_frame, 17, 3), dtype=np.float32) - - for i, (f, d) in enumerate(zip(frame_paths, det_results)): - # Align input format - d = [dict(bbox=x) for x in list(d) if x[-1] > 0.5] - pose = inference_top_down_pose_model(model, f, d, format='xyxy')[0] - for j, item in enumerate(pose): - kp[j, i] = item['keypoints'] - prog_bar.update() - return 
kp - - def ntu_pose_extraction(vid, skip_postproc=False): - frame_paths = extract_frame(vid) - det_results = detection_inference(args, frame_paths) + tmp_dir = TemporaryDirectory() + frame_paths, _ = frame_extract(vid, out_dir=tmp_dir.name) + det_results, _ = detection_inference( + args.det_config, + args.det_checkpoint, + frame_paths, + args.det_score_thr, + device=args.device, + with_score=True) + if not skip_postproc: det_results = ntu_det_postproc(vid, det_results) - pose_results = pose_inference(args, frame_paths, det_results) + pose_results, _ = pose_inference(args.pose_config, args.pose_checkpoint, + frame_paths, det_results, args.device) + anno = dict() - anno['keypoint'] = pose_results[..., :2] - anno['keypoint_score'] = pose_results[..., 2] + anno['keypoint'] = np.stack( + [pose['keypoints'].astype(np.float32) for pose in pose_results], + axis=1) + anno['keypoint_score'] = np.stack( + [pose['keypoint_scores'] for pose in pose_results], axis=1) anno['frame_dir'] = osp.splitext(osp.basename(vid))[0] anno['img_shape'] = (1080, 1920) anno['original_shape'] = (1080, 1920) - anno['total_frames'] = pose_results.shape[1] + anno['total_frames'] = len(pose_results) anno['label'] = int(osp.basename(vid).split('A')[1][:3]) - 1 - shutil.rmtree(osp.dirname(frame_paths[0])) + tmp_dir.cleanup() return anno @@ -344,4 +279,4 @@ def parse_args(): args.output = global_args.output args.skip_postproc = global_args.skip_postproc anno = ntu_pose_extraction(args.video, args.skip_postproc) - mmcv.dump(anno, args.output) + mmengine.dump(anno, args.output) From d6dd49137d9f6b284ff45929a397294a789fc9f3 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 16 Mar 2023 20:56:18 +0800 Subject: [PATCH 15/36] [doc] cancel compile pdf docs (#2302) --- .readthedocs.yml | 3 ++- tests/models/recognizers/test_recognizer2d.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index 6cfbf5d310..070c61832b 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,6 +1,7 @@ version: 2 -formats: all +formats: + - epub python: version: 3.7 diff --git a/tests/models/recognizers/test_recognizer2d.py b/tests/models/recognizers/test_recognizer2d.py index 300e63b460..a1c8ef4b1f 100644 --- a/tests/models/recognizers/test_recognizer2d.py +++ b/tests/models/recognizers/test_recognizer2d.py @@ -190,7 +190,7 @@ def test_tpn(): recognizer = MODELS.build(config.model) - input_shape = (1, 8, 3, 224, 224) + input_shape = (1, 8, 3, 32, 32) demo_inputs = generate_recognizer_demo_inputs(input_shape) imgs = demo_inputs['imgs'] From dabe21abadce5435c85c45454083375ffa848082 Mon Sep 17 00:00:00 2001 From: LinXiaoZheng <90811472+Zheng-LinXiao@users.noreply.github.com> Date: Wed, 22 Mar 2023 14:05:57 +0800 Subject: [PATCH 16/36] [Docs] Add the docs about readme_zh-CN.md (#2252) --- README.md | 23 +++- README_zh-CN.md | 317 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 337 insertions(+), 3 deletions(-) create mode 100644 README_zh-CN.md diff --git a/README.md b/README.md index ab41e0f96e..f3a575f4ce 100644 --- a/README.md +++ b/README.md @@ -48,10 +48,12 @@ +English | [简体中文](/README_zh-CN.md) + ## Introduction MMAction2 is an open-source toolbox for video understanding based on PyTorch. -It is a part of the [OpenMMLab](http://openmmlab.org/) project. +It is a part of the [OpenMMLab](http://openmmlab.com/) project. The 1.x branch works with **PyTorch 1.6+**. @@ -84,7 +86,7 @@ The 1.x branch works with **PyTorch 1.6+**. 
## What's New -**Release (2022.02.10)**: v1.0.0rc3 with the following new features: +**Release (2023.02.10)**: v1.0.0rc3 with the following new features: - Support Action Recognition model UniFormer V1(ICLR'2022), UniFormer V2(Arxiv'2022). - Support training MViT V2(CVPR'2022), and MaskFeat(CVPR'2022) fine-tuning. @@ -94,6 +96,20 @@ The 1.x branch works with **PyTorch 1.6+**. Please refer to [install.md](https://mmaction2.readthedocs.io/en/1.x/get_started.html) for more detailed instructions. +```shell +conda create --name openmmlab python=3.8 -y +conda activate open-mmlab +conda install pytorch torchvision -c pytorch # This command will automatically install the latest version PyTorch and cudatoolkit, please check whether they match your environment. +pip install -U openmim +mim install mmengine 'mmcv>=2.0.0rc1' +mim install "mmdet>=3.0.0rc5" # optional +mim install "mmpose>=1.0.0rc0" # optional +git clone https://github.com/open-mmlab/mmaction2.git +cd mmaction2 +git checkout 1.x +pip3 install -e . +``` + ## Supported Methods @@ -271,7 +287,7 @@ If you find this project useful in your research, please consider cite: ## Contributing -We appreciate all contributions to improve MMAction2. Please refer to [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/1.x/CONTRIBUTING.md) in MMCV for more details about the contributing guideline. +We appreciate all contributions to improve MMAction2. Please refer to [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/2.x/CONTRIBUTING.md) in MMCV for more details about the contributing guideline. ## Acknowledgement @@ -287,6 +303,7 @@ We wish that the toolbox and benchmark could serve the growing research communit - [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark. - [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. - [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. +- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark. - [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark. - [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. - [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox. diff --git a/README_zh-CN.md b/README_zh-CN.md new file mode 100644 index 0000000000..5d0d091cd1 --- /dev/null +++ b/README_zh-CN.md @@ -0,0 +1,317 @@ +
+ +
 
+
+ OpenMMLab 官网 + + + HOT + + +      + OpenMMLab 开放平台 + + + TRY IT OUT + + +
+ +[![Documentation](https://readthedocs.org/projects/mmaction2/badge/?version=latest)](https://mmaction2.readthedocs.io/en/1.x/) +[![actions](https://github.com/open-mmlab/mmaction2/workflows/build/badge.svg)](https://github.com/open-mmlab/mmaction2/actions) +[![codecov](https://codecov.io/gh/open-mmlab/mmaction2/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmaction2) +[![PyPI](https://img.shields.io/pypi/v/mmaction2)](https://pypi.org/project/mmaction2/) +[![LICENSE](https://img.shields.io/github/license/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/blob/master/LICENSE) +[![Average time to resolve an issue](https://isitmaintained.com/badge/resolution/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues) +[![Percentage of issues still open](https://isitmaintained.com/badge/open/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues) + +[📘文档](https://mmaction2.readthedocs.io/zh_CN//1.x/) | +[🛠️安装指南](https://mmaction2.readthedocs.io/zh_CN//1.x/get_started.html) | +[👀模型库](https://mmaction2.readthedocs.io/zh_CN//1.x/modelzoo.html) | +[🆕更新](https://mmaction2.readthedocs.io/zh_CN/1.x/notes/changelog.html) | +[🚀进行中项目](https://github.com/open-mmlab/mmaction2/projects) | +[🤔问题反馈](https://github.com/open-mmlab/mmaction2/issues/new/choose) + +
+ +
+ + + + + + + + + + + +
+ +[English](/README.md) | 简体中文 + +## 简介 + +MMAction2 是一款基于 PyTorch 的视频理解开源工具箱,是 [OpenMMLab](https://openmmlab.com/) 项目的成员之一 + +1.x 分支代码目前支持 **PyTorch 1.6以上** 的版本 + +
+
+
+

Kinetics-400 上的动作识别

+
+
+
+

NTURGB+D-120 上的基于人体姿态的动作识别

+
+
+
+
+

Kinetics-400 上的基于 skeleton 的时空动作检测和动作识别

+
+
+
+

AVA-2.1 上的时空动作检测

+
+ +## 主要特性 + +- **模块设计**:MMAction2 将统一的视频理解框架解耦成不同的模块组件,通过组合不同的模块组件,用户可以便捷地构建自定义的视频理解模型 + +- **支持多种任务和数据集**:MMAction2 支持多种视频理解任务,包括动作识别,时序动作检测,时空动作检测以及基于人体姿态的动作识别 + +- **详尽的单元测试和文档**:MMAction2 提供了详尽的说明文档,API 接口说明,全面的单元测试,以供社区参考 + +## 更新记录 + +**v1.0.0rc3 版本 (2023.02.10)**: + +- 支持动作识别模型 UniFormer V1(ICLR'2022),UniFormer V2(Arxiv'2022) +- 支持训练 MViT V2(CVPR'2022)和 MaskFeat(CVPR'2022)微调 +- 为 MMAction2 模型提供统一的推理接口实现视频分析任务的快速预测 ([demo](https://github.com/open-mmlab/mmaction2/blob/dev-1.x/demo/README.md#inferencer)) + +## 安装 + +MMAction2 依赖 [PyTorch](https://pytorch.org/), [MMCV](https://github.com/open-mmlab/mmcv), [MMEngine](https://github.com/open-mmlab/mmengine), [MMDetection](https://github.com/open-mmlab/mmdetection) (可选), [MMPose](https://github.com/open-mmlab/mmpose) (可选),以下是安装的简要步骤。 +更详细的安装指南请参考 [install.md](https://mmaction2.readthedocs.io/zh_CN/1.x/get_started.html) 。 + +```shell +conda create --name openmmlab python=3.8 -y +conda activate open-mmlab +conda install pytorch torchvision -c pytorch # 以上命令将自动安装最新版本的 PyTorch 和 cudatoolkit,请检查它们是否和你的环境匹配 +pip install -U openmim +mim install mmengine 'mmcv>=2.0.0rc1' +mim install "mmdet>=3.0.0rc5" # 可选 +mim install "mmpose>=1.0.0rc0" # 可选 +git clone https://github.com/open-mmlab/mmaction2.git +cd mmaction2 +git checkout 1.x +pip3 install -e . +``` + +## 模型库 + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
行为识别方法
C3D (CVPR'2014)TSN (ECCV'2016)I3D (CVPR'2017)C2D (CVPR'2018)I3D Non-Local (CVPR'2018)
R(2+1)D (CVPR'2018)TRN (ECCV'2018)TSM (ICCV'2019)TSM Non-Local (ICCV'2019)SlowOnly (ICCV'2019)
SlowFast (ICCV'2019)CSN (ICCV'2019)TIN (AAAI'2020)TPN (CVPR'2020)X3D (CVPR'2020)
MultiModality: Audio (ArXiv'2020)TANet (ArXiv'2020)TimeSformer (ICML'2021)VideoSwin (CVPR'2022)VideoMAE (NeurIPS'2022)
MViT V2 (CVPR'2022)UniFormer V1 (ICLR'2022)UniFormer V2 (Arxiv'2022)
时序动作检测方法
SSN (ICCV'2017)BSN (ECCV'2018)BMN (ICCV'2019)
时空动作检测方法
ACRN (ECCV'2018)SlowOnly+Fast R-CNN (ICCV'2019)SlowFast+Fast R-CNN (ICCV'2019)LFB (CVPR'2019)
基于骨骼点的动作识别方法
ST-GCN (AAAI'2018)2s-AGCN (CVPR'2019)PoseC3D (CVPR'2022)STGCN++ (ArXiv'2022)
+ +各个模型的结果和设置都可以在对应的 config 目录下的 *README_zh-CN.md* 中查看。整体的概况也可也在 [**模型库**](https://mmaction2.readthedocs.io/zh_CN/1.x/modelzoo.html) 页面中查看。 + +MMAction2 将跟进学界的最新进展,并支持更多算法和框架。如果您对 MMAction2 有任何功能需求,请随时在 [问题](https://github.com/open-mmlab/mmaction2/issues/19) 中留言。 + +## 数据集 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
动作识别数据集
HMDB51 (Homepage) (ICCV'2011)UCF101 (Homepage) (CRCV-IR-12-01)ActivityNet (Homepage) (CVPR'2015)Kinetics-[400/600/700] (Homepage) (CVPR'2017)
SthV1 (ICCV'2017)SthV2 (Homepage) (ICCV'2017)Diving48 (Homepage) (ECCV'2018)Jester (Homepage) (ICCV'2019)
Moments in Time (Homepage) (TPAMI'2019)Multi-Moments in Time (Homepage) (ArXiv'2019)HVU (Homepage) (ECCV'2020)OmniSource (Homepage) (ECCV'2020)
FineGYM (Homepage) (CVPR'2020)
时序动作检测数据集
THUMOS14 (Homepage) (THUMOS Challenge 2014)ActivityNet (Homepage) (CVPR'2015)
时空动作检测数据集
UCF101-24* (Homepage) (CRCV-IR-12-01)JHMDB* (Homepage) (ICCV'2015)AVA (Homepage) (CVPR'2018)AVA-Kinetics (Homepage) (Arxiv'2020)
基于骨骼点的动作识别数据集
PoseC3D-FineGYM (Homepage) (ArXiv'2021)PoseC3D-NTURGB+D (Homepage) (ArXiv'2021)PoseC3D-UCF101 (Homepage) (ArXiv'2021)PoseC3D-HMDB51 (Homepage) (ArXiv'2021)
+ +标记 * 代表对应数据集并未被完全支持,但提供相应的数据准备步骤。整体的概况也可也在 [**数据集**](https://mmaction2.readthedocs.io/zh_CN/latest/supported_datasets.html) 页面中查看。 + +## 数据集准备 + +请参考 [数据准备](https://mmaction2.readthedocs.io/en/1.x/user_guides/2_data_prepare.html) 了解数据集准备概况。所有支持的数据集都列于 [数据集清单](https://mmaction2.readthedocs.io/zh_CN/latest/supported_datasets.html) 中。 + +## FAQ + +请参考 [FAQ](docs/zh_cn/notes/faq.md) 了解其他用户的常见问题。 + +## 相关工作 + +目前有许多研究工作或工程项目基于 MMAction2 搭建,例如: + +- Video Swin Transformer. [\[论文\]](https://arxiv.org/abs/2106.13230)[\[代码\]](https://github.com/SwinTransformer/Video-Swin-Transformer) +- Evidential Deep Learning for Open Set Action Recognition, ICCV 2021 **Oral**. [\[论文\]](https://arxiv.org/abs/2107.10161)[\[代码\]](https://github.com/Cogito2012/DEAR) +- Rethinking Self-supervised Correspondence Learning: A Video Frame-level Similarity Perspective, ICCV 2021 **Oral**. [\[论文\]](https://arxiv.org/abs/2103.17263)[\[代码\]](https://github.com/xvjiarui/VFS) + +更多详情可见 [相关工作](docs/en/notes/projects.md) 。 + +## 许可 + +该项目开源自 [Apache 2.0 license](LICENSE). + +## 引用 + +如果你觉得 MMAction2 对你的研究有所帮助,可以考虑引用它: + +```BibTeX +@misc{2020mmaction2, + title={OpenMMLab's Next Generation Video Understanding Toolbox and Benchmark}, + author={MMAction2 Contributors}, + howpublished = {\url{https://github.com/open-mmlab/mmaction2}}, + year={2020} +} +``` + +## 参与贡献 + +我们非常欢迎用户对于 MMAction2 做出的任何贡献,可以参考 [贡献指南](https://github.com/open-mmlab/mmcv/blob/2.x/CONTRIBUTING_zh-CN.md) 文件了解更多细节。 + +## 致谢 + +MMAction2 是一款由不同学校和公司共同贡献的开源项目。我们感谢所有为项目提供算法复现和新功能支持的贡献者,以及提供宝贵反馈的用户。 +我们希望该工具箱和基准测试可以为社区提供灵活的代码工具,供用户复现现有算法并开发自己的新模型,从而不断为开源社区提供贡献。 + +## OpenMMLab 的其他项目 + +- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models. +- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision. +- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages. +- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark. +- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. +- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. +- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO 系列工具箱和基准测试 +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark. +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox. +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. +- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark. +- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark. +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark. +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark. +- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. 
+- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark. +- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. +- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework. From 56e237df21e38aa0174267dfdf213e9e14b123ac Mon Sep 17 00:00:00 2001 From: wxDai Date: Wed, 22 Mar 2023 15:40:56 +0800 Subject: [PATCH 17/36] [Update] update detection related folders (#2262) --- .../detection/_base_/models/slowonly_r50.py | 54 ----- .../_base_/models/slowonly_r50_nl.py | 51 ---- configs/detection/acrn/README.md | 42 +--- configs/detection/acrn/metafile.yml | 18 +- ...ned-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py | 69 ++++-- ...ned-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py | 109 ++++++++- configs/detection/ava/README.md | 125 ---------- configs/detection/ava/metafile.yml | 227 ------------------ ...pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py | 50 ---- ...etrained-r101_8xb16-8x8x1-20e_ava21-rgb.py | 72 ------ ...ained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py | 16 -- ...rained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py | 74 ------ ...etrained-r50_8xb16-4x16x1-20e_ava21-rgb.py | 9 - configs/detection/ava_kinetics/README.md | 103 -------- configs/detection/slowfast/README.md | 96 ++++++++ configs/detection/slowfast/metafile.yml | 121 ++++++++++ ...-r50-context_8xb16-4x16x1-20e_ava21-rgb.py | 0 ...ral-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py | 0 ...etrained-r50_8xb16-4x16x1-20e_ava21-rgb.py | 90 ++++++- ...ned-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py | 86 ++++++- ...pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py | 171 +++++++++++++ ...-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py | 0 configs/detection/slowonly/README.md | 126 ++++++++++ configs/detection/slowonly/metafile.yml | 102 ++++++++ ...re-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py | 65 ++++- ...pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py | 60 ++++- ...nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py | 0 ...d_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py | 0 ...ral-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py | 0 ...context_8xb8-8x8x1-10e_ava-kinetics-rgb.py | 0 ...8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py | 17 +- ...re-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py | 0 ...pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py | 0 ...etrained-r101_8xb16-8x8x1-20e_ava21-rgb.py | 151 ++++++++++++ ...ained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py | 160 ++++++++++++ ...rained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py | 159 ++++++++++++ ...etrained-r50_8xb16-4x16x1-20e_ava21-rgb.py | 69 +++++- ...etrained-r50_8xb16-4x16x1-20e_ava21-rgb.py | 153 ++++++++++++ .../models/backbones/resnet3d_slowonly.py | 10 - model-index.yml | 3 +- 40 files changed, 1746 insertions(+), 912 deletions(-) delete mode 100644 configs/detection/_base_/models/slowonly_r50.py delete mode 100644 configs/detection/_base_/models/slowonly_r50_nl.py delete mode 100644 configs/detection/ava/README.md delete mode 100644 configs/detection/ava/metafile.yml delete mode 100644 configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py delete mode 100644 configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py delete mode 100644 configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py delete mode 100644 configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py delete mode 100644 
configs/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py delete mode 100644 configs/detection/ava_kinetics/README.md create mode 100644 configs/detection/slowfast/README.md create mode 100644 configs/detection/slowfast/metafile.yml rename configs/detection/{ava => slowfast}/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py (100%) rename configs/detection/{ava => slowfast}/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py (100%) rename configs/detection/{ava => slowfast}/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py (52%) rename configs/detection/{ava => slowfast}/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py (52%) create mode 100644 configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py rename configs/detection/{ava => slowfast}/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py (100%) create mode 100644 configs/detection/slowonly/README.md create mode 100644 configs/detection/slowonly/metafile.yml rename configs/detection/{ava_kinetics => slowonly}/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py (65%) rename configs/detection/{ava_kinetics => slowonly}/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py (70%) rename configs/detection/{ava_kinetics => slowonly}/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py (100%) rename configs/detection/{ava_kinetics => slowonly}/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py (100%) rename configs/detection/{ava_kinetics => slowonly}/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py (100%) rename configs/detection/{ava_kinetics => slowonly}/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py (100%) rename configs/detection/{ava_kinetics => slowonly}/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py (85%) rename configs/detection/{ava_kinetics => slowonly}/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py (100%) rename configs/detection/{ava_kinetics => slowonly}/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py (100%) create mode 100644 configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py create mode 100644 configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py create mode 100644 configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py rename configs/detection/{ava => slowonly}/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py (56%) create mode 100644 configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py diff --git a/configs/detection/_base_/models/slowonly_r50.py b/configs/detection/_base_/models/slowonly_r50.py deleted file mode 100644 index 4a06a4ab53..0000000000 --- a/configs/detection/_base_/models/slowonly_r50.py +++ /dev/null @@ -1,54 +0,0 @@ -url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' - 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-' - 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' - 'kinetics400-rgb_20220901-e7b65fad.pth') - -model = dict( - type='FastRCNN', - _scope_='mmdet', - init_cfg=dict(type='Pretrained', checkpoint=url), - backbone=dict( - type='ResNet3dSlowOnly', - depth=50, - pretrained=None, - pretrained2d=False, - 
lateral=False, - num_stages=4, - conv1_kernel=(1, 7, 7), - conv1_stride_t=1, - pool1_stride_t=1, - spatial_strides=(1, 2, 2, 1)), - roi_head=dict( - type='AVARoIHead', - bbox_roi_extractor=dict( - type='SingleRoIExtractor3D', - roi_layer_type='RoIAlign', - output_size=8, - with_temporal_pool=True), - bbox_head=dict( - type='BBoxHeadAVA', - in_channels=2048, - num_classes=81, - multilabel=True, - dropout_ratio=0.5)), - data_preprocessor=dict( - type='ActionDataPreprocessor', - _scope_='mmaction', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - format_shape='NCTHW'), - train_cfg=dict( - rcnn=dict( - assigner=dict( - type='MaxIoUAssignerAVA', - pos_iou_thr=0.9, - neg_iou_thr=0.9, - min_pos_iou=0.9), - sampler=dict( - type='RandomSampler', - num=32, - pos_fraction=1, - neg_pos_ub=-1, - add_gt_as_proposals=True), - pos_weight=1.0)), - test_cfg=dict(rcnn=None)) diff --git a/configs/detection/_base_/models/slowonly_r50_nl.py b/configs/detection/_base_/models/slowonly_r50_nl.py deleted file mode 100644 index 6dcdc30bfc..0000000000 --- a/configs/detection/_base_/models/slowonly_r50_nl.py +++ /dev/null @@ -1,51 +0,0 @@ -# model setting -model = dict( - type='mmdet.FastRCNN', - backbone=dict( - type='ResNet3dSlowOnly', - depth=50, - pretrained=None, - pretrained2d=False, - lateral=False, - num_stages=4, - conv1_kernel=(1, 7, 7), - conv1_stride_t=1, - pool1_stride_t=1, - spatial_strides=(1, 2, 2, 1), - norm_cfg=dict(type='BN3d', requires_grad=True), - non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)), - non_local_cfg=dict( - sub_sample=True, - use_scale=True, - norm_cfg=dict(type='BN3d', requires_grad=True), - mode='embedded_gaussian')), - roi_head=dict( - type='AVARoIHead', - bbox_roi_extractor=dict( - type='SingleRoIExtractor3D', - roi_layer_type='RoIAlign', - output_size=8, - with_temporal_pool=True), - bbox_head=dict( - type='BBoxHeadAVA', - in_channels=2048, - num_classes=81, - multilabel=True, - dropout_ratio=0.5)), - train_cfg=dict( - rcnn=dict( - assigner=dict( - type='MaxIoUAssignerAVA', - pos_iou_thr=0.9, - neg_iou_thr=0.9, - min_pos_iou=0.9, - iou_calculator=dict(type='mmdet.BboxOverlaps2D')), - sampler=dict( - type='mmdet.RandomSampler', - num=32, - pos_fraction=1, - neg_pos_ub=-1, - add_gt_as_proposals=True), - pos_weight=1.0, - debug=False)), - test_cfg=dict(rcnn=None)) diff --git a/configs/detection/acrn/README.md b/configs/detection/acrn/README.md index a9af00da0c..d08efb6d2d 100644 --- a/configs/detection/acrn/README.md +++ b/configs/detection/acrn/README.md @@ -20,23 +20,19 @@ Current state-of-the-art approaches for spatio-temporal action localization rely ### AVA2.1 -| frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | gpu_mem(M) | config | ckpt | log | -| :---------------------: | :--------: | :--: | :---------------: | :----------: | :---: | :--------: | :---------------------------------------: | :-------------------------------------: | :-------------------------------------: | -| 8x8x1 | raw | 8 | SlowFast ResNet50 | Kinetics-400 | 27.58 | 15263 | [config](/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb_20220906-0dae1a90.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.log) |
+| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log |
+| :---------------------: | :--: | :---------------: | :----------: | :---: | :----------------------------------------------: | :--------------------------------------------: | :-------------------------------------------: |
+| 8x8x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 27.65 | [config](/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb_20220906-0dae1a90.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.log) |

### AVA2.2

-| frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | gpu_mem(M) | config | ckpt | log |
-| :---------------------: | :--------: | :--: | :---------------: | :----------: | :---: | :--------: | :---------------------------------------: | :-------------------------------------: | :-------------------------------------: |
-| 8x8x1 | raw | 8 | SlowFast ResNet50 | Kinetics-400 | 27.63 | 15263 | [config](/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb_20220906-66ec24a2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.log) |
+| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log |
+| :---------------------: | :--: | :---------------: | :----------: | :---: | :----------------------------------------------: | :--------------------------------------------: | :-------------------------------------------: |
+| 8x8x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 27.71 | [config](/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb_20220906-66ec24a2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.log) |

-Note:
+1. The **gpus** indicates the number of GPUs we used to get the checkpoint. If you want to use a different number of GPUs or videos per GPU, the best way is to set `--auto-scale-lr` when calling `tools/train.py`; this parameter will auto-scale the learning rate according to the actual batch size and the original batch size.

-1. The **gpus** indicates the number of gpu we used to get the checkpoint. 
- According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU,
-  e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
-
-For more details on data preparation, you can refer to to [AVA Data Preparation](/tools/data/ava/README.md).
+For more details on data preparation, you can refer to [AVA](/tools/data/ava/README.md).

## Train

@@ -46,14 +42,14 @@ You can use the following command to train a model.
python tools/train.py ${CONFIG_FILE} [optional arguments]
```

-Example: train ACRN with SlowFast backbone on AVA in a deterministic option.
+Example: train ACRN with the SlowFast backbone on AVA2.1, using deterministic training and periodic validation.

```shell
python tools/train.py configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py \
-    --cfg-options randomness.seed=0 randomness.deterministic=True
+    --seed 0 --deterministic
```

-For more details and optional arguments infos, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).

## Test

@@ -63,29 +59,17 @@ You can use the following command to test a model.
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```

-Example: test ACRN with SlowFast backbone on AVA and dump the result to a pkl file.
+Example: test ACRN with the SlowFast backbone on AVA2.1 and dump the result to a pkl file.

```shell
python tools/test.py configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py \
    checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```

-For more details and optional arguments infos, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
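
For reference, a minimal sketch of the multi-GPU counterparts of the train and test commands above, assuming the standard `tools/dist_train.sh` and `tools/dist_test.sh` launchers shipped with OpenMMLab projects; the GPU count below is a placeholder, and `--auto-scale-lr` is the option mentioned in the note above:

```shell
# Distributed training on 4 GPUs; --auto-scale-lr rescales the base learning
# rate (tuned for 8 GPUs x 8 videos per GPU, per the 8xb8 config name) to the
# actual total batch size.
bash tools/dist_train.sh configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py 4 \
    --auto-scale-lr

# Distributed testing on 4 GPUs, dumping predictions to a pkl file.
bash tools/dist_test.sh configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py \
    checkpoints/SOME_CHECKPOINT.pth 4 --dump result.pkl
```
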
## Citation - - -```BibTeX -@inproceedings{gu2018ava, - title={Ava: A video dataset of spatio-temporally localized atomic visual actions}, - author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others}, - booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, - pages={6047--6056}, - year={2018} -} -``` - ```BibTeX @inproceedings{sun2018actor, title={Actor-centric relation network}, diff --git a/configs/detection/acrn/metafile.yml b/configs/detection/acrn/metafile.yml index 3212cb7dc8..9db11da474 100644 --- a/configs/detection/acrn/metafile.yml +++ b/configs/detection/acrn/metafile.yml @@ -1,9 +1,9 @@ Collections: -- Name: ACRN - README: configs/detection/acrn/README.md - Paper: - URL: https://arxiv.org/abs/1807.10982 - Title: "Actor-Centric Relation Network" + - Name: ACRN + README: configs/detection/acrn/README.md + Paper: + URL: https://arxiv.org/abs/1807.10982 + Title: "Actor-Centric Relation Network" Models: - Name: slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb @@ -14,7 +14,6 @@ Models: Batch Size: 8 Epochs: 10 Pretrained: Kinetics-400 - Resolution: short-side 320 Training Data: AVA v2.1 Training Resources: 8 GPUs Modality: RGB @@ -22,7 +21,7 @@ Models: - Dataset: AVA v2.1 Task: Action Detection Metrics: - mAP: 27.58 + mAP: 27.65 Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb_20220906-0dae1a90.pth @@ -34,14 +33,13 @@ Models: Batch Size: 8 Epochs: 10 Pretrained: Kinetics-400 - Resolution: short-side 320 Training Data: AVA v2.2 Training Resources: 8 GPUs Modality: RGB Results: - - Dataset: AVA v2.1 + - Dataset: AVA v2.2 Task: Action Detection Metrics: - mAP: 27.63 + mAP: 27.71 Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb_20220906-66ec24a2.pth diff --git a/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py b/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py index 641364bcce..10928a96ee 100644 --- a/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py +++ b/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py @@ -1,16 +1,16 @@ -_base_ = [ - '../../_base_/default_runtime.py', '../_base_/models/slowonly_r50.py' -] +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth') model = dict( + type='FastRCNN', + _scope_='mmdet', + 
init_cfg=dict(type='Pretrained', checkpoint=url), backbone=dict( - _delete_=True, - type='ResNet3dSlowFast', - _scope_='mmaction', - pretrained=( - 'https://download.openmmlab.com/mmaction/recognition/slowfast/' - 'slowfast_r50_8x8x1_256e_kinetics400_rgb/' - 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth'), + type='mmaction.ResNet3dSlowFast', + pretrained=None, resample_rate=4, speed_ratio=4, channel_ratio=8, @@ -37,17 +37,44 @@ pool1_stride_t=1, spatial_strides=(1, 2, 2, 1))), roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), shared_head=dict(type='ACRNHead', in_channels=4608, out_channels=2304), - bbox_head=dict(in_channels=2304))) + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2304, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) dataset_type = 'AVADataset' data_root = 'data/ava/rawframes' anno_root = 'data/ava/annotations' -proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' - 'recall_93.9.pkl') -proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' - ann_file_train = f'{anno_root}/ava_train_v2.1.csv' ann_file_val = f'{anno_root}/ava_val_v2.1.csv' @@ -56,9 +83,17 @@ label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' 
+ 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +file_client_args = dict( + io_backend='petrel', + path_mapping=dict({'data/ava': 's254:s3://openmmlab/datasets/action/ava'})) train_pipeline = [ dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='RandomRescale', scale_range=(256, 320)), dict(type='RandomCrop', size=256), dict(type='Flip', flip_ratio=0.5), @@ -69,7 +104,7 @@ val_pipeline = [ dict( type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='Resize', scale=(-1, 256)), dict(type='FormatShape', input_format='NCTHW', collapse=True), dict(type='PackActionInputs') diff --git a/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py b/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py index 02992c654a..4537d25cc7 100644 --- a/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py +++ b/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py @@ -1,5 +1,75 @@ -_base_ = [('slowfast-acrn_kinetics400-pretrained-r50' - '_8xb8-8x8x1-cosine-10e_ava21-rgb.py')] +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowFast', + pretrained=None, + resample_rate=4, + speed_ratio=4, + channel_ratio=8, + slow_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=True, + fusion_kernel=7, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + spatial_strides=(1, 2, 2, 1)), + fast_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1))), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + shared_head=dict(type='ACRNHead', in_channels=4608, out_channels=2304), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2304, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) dataset_type = 'AVADataset' data_root = 'data/ava/rawframes' @@ -17,9 +87,13 @@ 'recall_93.9.pkl') proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' +file_client_args = dict(io_backend='disk') +file_client_args = dict( + io_backend='petrel', + path_mapping=dict({'data/ava': 's254:s3://openmmlab/datasets/action/ava'})) 
train_pipeline = [ dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='RandomRescale', scale_range=(256, 320)), dict(type='RandomCrop', size=256), dict(type='Flip', flip_ratio=0.5), @@ -30,7 +104,7 @@ val_pipeline = [ dict( type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='Resize', scale=(-1, 256)), dict(type='FormatShape', input_format='NCTHW', collapse=True), dict(type='PackActionInputs') @@ -71,3 +145,30 @@ label_file=label_file, exclude_file=exclude_file_val) test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=10, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=2, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=8, + eta_min=0, + by_epoch=True, + begin=2, + end=10, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) diff --git a/configs/detection/ava/README.md b/configs/detection/ava/README.md deleted file mode 100644 index 1f6354641b..0000000000 --- a/configs/detection/ava/README.md +++ /dev/null @@ -1,125 +0,0 @@ -# AVA - -[Ava: A video dataset of spatio-temporally localized atomic visual actions](https://openaccess.thecvf.com/content_cvpr_2018/html/Gu_AVA_A_Video_CVPR_2018_paper.html) - - - -
- -
- -## Abstract - - - -This paper introduces a video dataset of spatio-temporally localized Atomic Visual Actions (AVA). The AVA dataset densely annotates 80 atomic visual actions in 430 15-minute video clips, where actions are localized in space and time, resulting in 1.58M action labels with multiple labels per person occurring frequently. The key characteristics of our dataset are: (1) the definition of atomic visual actions, rather than composite actions; (2) precise spatio-temporal annotations with possibly multiple annotations for each person; (3) exhaustive annotation of these atomic actions over 15-minute video clips; (4) people temporally linked across consecutive segments; and (5) using movies to gather a varied set of action representations. This departs from existing datasets for spatio-temporal action recognition, which typically provide sparse annotations for composite actions in short video clips. We will release the dataset publicly. -AVA, with its realistic scene and action complexity, exposes the intrinsic difficulty of action recognition. To benchmark this, we present a novel approach for action localization that builds upon the current state-of-the-art methods, and demonstrates better performance on JHMDB and UCF101-24 categories. While setting a new state of the art on existing datasets, the overall results on AVA are low at 15.6% mAP, underscoring the need for developing new approaches for video understanding. - - - -
- -
- - - -```BibTeX -@inproceedings{feichtenhofer2019slowfast, - title={Slowfast networks for video recognition}, - author={Feichtenhofer, Christoph and Fan, Haoqi and Malik, Jitendra and He, Kaiming}, - booktitle={Proceedings of the IEEE international conference on computer vision}, - pages={6202--6211}, - year={2019} -} -``` - -## Results and Models - -### AVA2.1 - -| frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | gpu_mem(M) | config | ckpt | log | -| :---------------------: | :--------: | :--: | :----------------------------------: | :----------: | :---: | :--------: | :---------------------------------: | :-------------------------------: | :------------------------------: | -| 4x16x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-400 | 20.76 | 8503 | [config](/configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-953ef5fe.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) | -| 4x16x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 22.77 | 8503 | [config](/configs/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-b3b6d44e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) | -| 4x16x1 | raw | 8 | SlowOnly ResNet50 (NonLocalEmbedGauss) | Kinetics-400 | 21.49 | 11870 | [config](/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb_20220906-5ae3f91b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.log) | -| 8x8x1 | raw | 8 | SlowOnly ResNet50 (NonLocalEmbedGauss) | Kinetics-400 | 23.74 | 25375 | [config](/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb_20220906-9760eadb.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.log) | -| 8x8x1 | raw | 8 | SlowOnly ResNet101 | Kinetics-400 | 24.82 | 23477 | [config](/configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb_20220906-43f16877.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.log) | -| 4x16x1 | raw | 8 | SlowFast ResNet50 | Kinetics-400 | 24.27 | 18616 | [config](/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-5180ea3c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) | -| 4x16x1 | raw | 8 | SlowFast ResNet50 (with context) | Kinetics-400 | 25.25 | 18616 | [config](/configs/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb_20220906-5bb4f6f2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.log) | -| 8x8x1 | raw | 8 | SlowFast ResNet50 | Kinetics-400 | 25.73 | 13802 | [config](/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb_20220906-39133ec7.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.log) | - -### AVA2.2 - -| frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | gpu_mem(M) | config | ckpt | log | -| :---------------------: | :--------: | :--: | :----------------------------------: | :----------: | :---: | :--------: | :---------------------------------: | :-------------------------------: | :------------------------------: | -| 8x8x1 | raw | 8 | SlowFast ResNet50 | Kinetics-400 | 25.82 | 10484 | [config](/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-d934a48f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.log) | -| 8x8x1 | raw | 8 | SlowFast ResNet50 (temporal-max) | Kinetics-400 | 26.32 | 10484 | [config](/configs/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-13a9078e.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.log) | -| 8x8x1 | raw | 8 | SlowFast ResNet50 (temporal-max, focal loss) | Kinetics-400 | 26.58 | 10484 | [config](/configs/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-dd59e26f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.log) | - -Note: - -1. The **gpus** indicates the number of gpu we used to get the checkpoint. - According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU, - e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu. -2. **With context** indicates that using both RoI feature and global pooled feature for classification, which leads to around 1% mAP improvement in general. - -::: - -For more details on data preparation, you can refer to [AVA Data Preparation](/tools/data/ava/README.md). - -## Train - -You can use the following command to train a model. - -```shell -python tools/train.py ${CONFIG_FILE} [optional arguments] -``` - -Example: train the SlowOnly model on AVA in a deterministic option. - -```shell -python tools/train.py configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \ - --cfg-options randomness.seed=0 randomness.deterministic=True -``` - -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). - -## Test - -You can use the following command to test a model. - -```shell -python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] -``` - -Example: test the SlowOnly model on AVA and dump the result to a pkl file. - -```shell -python tools/test.py configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \ - checkpoints/SOME_CHECKPOINT.pth --dump result.pkl -``` - -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). 
- -## Citation - - - -```BibTeX -@inproceedings{gu2018ava, - title={Ava: A video dataset of spatio-temporally localized atomic visual actions}, - author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others}, - booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, - pages={6047--6056}, - year={2018} -} -``` - -```BibTeX -@article{duan2020omni, - title={Omni-sourced Webly-supervised Learning for Video Recognition}, - author={Duan, Haodong and Zhao, Yue and Xiong, Yuanjun and Liu, Wentao and Lin, Dahua}, - journal={arXiv preprint arXiv:2003.13042}, - year={2020} -} -``` diff --git a/configs/detection/ava/metafile.yml b/configs/detection/ava/metafile.yml deleted file mode 100644 index ec745ad5c4..0000000000 --- a/configs/detection/ava/metafile.yml +++ /dev/null @@ -1,227 +0,0 @@ -Collections: -- Name: AVA - README: configs/detection/ava/README.md - Paper: - URL: https://arxiv.org/abs/1705.08421 - Title: "AVA: A Video Dataset of Spatio-temporally Localized Atomic Visual Actions" - -Models: - - Name: slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb - Config: configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet50 - Batch Size: 16 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.1 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.1 - Task: Action Detection - Metrics: - mAP: 20.76 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-953ef5fe.pth - - - Name: slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb - Config: configs/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet50 - Batch Size: 16 - Epochs: 20 - Pretrained: Kinetics-700 - Resolution: short-side 320 - Training Data: AVA v2.1 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.1 - Task: Action Detection - Metrics: - mAP: 22.77 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-b3b6d44e.pth - - - Name: slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb - Config: configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet50 - Batch Size: 16 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.1 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.1 - Task: Action Detection - Metrics: - mAP: 21.49 - Training Log: 
https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb_20220906-5ae3f91b.pth - - - Name: slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb - Config: configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet50 - Batch Size: 16 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.1 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.1 - Task: Action Detection - Metrics: - mAP: 23.47 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb_20220906-9760eadb.pth - - - Name: slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb - Config: configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet101 - Batch Size: 16 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.1 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.1 - Task: Action Detection - Metrics: - mAP: 24.82 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb_20220906-43f16877.pth - - - Name: slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb - Config: configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet50 - Batch Size: 16 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.1 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.1 - Task: Action Detection - Metrics: - mAP: 24.27 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-5180ea3c.pth - - - Name: slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb - Config: configs/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet50 - Batch Size: 16 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.1 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.1 - 
Task: Action Detection - Metrics: - mAP: 25.25 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb_20220906-5bb4f6f2.pth - - - Name: slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb - Config: configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet50 - Batch Size: 8 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.1 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.1 - Task: Action Detection - Metrics: - mAP: 25.73 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb_20220906-39133ec7.pth - - - Name: slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb - Config: configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet50 - Batch Size: 6 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.2 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.2 - Task: Action Detection - Metrics: - mAP: 25.98 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-d934a48f.pth - - - Name: slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb - Config: configs/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet50 - Batch Size: 6 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.2 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.2 - Task: Action Detection - Metrics: - mAP: 26.38 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-13a9078e.pth - - - Name: slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb - Config: configs/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py - In Collection: AVA - 
Metadata: - Architecture: ResNet50 - Batch Size: 6 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.2 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.2 - Task: Action Detection - Metrics: - mAP: 26.59 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-dd59e26f.pth diff --git a/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py b/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py deleted file mode 100644 index 97e0197a6e..0000000000 --- a/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py +++ /dev/null @@ -1,50 +0,0 @@ -_base_ = ['slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py'] - -model = dict( - backbone=dict( - resample_rate=4, - speed_ratio=4, - slow_pathway=dict(fusion_kernel=7), - pretrained=( - 'https://download.openmmlab.com/mmaction/recognition/slowfast/' - 'slowfast_r50_8x8x1_256e_kinetics400_rgb/' - 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth'))) - -dataset_type = 'AVADataset' -data_root = 'data/ava/rawframes' -anno_root = 'data/ava/annotations' - -ann_file_train = f'{anno_root}/ava_train_v2.1.csv' -exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' -label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' - -proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' 
- 'recall_93.9.pkl') - -train_pipeline = [ - dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), - dict(type='RawFrameDecode'), - dict(type='RandomRescale', scale_range=(256, 320)), - dict(type='RandomCrop', size=256), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW', collapse=True), - dict(type='PackActionInputs') -] - -train_dataloader = dict( - batch_size=8, - num_workers=8, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type=dataset_type, - ann_file=ann_file_train, - exclude_file=exclude_file_train, - pipeline=train_pipeline, - label_file=label_file, - proposal_file=proposal_file_train, - data_prefix=dict(img=data_root))) - -optim_wrapper = dict( - optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00001), - clip_grad=dict(max_norm=40, norm_type=2)) diff --git a/configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py b/configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py deleted file mode 100644 index 815e61c2fc..0000000000 --- a/configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py +++ /dev/null @@ -1,72 +0,0 @@ -_base_ = ['slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py'] - -model = dict( - backbone=dict( - depth=101, - pretrained=( - 'https://download.openmmlab.com/mmaction/recognition/slowonly/' - 'omni/slowonly_r101_without_omni_8x8x1_kinetics400_rgb_' - '20200926-0c730aef.pth'))) - -dataset_type = 'AVADataset' -data_root = 'data/ava/rawframes' -anno_root = 'data/ava/annotations' - -ann_file_train = f'{anno_root}/ava_train_v2.1.csv' -ann_file_val = f'{anno_root}/ava_val_v2.1.csv' - -exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' -exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' - -label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' - -proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' - 'recall_93.9.pkl') -proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' - -train_pipeline = [ - dict(type='SampleAVAFrames', clip_len=8, frame_interval=8), - dict(type='RawFrameDecode'), - dict(type='RandomRescale', scale_range=(256, 320)), - dict(type='RandomCrop', size=256), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW', collapse=True), - dict(type='PackActionInputs') -] -# The testing is w/o. 
any cropping / flipping -val_pipeline = [ - dict(type='SampleAVAFrames', clip_len=8, frame_interval=8, test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='FormatShape', input_format='NCTHW', collapse=True), - dict(type='PackActionInputs') -] - -train_dataloader = dict( - batch_size=16, - num_workers=8, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type=dataset_type, - ann_file=ann_file_train, - exclude_file=exclude_file_train, - pipeline=train_pipeline, - label_file=label_file, - proposal_file=proposal_file_train, - data_prefix=dict(img=data_root))) -val_dataloader = dict( - batch_size=1, - num_workers=8, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - ann_file=ann_file_val, - exclude_file=exclude_file_val, - pipeline=val_pipeline, - label_file=label_file, - proposal_file=proposal_file_val, - data_prefix=dict(img=data_root), - test_mode=True)) -test_dataloader = val_dataloader diff --git a/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py deleted file mode 100644 index 43b0fa1a28..0000000000 --- a/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py +++ /dev/null @@ -1,16 +0,0 @@ -_base_ = ['slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py'] - -model = dict( - backbone=dict( - pretrained=( - 'https://download.openmmlab.com/mmaction/recognition/slowonly/' - 'slowonly_nl_embedded_gaussian_r50_4x16x1_150e_kinetics400_rgb/' - 'slowonly_nl_embedded_gaussian_r50_4x16x1_150e_kinetics400_rgb_' - '20210308-0d6e5a69.pth'), - norm_cfg=dict(type='BN3d', requires_grad=True), - non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)), - non_local_cfg=dict( - sub_sample=True, - use_scale=True, - norm_cfg=dict(type='BN3d', requires_grad=True), - mode='embedded_gaussian'))) diff --git a/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py b/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py deleted file mode 100644 index a962f10c11..0000000000 --- a/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py +++ /dev/null @@ -1,74 +0,0 @@ -_base_ = [ - 'slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py' -] - -model = dict( - backbone=dict( - pretrained=( - 'https://download.openmmlab.com/mmaction/recognition/slowonly/' - 'slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb/' - 'slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb_' - '20210308-e8dd9e82.pth'))) - -dataset_type = 'AVADataset' -data_root = 'data/ava/rawframes' -anno_root = 'data/ava/annotations' - -ann_file_train = f'{anno_root}/ava_train_v2.1.csv' -ann_file_val = f'{anno_root}/ava_val_v2.1.csv' - -exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' -exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' - -label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' - -proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' 
- 'recall_93.9.pkl') -proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' - -train_pipeline = [ - dict(type='SampleAVAFrames', clip_len=8, frame_interval=8), - dict(type='RawFrameDecode'), - dict(type='RandomRescale', scale_range=(256, 320)), - dict(type='RandomCrop', size=256), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW', collapse=True), - dict(type='PackActionInputs') -] -# The testing is w/o. any cropping / flipping -val_pipeline = [ - dict(type='SampleAVAFrames', clip_len=8, frame_interval=8, test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='FormatShape', input_format='NCTHW', collapse=True), - dict(type='PackActionInputs') -] - -train_dataloader = dict( - batch_size=16, - num_workers=8, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type=dataset_type, - ann_file=ann_file_train, - exclude_file=exclude_file_train, - pipeline=train_pipeline, - label_file=label_file, - proposal_file=proposal_file_train, - data_prefix=dict(img=data_root))) -val_dataloader = dict( - batch_size=1, - num_workers=8, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - ann_file=ann_file_val, - exclude_file=exclude_file_val, - pipeline=val_pipeline, - label_file=label_file, - proposal_file=proposal_file_val, - data_prefix=dict(img=data_root), - test_mode=True)) -test_dataloader = val_dataloader diff --git a/configs/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py deleted file mode 100644 index c9e10def96..0000000000 --- a/configs/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = ['slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py'] - -model = dict( - backbone=dict( - pretrained=( - 'https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly' - '/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' - 'kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-' - 'steplr-150e_kinetics700-rgb_20220901-f73b3e89.pth'))) diff --git a/configs/detection/ava_kinetics/README.md b/configs/detection/ava_kinetics/README.md deleted file mode 100644 index 59ec345c43..0000000000 --- a/configs/detection/ava_kinetics/README.md +++ /dev/null @@ -1,103 +0,0 @@ -# AVA - -[The AVA-Kinetics Localized Human Actions Video Dataset](https://arxiv.org/abs/2005.00214) - - - -
- -
- -## Abstract - - - -This paper describes the AVA-Kinetics localized human actions video dataset. The dataset is collected by annotating videos from the Kinetics-700 dataset using the AVA annotation protocol, and extending the original AVA dataset with these new AVA annotated Kinetics clips. The dataset contains over 230k clips annotated with the 80 AVA action classes for each of the humans in key-frames. We describe the annotation process and provide statistics about the new dataset. We also include a baseline evaluation using the Video Action Transformer Network on the AVA-Kinetics dataset, demonstrating improved performance for action classification on the AVA test set. - -```BibTeX -@article{li2020ava, - title={The ava-kinetics localized human actions video dataset}, - author={Li, Ang and Thotakuri, Meghana and Ross, David A and Carreira, Jo{\~a}o and Vostrikov, Alexander and Zisserman, Andrew}, - journal={arXiv preprint arXiv:2005.00214}, - year={2020} -} -``` - -## Results and Models - -### AVA2.2 - -Currently, we only use the training set of AVA-Kinetics and evaluate on the AVA2.2 validation dataset. The AVA-Kinetics validation dataset will be supported soon. - -| frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | config | ckpt | log | -| :---------------------: | :--------: | :--: | :---------------: | :----------: | :---: | :------------------------------------------: | :-----------------------------------------: | :----------------------------------------: | -| 4x16x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-400 | 24.53 | [config](/configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb_20221205-33e3ca7c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.log) | -| 4x16x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 25.87 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb_20221205-a07e8c15.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.log) | -| 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-400 | 26.10 | [config](/configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-8f8dff3b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | -| 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 27.82 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-16a01c37.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | - -### Training with tricks - -We conduct ablation studies to show the improvements of training tricks using SlowOnly8x8 pretrained on the Kinetics700 dataset. The baseline is the last raw in [AVA2.2](https://github.com/hukkai/mmaction2/tree/ava-kinetics-exp/configs/detection/ava_kinetics#ava22). - -| method | frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | config | ckpt | log | -| :--------------------: | :---------------------: | :--------: | :--: | :---------------: | :----------: | :---: | :-----------------------------------: | :---------------------------------: | :---------------------------------: | -| baseline | 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 27.82 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-16a01c37.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | -| + context | 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 28.31 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-5d514f8c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | -| + temporal max pooling | 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 28.48 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-5b5e71eb.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | -| + nonlinear head | 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 29.83 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-87624265.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | -| + focal loss | 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 30.33 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb_20221205-37aa8395.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.log) | -| + more frames | 16x4x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 31.29 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb_20221205-dd652f81.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.log) | - -Note: - -The **gpus** indicates the number of gpu we used to get the checkpoint; **+ context** indicates that using both RoI feature and global pooled feature for classification; **+ temporal max pooling** indicates that using max pooling in the temporal dimension for the feature; **nonlinear head** indicates that using a 2-layer mlp instead of a linear classifier. - -For more details on data preparation, you can refer to [AVA-Kinetics Data Preparation](/tools/data/ava_kinetics/README.md). - -## Train - -You can use the following command to train a model. - -```shell -python tools/train.py ${CONFIG_FILE} [optional arguments] -``` - -Example: train the SlowOnly model on AVA-Kinetics in a deterministic option. - -```shell -python tools/train.py configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py \ - --cfg-options randomness.seed=0 randomness.deterministic=True -``` - -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). - -## Test - -You can use the following command to test a model. - -```shell -python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] -``` - -Example: test the SlowOnly model on AVA-Kinetics and dump the result to a pkl file. - -```shell -python tools/test.py configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py \ - checkpoints/SOME_CHECKPOINT.pth --dump result.pkl -``` - -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). 
- -## Citation - - - -```BibTeX -@article{li2020ava, - title={The ava-kinetics localized human actions video dataset}, - author={Li, Ang and Thotakuri, Meghana and Ross, David A and Carreira, Jo{\~a}o and Vostrikov, Alexander and Zisserman, Andrew}, - journal={arXiv preprint arXiv:2005.00214}, - year={2020} -} -``` diff --git a/configs/detection/slowfast/README.md b/configs/detection/slowfast/README.md new file mode 100644 index 0000000000..bae71fd040 --- /dev/null +++ b/configs/detection/slowfast/README.md @@ -0,0 +1,96 @@ +# SlowFast + +[Slowfast networks for video recognition](https://openaccess.thecvf.com/content_ICCV_2019/html/Feichtenhofer_SlowFast_Networks_for_Video_Recognition_ICCV_2019_paper.html) + + + +## Abstract + + + +We present SlowFast networks for video recognition. Our model involves (i) a Slow pathway, operating at low frame rate, to capture spatial semantics, and (ii) a Fast pathway, operating at high frame rate, to capture motion at fine temporal resolution. The Fast pathway can be made very lightweight by reducing its channel capacity, yet can learn useful temporal information for video recognition. Our models achieve strong performance for both action classification and detection in video, and large improvements are pin-pointed as contributions by our SlowFast concept. We report state-of-the-art accuracy on major video recognition benchmarks, Kinetics, Charades and AVA. + + + +
+ +
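As a rough aside (not part of this patch), the slow/fast split described in the abstract is what the `resample_rate`, `speed_ratio` and `channel_ratio` fields in the SlowFast configs below encode. A minimal sketch of the arithmetic, assuming the usual interpretation of these fields in `mmaction.ResNet3dSlowFast` and using the values from the 8x8x1 AVA config in this patch:

```python
# Sketch only: how the SlowFast sampling parameters are commonly interpreted.
# Values mirror the 8x8x1 AVA config in this patch; the authoritative frame
# striding lives inside mmaction.ResNet3dSlowFast, so treat this as an
# illustration rather than the library's exact behaviour.
clip_len = 32        # frames sampled by SampleAVAFrames (clip_len=32)
resample_rate = 4    # slow pathway keeps 1 of every 4 frames
speed_ratio = 4      # fast pathway runs 4x faster than the slow pathway
channel_ratio = 8    # fast pathway uses 1/8 of the slow pathway's channels

slow_frames = clip_len // resample_rate                   # 8 frames
fast_frames = clip_len // (resample_rate // speed_ratio)  # 32 frames
print(slow_frames, fast_frames)  # -> 8 32
```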
+ +## Results and Models + +### AVA2.1 + +| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log | +| :---------------------: | :--: | :------------------------------: | :----------: | :---: | :-----------------------------------------: | :---------------------------------------: | :--------------------------------------: | +| 4x16x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 24.32 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-5180ea3c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) | +| 4x16x1 | 8 | SlowFast ResNet50 (with context) | Kinetics-400 | 25.34 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb_20220906-5bb4f6f2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.log) | +| 8x8x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 25.80 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb_20220906-39133ec7.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.log) | + +### AVA2.2 + +| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log | +| :---------------------: | :--: | :---------------------------------------: | :----------: | :---: | :--------------------------------------: | :------------------------------------: | :-----------------------------------: | +| 8x8x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 25.90 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-d934a48f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.log) | +| 8x8x1 | 8 | SlowFast ResNet50 (temporal-max) | Kinetics-400 | 26.41 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-13a9078e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.log) | +| 8x8x1 | 8 | SlowFast ResNet50 (temporal-max, focal loss) | Kinetics-400 | 26.65 | [config](/configs/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-dd59e26f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.log) | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. +2. **with context** indicates that using both RoI feature and global pooled feature for classification; **temporal-max** indicates that using max pooling in the temporal dimension for the feature. + +For more details on data preparation, you can refer to [AVA](/tools/data/ava/README.md). + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train the SlowFast model on AVA2.1 in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \ + --seed 0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test the SlowFast model on AVA2.1 and dump the result to a pkl file. + +```shell +python tools/test.py configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). 
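If you want to sanity-check the dumped predictions from the test command above, the pickle can be loaded directly with MMEngine. A minimal sketch, assuming the dump is a list with one prediction entry per sample (the exact item structure depends on the MMAction2 version):

```python
# Sketch: peek at the file produced by `--dump result.pkl` above.
# Assumes a list-like dump with one entry per test sample; the fields of each
# entry are version-dependent, so inspect before relying on them.
import mmengine

results = mmengine.load('result.pkl')
print(type(results), len(results))  # e.g. <class 'list'> and the number of samples
print(results[0])                   # inspect the first prediction entry
```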
+ +## Citation + +```BibTeX +@inproceedings{feichtenhofer2019slowfast, + title={Slowfast networks for video recognition}, + author={Feichtenhofer, Christoph and Fan, Haoqi and Malik, Jitendra and He, Kaiming}, + booktitle={ICCV}, + pages={6202--6211}, + year={2019} +} +``` + +```BibTeX +@inproceedings{gu2018ava, + title={Ava: A video dataset of spatio-temporally localized atomic visual actions}, + author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others}, + booktitle={CVPR}, + pages={6047--6056}, + year={2018} +} +``` diff --git a/configs/detection/slowfast/metafile.yml b/configs/detection/slowfast/metafile.yml new file mode 100644 index 0000000000..2ab6c44a45 --- /dev/null +++ b/configs/detection/slowfast/metafile.yml @@ -0,0 +1,121 @@ +Collections: + - Name: SlowFast + README: configs/detection/slowfast/README.md + Paper: + URL: https://arxiv.org/abs/1812.03982 + Title: 'SlowFast Networks for Video Recognition' + +Models: + - Name: slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb + Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 24.32 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-5180ea3c.pth + + - Name: slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb + Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 25.34 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb_20220906-5bb4f6f2.pth + + - Name: slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb + Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 25.80 + Training Log: 
https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb_20220906-39133ec7.pth + + - Name: slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb + Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 6 + Epochs: 10 + Pretrained: Kinetics-400 + Training Data: AVA v2.2 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.2 + Task: Action Detection + Metrics: + mAP: 25.90 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-d934a48f.pth + + - Name: slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb + Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 6 + Epochs: 10 + Pretrained: Kinetics-400 + Training Data: AVA v2.2 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.2 + Task: Action Detection + Metrics: + mAP: 26.41 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-13a9078e.pth + + - Name: slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb + Config: configs/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 6 + Epochs: 10 + Pretrained: Kinetics-400 + Training Data: AVA v2.2 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.2 + Task: Action Detection + Metrics: + mAP: 26.65 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-dd59e26f.pth diff --git a/configs/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py 
similarity index 100% rename from configs/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py rename to configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py diff --git a/configs/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py similarity index 100% rename from configs/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py rename to configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py diff --git a/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py similarity index 52% rename from configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py rename to configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py index 8b5550aec0..0eb0e501e3 100644 --- a/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py +++ b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py @@ -1,14 +1,16 @@ -_base_ = ['slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py'] +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/' + 'slowfast_r50_4x16x1_256e_kinetics400_rgb/' + 'slowfast_r50_4x16x1_256e_kinetics400_rgb_20200704-bcde7ed7.pth') model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), backbone=dict( - _delete_=True, - type='ResNet3dSlowFast', - _scope_='mmaction', - pretrained=( - 'https://download.openmmlab.com/mmaction/recognition/slowfast/' - 'slowfast_r50_4x16x1_256e_kinetics400_rgb/' - 'slowfast_r50_4x16x1_256e_kinetics400_rgb_20200704-bcde7ed7.pth'), + type='mmaction.ResNet3dSlowFast', + pretrained=None, resample_rate=8, speed_ratio=8, channel_ratio=8, @@ -33,7 +35,39 @@ conv1_stride_t=1, pool1_stride_t=1, spatial_strides=(1, 2, 2, 1))), - roi_head=dict(bbox_head=dict(in_channels=2304))) + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2304, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) dataset_type = 'AVADataset' data_root = 'data/ava/rawframes' @@ -51,9 +85,10 @@ 'recall_93.9.pkl') proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' +file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='RandomRescale', scale_range=(256, 320)), dict(type='RandomCrop', size=256), 
dict(type='Flip', flip_ratio=0.5), @@ -65,7 +100,7 @@ val_pipeline = [ dict( type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='Resize', scale=(-1, 256)), dict(type='FormatShape', input_format='NCTHW', collapse=True), dict(type='PackActionInputs') @@ -99,3 +134,36 @@ data_prefix=dict(img=data_root), test_mode=True)) test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py similarity index 52% rename from configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py rename to configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py index a7f4c09ed1..debeb5c7fd 100644 --- a/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py +++ b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py @@ -1,11 +1,74 @@ -_base_ = ['slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py'] +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth') model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), backbone=dict( - pretrained=( - 'https://download.openmmlab.com/mmaction/recognition/slowfast/' - 'slowfast_r50_8x8x1_256e_kinetics400_rgb/' - 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth'))) + type='mmaction.ResNet3dSlowFast', + resample_rate=4, + speed_ratio=4, + channel_ratio=8, + pretrained=None, + slow_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=True, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + spatial_strides=(1, 2, 2, 1), + fusion_kernel=7), + fast_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1))), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2304, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + 
data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) dataset_type = 'AVADataset' data_root = 'data/ava/rawframes' @@ -23,9 +86,10 @@ 'recall_93.9.pkl') proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' +file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='RandomRescale', scale_range=(256, 320)), dict(type='RandomCrop', size=256), dict(type='Flip', flip_ratio=0.5), @@ -36,7 +100,7 @@ val_pipeline = [ dict( type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='Resize', scale=(-1, 256)), dict(type='FormatShape', input_format='NCTHW', collapse=True), dict(type='PackActionInputs') @@ -80,6 +144,8 @@ train_cfg = dict( type='EpochBasedTrainLoop', max_epochs=10, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') param_scheduler = [ dict( @@ -102,3 +168,9 @@ optim_wrapper = dict( optimizer=dict(type='SGD', lr=0.075, momentum=0.9, weight_decay=0.00001), clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (6 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=48) diff --git a/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py new file mode 100644 index 0000000000..1e94a10960 --- /dev/null +++ b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py @@ -0,0 +1,171 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowFast', + resample_rate=4, + speed_ratio=4, + channel_ratio=8, + pretrained=None, + slow_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=True, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + spatial_strides=(1, 2, 2, 1), + fusion_kernel=7), + fast_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1))), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2304, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.1.csv' +ann_file_val = f'{anno_root}/ava_val_v2.1.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) + +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py b/configs/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py similarity index 100% rename from configs/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py rename to configs/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py diff --git a/configs/detection/slowonly/README.md b/configs/detection/slowonly/README.md new file mode 100644 index 0000000000..e8af3d84ea --- /dev/null +++ b/configs/detection/slowonly/README.md @@ -0,0 +1,126 @@ +# SlowOnly + +[Slowfast networks for video recognition](https://openaccess.thecvf.com/content_ICCV_2019/html/Feichtenhofer_SlowFast_Networks_for_Video_Recognition_ICCV_2019_paper.html) + + + +## Abstract + + + +We present SlowFast networks for video recognition. Our model involves (i) a Slow pathway, operating at low frame rate, to capture spatial semantics, and (ii) a Fast pathway, operating at high frame rate, to capture motion at fine temporal resolution. The Fast pathway can be made very lightweight by reducing its channel capacity, yet can learn useful temporal information for video recognition. Our models achieve strong performance for both action classification and detection in video, and large improvements are pin-pointed as contributions by our SlowFast concept. 
We report state-of-the-art accuracy on major video recognition benchmarks, Kinetics, Charades and AVA. + + + +
+ +
+ +## Results and Models + +### AVA2.1 + +| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log | +| :---------------------: | :--: | :------------------------------------: | :----------: | :---: | :---------------------------------------: | :-------------------------------------: | :------------------------------------: | +| 4x16x1 | 8 | SlowOnly ResNet50 | Kinetics-400 | 20.72 | [config](/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-953ef5fe.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) | +| 4x16x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 22.77 | [config](/configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-b3b6d44e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) | +| 4x16x1 | 8 | SlowOnly ResNet50 (NonLocalEmbedGauss) | Kinetics-400 | 21.55 | [config](/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb_20220906-5ae3f91b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.log) | +| 8x8x1 | 8 | SlowOnly ResNet50 (NonLocalEmbedGauss) | Kinetics-400 | 23.77 | [config](/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb_20220906-9760eadb.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.log) | +| 8x8x1 | 8 | SlowOnly ResNet101 | Kinetics-400 | 24.83 | [config](/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb_20220906-43f16877.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.log) | + +### AVA2.2 (Trained on AVA-Kinetics) + +Currently, we only use the training set of AVA-Kinetics and evaluate on the AVA2.2 validation dataset. 
The AVA-Kinetics validation dataset will be supported soon. + +| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log | +| :---------------------: | :--: | :---------------: | :----------: | :---: | :----------------------------------------------: | :--------------------------------------------: | :-------------------------------------------: | +| 4x16x1 | 8 | SlowOnly ResNet50 | Kinetics-400 | 24.53 | [config](/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb_20221205-33e3ca7c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.log) | +| 4x16x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 25.87 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb_20221205-a07e8c15.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.log) | +| 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-400 | 26.10 | [config](/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-8f8dff3b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | +| 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 27.82 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-16a01c37.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | + +### AVA2.2 (Trained on AVA-Kinetics with tricks) + +We conduct ablation studies to show the improvements of training tricks using SlowOnly8x8 pretrained on the Kinetics700 dataset. The baseline is the last row in **AVA2.2 (Trained on AVA-Kinetics)**. 
+ +| method | frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log | +| :--------------------: | :---------------------: | :--: | :---------------: | :----------: | :---: | :--------------------------------------: | :-------------------------------------: | :------------------------------------: | +| baseline | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 27.82 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-16a01c37.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | +| + context | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 28.31 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-5d514f8c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | +| + temporal max pooling | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 28.48 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-5b5e71eb.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | +| + nonlinear head | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 29.83 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-87624265.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | +| + focal loss | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 30.33 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb_20221205-37aa8395.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.log) | +| + more frames | 16x4x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 31.29 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb_20221205-dd652f81.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.log) | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. +2. **+ context** indicates that using both RoI feature and global pooled feature for classification; **+ temporal max pooling** indicates that using max pooling in the temporal dimension for the feature; **nonlinear head** indicates that using a 2-layer mlp instead of a linear classifier. + +For more details on data preparation, you can refer to + +- [AVA](/tools/data/ava/README.md) +- [AVA-Kinetics](/tools/data/ava_kinetics/README.md) + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train the SlowOnly model on AVA2.1 in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \ + --seed 0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test the SlowOnly model on AVA2.1 and dump the result to a pkl file. + +```shell +python tools/test.py configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). 
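As a side note on the tricks listed above, the corresponding config-level switches appear verbatim in the `slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks` override further down in this patch. The fragment below is only an illustrative mapping of trick to field, not a standalone config:

```python
# Illustrative mapping (sketch, not a usable config on its own) between the
# training tricks in the table above and the model overrides used by the
# tricks config later in this patch.
model = dict(
    roi_head=dict(
        bbox_roi_extractor=dict(
            with_global=True,           # "+ context": also use the global pooled feature
            temporal_pool_mode='max'),  # "+ temporal max pooling"
        bbox_head=dict(
            in_channels=4096,           # 2048 RoI + 2048 global features (assumed split)
            mlp_head=True,              # "+ nonlinear head": 2-layer MLP classifier
            focal_gamma=1.0)))          # "+ focal loss"
```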
+ +## Citation + +```BibTeX +@inproceedings{feichtenhofer2019slowfast, + title={Slowfast networks for video recognition}, + author={Feichtenhofer, Christoph and Fan, Haoqi and Malik, Jitendra and He, Kaiming}, + booktitle={ICCV}, + pages={6202--6211}, + year={2019} +} +``` + +```BibTeX +@inproceedings{gu2018ava, + title={Ava: A video dataset of spatio-temporally localized atomic visual actions}, + author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others}, + booktitle={CVPR}, + pages={6047--6056}, + year={2018} +} +``` + +```BibTeX +@article{li2020ava, + title={The ava-kinetics localized human actions video dataset}, + author={Li, Ang and Thotakuri, Meghana and Ross, David A and Carreira, Jo{\~a}o and Vostrikov, Alexander and Zisserman, Andrew}, + journal={arXiv preprint arXiv:2005.00214}, + year={2020} +} +``` diff --git a/configs/detection/slowonly/metafile.yml b/configs/detection/slowonly/metafile.yml new file mode 100644 index 0000000000..11ca749351 --- /dev/null +++ b/configs/detection/slowonly/metafile.yml @@ -0,0 +1,102 @@ +Collections: + - Name: SlowOnly + README: configs/detection/slowonly/README.md + Paper: + URL: https://arxiv.org/abs/1812.03982 + Title: 'SlowFast Networks for Video Recognition' + +Models: + - Name: slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb + Config: configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 20.72 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-953ef5fe.pth + + - Name: slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb + Config: configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-700 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 22.77 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-b3b6d44e.pth + + - Name: slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb + Config: configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - 
Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 21.55 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb_20220906-5ae3f91b.pth + + - Name: slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb + Config: configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 23.77 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb_20220906-9760eadb.pth + + - Name: slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb + Config: configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet101 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 24.83 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb_20220906-43f16877.pth diff --git a/configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py similarity index 65% rename from configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py rename to configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py index 7407ec6978..fd44f336ac 100644 --- a/configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py +++ b/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py @@ -1,6 +1,58 @@ -_base_ = [ - '../../_base_/default_runtime.py', '../_base_/models/slowonly_r50.py' -] +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-' + 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' + 'kinetics400-rgb_20220901-e7b65fad.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + 
lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) dataset_type = 'AVAKineticsDataset' data_root = 'data/ava_kinetics/rawframes' @@ -18,14 +70,7 @@ 'recall_93.9.pkl') proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict({ -# 'data/ava_kinetics/rawframes/': -# 's3://openmmlab/datasets/action/ava/rawframes/' -# })) file_client_args = dict(io_backend='disk') - train_pipeline = [ dict(type='SampleAVAFrames', clip_len=4, frame_interval=16), dict(type='RawFrameDecode', **file_client_args), diff --git a/configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py similarity index 70% rename from configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py rename to configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py index eb393d3a8c..4af750e8ad 100644 --- a/configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py +++ b/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py @@ -1,13 +1,58 @@ -_base_ = [ - '../../_base_/default_runtime.py', '../_base_/models/slowonly_r50.py' -] +_base_ = '../../_base_/default_runtime.py' url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' 'slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics400-' 'rgb/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_' 'kinetics400-rgb_20220901-df42dc84.pth') -model = dict(init_cfg=dict(type='Pretrained', checkpoint=url)) +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + 
pos_weight=1.0)), + test_cfg=dict(rcnn=None)) dataset_type = 'AVAKineticsDataset' data_root = 'data/ava_kinetics/rawframes' @@ -25,14 +70,7 @@ 'recall_93.9.pkl') proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict({ -# 'data/ava_kinetics/rawframes/': -# 's3://openmmlab/datasets/action/ava/rawframes/' -# })) file_client_args = dict(io_backend='disk') - train_pipeline = [ dict(type='SampleAVAFrames', clip_len=8, frame_interval=8), dict(type='RawFrameDecode', **file_client_args), diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py similarity index 100% rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py rename to configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py similarity index 100% rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py rename to configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py similarity index 100% rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py rename to configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py similarity index 100% rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py rename to configs/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py similarity index 85% rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py rename to configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py index 4d4a3dea6b..a757f731a4 100644 --- a/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py +++ b/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py @@ -1,14 +1,6 @@ -_base_ = [ - '../../_base_/default_runtime.py', '../_base_/models/slowonly_r50.py' -] - -url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' - 'slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-' - 
'rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_' - 'kinetics700-rgb_20221013-15b93b10.pth') +_base_ = ['slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py'] model = dict( - init_cfg=dict(type='Pretrained', checkpoint=url), roi_head=dict( bbox_roi_extractor=dict(with_global=True, temporal_pool_mode='max'), bbox_head=dict(in_channels=4096, mlp_head=True, focal_gamma=1.0))) @@ -29,14 +21,7 @@ 'recall_93.9.pkl') proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict({ -# 'data/ava_kinetics/rawframes/': -# 's3://openmmlab/datasets/action/ava/rawframes/' -# })) file_client_args = dict(io_backend='disk') - train_pipeline = [ dict(type='SampleAVAFrames', clip_len=16, frame_interval=4), dict(type='RawFrameDecode', **file_client_args), diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py similarity index 100% rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py rename to configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py similarity index 100% rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py rename to configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py diff --git a/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py new file mode 100644 index 0000000000..9bee13a25c --- /dev/null +++ b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py @@ -0,0 +1,151 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowonly/' + 'omni/slowonly_r101_without_omni_8x8x1_kinetics400_rgb_' + '20200926-0c730aef.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=101, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.1.csv' +ann_file_val = f'{anno_root}/ava_val_v2.1.csv' + 
+exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=8, frame_interval=8), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. any cropping / flipping +val_pipeline = [ + dict(type='SampleAVAFrames', clip_len=8, frame_interval=8, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py new file mode 100644 index 0000000000..cdc8ea8d98 --- /dev/null +++ b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py @@ -0,0 +1,160 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowonly/' + 'slowonly_nl_embedded_gaussian_r50_4x16x1_150e_kinetics400_rgb/' + 'slowonly_nl_embedded_gaussian_r50_4x16x1_150e_kinetics400_rgb_' + '20210308-0d6e5a69.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1), + norm_cfg=dict(type='BN3d', requires_grad=True), + non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)), + non_local_cfg=dict( + sub_sample=True, + use_scale=True, + norm_cfg=dict(type='BN3d', requires_grad=True), + mode='embedded_gaussian')), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.1.csv' +ann_file_val = f'{anno_root}/ava_val_v2.1.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=4, frame_interval=16), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py new file mode 100644 index 0000000000..9b6dd00fdb --- /dev/null +++ b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py @@ -0,0 +1,159 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowonly/' + 'slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb/' + 'slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb_' + '20210308-e8dd9e82.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1), + norm_cfg=dict(type='BN3d', requires_grad=True), + non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)), + non_local_cfg=dict( + sub_sample=True, + use_scale=True, + norm_cfg=dict(type='BN3d', requires_grad=True), + mode='embedded_gaussian')), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.1.csv' +ann_file_val = f'{anno_root}/ava_val_v2.1.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=8, frame_interval=8), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict(type='SampleAVAFrames', clip_len=8, frame_interval=8, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py similarity index 56% rename from configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py rename to configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py index ec107941b3..a83408c84a 100644 --- a/configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py +++ b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py @@ -1,6 +1,58 @@ -_base_ = [ - '../../_base_/default_runtime.py', '../_base_/models/slowonly_r50.py' -] +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-' + 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' + 'kinetics400-rgb_20220901-e7b65fad.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) dataset_type = 'AVADataset' data_root = 'data/ava/rawframes' @@ -18,9 +70,10 @@ 'recall_93.9.pkl') proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' +file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='SampleAVAFrames', clip_len=4, frame_interval=16), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='RandomRescale', scale_range=(256, 320)), dict(type='RandomCrop', size=256), dict(type='Flip', flip_ratio=0.5), @@ -31,7 +84,7 @@ val_pipeline = [ dict( type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='Resize', scale=(-1, 256)), dict(type='FormatShape', input_format='NCTHW', collapse=True), dict(type='PackActionInputs') @@ -92,3 +145,9 @@ optim_wrapper = dict( optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001), clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py new file mode 100644 index 0000000000..a68893a015 --- /dev/null +++ b/configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py @@ -0,0 +1,153 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly' + '/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' + 'kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-' + 'steplr-150e_kinetics700-rgb_20220901-f73b3e89.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.1.csv' +ann_file_val = f'{anno_root}/ava_val_v2.1.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=4, frame_interval=16), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/mmaction/models/backbones/resnet3d_slowonly.py b/mmaction/models/backbones/resnet3d_slowonly.py index 3a2a3a3ac0..5c1c71c4c2 100644 --- a/mmaction/models/backbones/resnet3d_slowonly.py +++ b/mmaction/models/backbones/resnet3d_slowonly.py @@ -4,12 +4,6 @@ from mmaction.registry import MODELS from .resnet3d_slowfast import ResNet3dPathway -try: - from mmdet.registry import MODELS as MMDET_MODELS - mmdet_imported = True -except (ImportError, ModuleNotFoundError): - mmdet_imported = False - @MODELS.register_module() class ResNet3dSlowOnly(ResNet3dPathway): @@ -43,7 +37,3 @@ def __init__(self, **kwargs) assert not self.lateral - - -if mmdet_imported: - MMDET_MODELS.register_module()(ResNet3dSlowOnly) diff --git a/model-index.yml b/model-index.yml index a41addf98d..ebf462e3f9 100644 --- a/model-index.yml +++ b/model-index.yml @@ -15,7 +15,8 @@ Import: - configs/recognition/trn/metafile.yml - configs/recognition/swin/metafile.yml - configs/recognition/c2d/metafile.yml -- configs/detection/ava/metafile.yml +- configs/detection/slowfast/metafile.yml +- configs/detection/slowonly/metafile.yml - configs/detection/acrn/metafile.yml - configs/skeleton/stgcn/metafile.yml - configs/skeleton/2s-agcn/metafile.yml From 8b9313a06f727672cd1c5e3d67de90a277b5019e Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 30 Mar 2023 10:53:37 +0800 Subject: [PATCH 18/36] [Doc]: Add more social networking links (#2321) --- README.md | 8 +++++++- README_zh-CN.md | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f3a575f4ce..d08d49d2c3 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@
- + @@ -46,6 +46,12 @@ + + + + + +
English | [简体中文](/README_zh-CN.md) diff --git a/README_zh-CN.md b/README_zh-CN.md index 5d0d091cd1..c2ffb09702 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -36,7 +36,7 @@
- + @@ -46,6 +46,12 @@ + + + + + +
[English](/README.md) | 简体中文 From f30e8a45c3819037d617722742e6b14145372e70 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 30 Mar 2023 10:56:38 +0800 Subject: [PATCH 19/36] [Fix] Fix accepting an unexpected argument local-rank in PyTorch 2.0 (#2320) --- tools/misc/clip_feature_extraction.py | 5 ++++- tools/test.py | 2 +- tools/train.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/misc/clip_feature_extraction.py b/tools/misc/clip_feature_extraction.py index 1829bf9b5c..a7a3e67635 100644 --- a/tools/misc/clip_feature_extraction.py +++ b/tools/misc/clip_feature_extraction.py @@ -59,7 +59,10 @@ def parse_args(): choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') - parser.add_argument('--local_rank', type=int, default=0) + # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` + # will pass the `--local-rank` parameter to `tools/train.py` instead + # of `--local_rank`. + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) diff --git a/tools/test.py b/tools/test.py index 0d0d4bd20f..4f310fa9e0 100644 --- a/tools/test.py +++ b/tools/test.py @@ -51,7 +51,7 @@ def parse_args(): choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') - parser.add_argument('--local_rank', type=int, default=0) + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) diff --git a/tools/train.py b/tools/train.py index 2c51c50709..e43078ddb8 100644 --- a/tools/train.py +++ b/tools/train.py @@ -56,7 +56,7 @@ def parse_args(): choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') - parser.add_argument('--local_rank', type=int, default=0) + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) From 264836c8bcfcc0010aa98044062a6ffae9645449 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Fri, 31 Mar 2023 10:55:42 +0800 Subject: [PATCH 20/36] [doc] add opendatalab kinetics link (#2292) --- tools/data/kinetics/README.md | 13 ++++++++++++- tools/data/kinetics/README_zh-CN.md | 16 ++++++++++++++-- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/tools/data/kinetics/README.md b/tools/data/kinetics/README.md index 4fc7b6bb1e..0df8f8634f 100644 --- a/tools/data/kinetics/README.md +++ b/tools/data/kinetics/README.md @@ -24,6 +24,8 @@ Because of the expirations of some YouTube links, the sizes of kinetics dataset | Dataset | training videos | validation videos | | :---------: | :-------------: | :---------------: | | kinetics400 | 240436 | 19796 | +| Kinetics600 | 383393 | 27910 | +| Kinetics700 | 542357 | 34824 | ::: @@ -46,7 +48,16 @@ bash download_backup_annotations.sh ${DATASET} ## Step 2. Prepare Videos -Then, you can run the following script to prepare videos. +### Option 1: Download from OpenDataLab + +**Recommend**: [OpenDataLab](https://opendatalab.com/) provides the Kinetics dataset ([Kinetics400](https://opendatalab.com/Kinetics-400), [Kinetics600](https://opendatalab.com/Kinetics600), [Kinetics700](https://opendatalab.com/Kinetics_700)), users can download Kinetics dataset with short edge 320 pixels from here. 
+ +:::{note} +All experiments on Kinetics in MMAction2 are based on this version, we recommend users to try this version. + +### Option 2: Download from Other Source + +you can run the following script to prepare videos. The codes are adapted from the [official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). Note that this might take a long time. ```shell diff --git a/tools/data/kinetics/README_zh-CN.md b/tools/data/kinetics/README_zh-CN.md index e307b9e7f5..86cb65239e 100644 --- a/tools/data/kinetics/README_zh-CN.md +++ b/tools/data/kinetics/README_zh-CN.md @@ -18,11 +18,14 @@ 请参照 [官方网站](https://deepmind.com/research/open-source/open-source-datasets/kinetics/) 以获取数据集基本信息。此脚本用于准备数据集 kinetics400,kinetics600,kinetics700。为准备 kinetics 数据集的不同版本,用户需将脚本中的 `${DATASET}` 赋值为数据集对应版本名称,可选项为 `kinetics400`,`kinetics600`, `kinetics700`。 在开始之前,用户需确保当前目录为 `$MMACTION2/tools/data/${DATASET}/`。 -**注**:由于部分 YouTube 链接失效,爬取的 Kinetics 数据集大小可能与原版不同。以下是我们所使用 Kinetics 数据集的大小: +:::{note} +由于部分 YouTube 链接失效,爬取的 Kinetics 数据集大小可能与原版不同。以下是我们所使用 Kinetics 数据集的大小: | 数据集 | 训练视频 | 验证集视频 | | :---------: | :------: | :--------: | -| kinetics400 | 240436 | 19796 | +| Kinetics400 | 240436 | 19796 | +| Kinetics600 | 383393 | 27910 | +| Kinetics700 | 542357 | 34824 | ## 1. 准备标注文件 @@ -42,6 +45,15 @@ bash download_backup_annotations.sh ${DATASET} ## 2. 准备视频 +### 选项 1: 从 OpenDataLab 下载 + +**推荐**:[OpenDataLab](https://opendatalab.com/) 提供了 Kinetics 数据集 ([Kinetics400](https://opendatalab.com/Kinetics-400), [Kinetics600](https://opendatalab.com/Kinetics600), [Kinetics700](https://opendatalab.com/Kinetics_700)), 用户可以从这里下载短边长度为 320 的 Kinetics 数据集。 + +:::{note} +MMAction2 代码仓库中提供的 Kinetics 实验性能,都是基于这个版本的数据得到的。我们建议用户使用这个版本的 Kinetics 数据集进行实验。 + +### 选项 2:从其他数据源下载 + 用户可以使用以下脚本准备视频,视频准备代码修改自 [官方爬虫](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics)。注意这一步骤将花费较长时间。 ```shell From ebf4d012be4fb413af03b90400827ba839cd1c0d Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Tue, 4 Apr 2023 14:48:52 +0800 Subject: [PATCH 21/36] [Fix] fix mobilenetv2_tsm (#2332) --- .circleci/test.yml | 1 - ...lenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py | 125 ++++++++++++++++++ mmaction/models/backbones/mobilenet_v2_tsm.py | 8 +- mmaction/models/backbones/resnet_tsm.py | 1 + tests/models/recognizers/test_recognizer2d.py | 9 ++ 5 files changed, 141 insertions(+), 3 deletions(-) create mode 100644 configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py diff --git a/.circleci/test.yml b/.circleci/test.yml index 3984767a12..aafba494dd 100644 --- a/.circleci/test.yml +++ b/.circleci/test.yml @@ -44,7 +44,6 @@ jobs: - run: name: Install Libraries command: | - sudo add-apt-repository ppa:savoury1/ffmpeg4 sudo apt-get update sudo apt-get upgrade sudo apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libturbojpeg pkg-config diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py new file mode 100644 index 0000000000..32c276647f --- /dev/null +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py @@ -0,0 +1,125 @@ +_base_ = [ + '../../_base_/models/tsm_mobilenet_v2.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' 
+data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='TenCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3)) + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=50, + by_epoch=True, + milestones=[25, 45], + gamma=0.1) +] + +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=True), + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=20, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/mmaction/models/backbones/mobilenet_v2_tsm.py b/mmaction/models/backbones/mobilenet_v2_tsm.py index db2999a8b3..2df95ab47c 100644 --- a/mmaction/models/backbones/mobilenet_v2_tsm.py +++ b/mmaction/models/backbones/mobilenet_v2_tsm.py @@ -21,6 +21,8 @@ def __init__(self, num_segments=8, is_shift=True, shift_div=8, **kwargs): self.num_segments = num_segments self.is_shift = is_shift self.shift_div = shift_div + super().init_weights() + self.init_structure() def make_temporal_shift(self): """Make temporal shift for some layers.""" @@ -33,9 +35,11 @@ def make_temporal_shift(self): shift_div=self.shift_div, ) - def init_weights(self): + def init_structure(self): """Initiate the parameters either from existing checkpoint or from scratch.""" - super().init_weights() if self.is_shift: self.make_temporal_shift() + + def init_weights(self): + pass diff --git a/mmaction/models/backbones/resnet_tsm.py b/mmaction/models/backbones/resnet_tsm.py index 1397384a97..c639e1eae6 100644 --- a/mmaction/models/backbones/resnet_tsm.py +++ b/mmaction/models/backbones/resnet_tsm.py @@ -165,6 +165,7 @@ def __init__(self, self.non_local = non_local self.non_local_stages = _ntuple(self.num_stages)(non_local) self.non_local_cfg = non_local_cfg + # TODO use convert key to load weights super().init_weights() self.init_structure() diff --git a/tests/models/recognizers/test_recognizer2d.py b/tests/models/recognizers/test_recognizer2d.py index a1c8ef4b1f..1acde7fc9c 100644 --- a/tests/models/recognizers/test_recognizer2d.py +++ b/tests/models/recognizers/test_recognizer2d.py @@ -104,11 +104,20 @@ def test_tsn(): def test_tsm(): register_all_modules() + config = get_recognizer_cfg( + 'tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py' # noqa: E501 + ) + config.model['backbone']['pretrained'] = None + + recognizer = MODELS.build(config.model) + recognizer.init_weights() + config = get_recognizer_cfg( 'tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py') config.model['backbone']['pretrained'] = None recognizer = MODELS.build(config.model) + recognizer.init_weights() input_shape = (1, 8, 3, 32, 32) demo_inputs = generate_recognizer_demo_inputs(input_shape) From e07c5e3c99a68c80cc807d83a5480f3694410fd0 Mon Sep 17 00:00:00 2001 From: LinXiaoZheng <90811472+Zheng-LinXiao@users.noreply.github.com> Date: Tue, 4 Apr 2023 14:49:50 +0800 Subject: [PATCH 22/36] [Improve] use mmengine to calculate FLOPs (#2300) --- tools/analysis_tools/get_flops.py | 49 ++++++++++--------------------- 1 file changed, 15 insertions(+), 34 deletions(-) diff --git a/tools/analysis_tools/get_flops.py b/tools/analysis_tools/get_flops.py index b89f5db5ad..fbec21887f 100644 --- a/tools/analysis_tools/get_flops.py +++ b/tools/analysis_tools/get_flops.py @@ -1,21 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import argparse -import torch - -try: - from fvcore.nn import (ActivationCountAnalysis, FlopCountAnalysis, - flop_count_str, flop_count_table, parameter_count) -except ImportError: - print('You may need to install fvcore for flops computation, ' - 'and you can use `pip install -r requirements/optional.txt` ' - 'to set up the environment') -from fvcore.nn.print_model_statistics import _format_size from mmengine import Config from mmengine.registry import init_default_scope from mmaction.registry import MODELS +try: + from mmengine.analysis import get_model_complexity_info +except ImportError: + raise ImportError('Please upgrade mmcv to >0.6.2') + def parse_args(): parser = argparse.ArgumentParser(description='Get model flops and params') @@ -39,17 +34,17 @@ def main(): elif len(args.shape) == 2: input_shape = (1, 3) + tuple(args.shape) elif len(args.shape) == 4: - # n, c, h, w = args.shape + # n, c, h, w = args.shape for 2D recognizer input_shape = tuple(args.shape) elif len(args.shape) == 5: - # n, c, t, h, w = args.shape + # n, c, t, h, w = args.shape for 3D recognizer or + # n, m, t, v, c = args.shape for GCN-based recognizer input_shape = tuple(args.shape) else: raise ValueError('invalid input shape') cfg = Config.fromfile(args.config) init_default_scope(cfg.get('default_scope', 'mmaction')) - model = MODELS.build(cfg.model) model.eval() @@ -60,28 +55,14 @@ def main(): 'FLOPs counter is currently not currently supported with {}'. format(model.__class__.__name__)) - inputs = (torch.randn((1, *input_shape)), ) - flops_ = FlopCountAnalysis(model, inputs) - activations_ = ActivationCountAnalysis(model, inputs) - - flops = _format_size(flops_.total()) - activations = _format_size(activations_.total()) - params = _format_size(parameter_count(model)['']) - - flop_table = flop_count_table( - flops=flops_, - activations=activations_, - show_param_shapes=True, - ) - flop_str = flop_count_str(flops=flops_, activations=activations_) - - print('\n' + flop_str) - print('\n' + flop_table) - + analysis_results = get_model_complexity_info(model, input_shape) + flops = analysis_results['flops_str'] + params = analysis_results['params_str'] + table = analysis_results['out_table'] + print(table) split_line = '=' * 30 - print(f'{split_line}\nInput shape: {input_shape}\n' - f'Flops: {flops}\nParams: {params}\n' - f'Activation: {activations}\n{split_line}') + print(f'\n{split_line}\nInput shape: {input_shape}\n' + f'Flops: {flops}\nParams: {params}\n{split_line}') print('!!!Please be cautious if you use the results in papers. 
' 'You may need to check if all ops are supported and verify that the ' 'flops computation is correct.') From b046879db24b7091cd95440d26d02ffac867f0d4 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Tue, 4 Apr 2023 15:17:07 +0800 Subject: [PATCH 23/36] [Fix] update aciton docker image to ubuntu-22.04 (#2334) --- .github/workflows/merge_stage_test.yml | 30 ++++++++++---------- .github/workflows/pr_stage_test.yml | 39 ++++++++++++++------------ 2 files changed, 36 insertions(+), 33 deletions(-) diff --git a/.github/workflows/merge_stage_test.yml b/.github/workflows/merge_stage_test.yml index 8c9862d049..0d1daed059 100644 --- a/.github/workflows/merge_stage_test.yml +++ b/.github/workflows/merge_stage_test.yml @@ -18,7 +18,7 @@ concurrency: jobs: build_cpu_py: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 strategy: matrix: python-version: [3.8, 3.9] @@ -27,9 +27,9 @@ jobs: - torch: 1.8.1 torchvision: 0.9.1 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Upgrade pip @@ -50,7 +50,7 @@ jobs: - name: Install unittest dependencies run: pip install -r requirements.txt - name: Install PyTorch - run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html - name: Install MMEngine run: pip install git+https://github.com/open-mmlab/mmengine.git@main - name: Install MMCV @@ -75,7 +75,7 @@ jobs: coverage report -m build_cpu_pt: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 strategy: matrix: python-version: [3.7] @@ -96,9 +96,9 @@ jobs: - torch: 1.12.1 torchvision: 0.13.1 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Upgrade pip @@ -118,7 +118,7 @@ jobs: - name: Install TurboJpeg lib run: sudo apt-get install -y libturbojpeg - name: Install PyTorch - run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html - name: Install MMEngine run: pip install git+https://github.com/open-mmlab/mmengine.git@main - name: Install MMCV @@ -153,7 +153,7 @@ jobs: fail_ci_if_error: false build_cu102: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 container: image: pytorch/pytorch:1.8.1-cuda10.2-cudnn7-devel strategy: @@ -163,9 +163,9 @@ jobs: - torch: 1.8.1 cuda: 10.2 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Upgrade pip @@ -201,16 +201,16 @@ jobs: TORCH_CUDA_ARCH_LIST=7.0 pip install -e . 
build_windows: - runs-on: ${{ matrix.os }} + runs-on: windows-2022 strategy: matrix: os: [windows-2022] python: [3.7] platform: [cpu, cu111] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python }} - name: Upgrade pip @@ -220,7 +220,7 @@ jobs: - name: Install lmdb run: pip install lmdb - name: Install PyTorch - run: pip install torch==1.8.1+${{matrix.platform}} torchvision==0.9.1+${{matrix.platform}} -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html + run: pip install torch==${{matrix.torch}}+${{matrix.platform}} torchvision==${{matrix.torchvision}}+${{matrix.platform}} -f https://download.pytorch.org/whl/${{matrix.platform}}/torch_stable.html - name: Install mmaction dependencies run: | pip install git+https://github.com/open-mmlab/mmengine.git@main diff --git a/.github/workflows/pr_stage_test.yml b/.github/workflows/pr_stage_test.yml index 74c1145b5c..a0eb9d5d00 100644 --- a/.github/workflows/pr_stage_test.yml +++ b/.github/workflows/pr_stage_test.yml @@ -16,7 +16,7 @@ concurrency: jobs: build_cpu: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 strategy: matrix: python-version: [3.7] @@ -24,9 +24,9 @@ jobs: - torch: 1.8.1 torchvision: 0.9.1 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Upgrade pip @@ -40,7 +40,7 @@ jobs: - name: Install TurboJpeg lib run: sudo apt-get install -y libturbojpeg - name: Install PyTorch - run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html - name: Install MMEngine run: pip install git+https://github.com/open-mmlab/mmengine.git@main - name: Install MMCV @@ -77,9 +77,11 @@ jobs: fail_ci_if_error: false build_cu102: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 container: image: pytorch/pytorch:1.8.1-cuda10.2-cudnn7-devel + env: + MKL_THREADING_LAYER: GNU strategy: matrix: python-version: [3.7] @@ -87,9 +89,9 @@ jobs: - torch: 1.8.1 cuda: 10.2 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Upgrade pip @@ -105,9 +107,9 @@ jobs: run: | apt-get update && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libturbojpeg libsndfile1 libsm6 libxrender-dev libxext6 - name: Install librosa and soundfile - run: python -m pip install librosa soundfile + run: pip install librosa soundfile - name: Install lmdb - run: python -m pip install lmdb + run: pip install lmdb - name: Install mmaction dependencies run: | pip install git+https://github.com/open-mmlab/mmengine.git@main @@ -117,12 +119,11 @@ jobs: pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x pip install -r requirements.txt - name: Install PytorchVideo - run: python -m pip install pytorchvideo + run: pip install pytorchvideo if: ${{matrix.cuda == '10.2'}} - name: Build and install run: | - python setup.py check -m -s - TORCH_CUDA_ARCH_LIST=7.0 pip install -e . + pip install -e . 
-v - name: Run unittests and generate coverage report run: | coverage run --branch --source mmaction -m pytest tests/ -k 'not timm' @@ -130,16 +131,18 @@ jobs: coverage report -m build_windows: - runs-on: ${{ matrix.os }} + runs-on: windows-2022 strategy: matrix: os: [windows-2022] - python: [3.7] + python-version: [3.7] + torch: [1.8.1] + torchvision: [0.9.1] platform: [cpu, cu111] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python }} - name: Upgrade pip @@ -151,7 +154,7 @@ jobs: - name: Install lmdb run: pip install lmdb - name: Install PyTorch - run: pip install torch==1.8.1+${{matrix.platform}} torchvision==0.9.1+${{matrix.platform}} -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html + run: pip install torch==${{matrix.torch}}+${{matrix.platform}} torchvision==${{matrix.torchvision}}+${{matrix.platform}} -f https://download.pytorch.org/whl/${{matrix.platform}}/torch_stable.html - name: Install timm run: python -m pip install timm - name: Install mmaction dependencies @@ -166,7 +169,7 @@ jobs: run: python -m pip install pytorchvideo - name: Build and install run: | - pip install -e . + pip install -e . -v - name: Run unittests and generate coverage report run: | pytest tests/ From 97f0e637b8c0e3be00db1d7b3be241d087b6f511 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Tue, 4 Apr 2023 16:36:39 +0800 Subject: [PATCH 24/36] [Fix] fix merge stage test (#2336) --- .github/workflows/merge_stage_test.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/merge_stage_test.yml b/.github/workflows/merge_stage_test.yml index 0d1daed059..cf1f2ed10c 100644 --- a/.github/workflows/merge_stage_test.yml +++ b/.github/workflows/merge_stage_test.yml @@ -36,7 +36,6 @@ jobs: run: pip install pip --upgrade - name: Install Libraries run: | - sudo add-apt-repository ppa:savoury1/ffmpeg4 sudo apt-get update sudo apt-get upgrade sudo apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev libturbojpeg pkg-config @@ -65,7 +64,7 @@ jobs: run: pip install pytorchvideo if: ${{matrix.torchvision == '0.10.0'}} - name: Install timm - run: python -m pip install timm + run: pip install timm - name: Build and install run: rm -rf .eggs && pip install -e . 
- name: Run unittests and generate coverage report @@ -110,10 +109,10 @@ jobs: - name: Install lmdb run: pip install lmdb - name: Install timm - run: python -m pip install timm==0.6.7 + run: pip install timm==0.6.7 if: ${{matrix.torch == '1.6.0'}} - name: Install timm - run: python -m pip install timm + run: pip install timm if: ${{matrix.torch != '1.6.0'}} - name: Install TurboJpeg lib run: sudo apt-get install -y libturbojpeg @@ -181,9 +180,9 @@ jobs: run: | apt-get update && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libturbojpeg libsndfile1 libsm6 libxrender-dev libxext6 - name: Install librosa and soundfile - run: python -m pip install librosa soundfile + run: pip install librosa soundfile - name: Install lmdb - run: python -m pip install lmdb + run: pip install lmdb - name: Install mmaction dependencies run: | pip install git+https://github.com/open-mmlab/mmengine.git@main @@ -193,12 +192,11 @@ jobs: pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x pip install -r requirements.txt - name: Install PytorchVideo - run: python -m pip install pytorchvideo + run: pip install pytorchvideo if: ${{matrix.cuda == '10.2'}} - name: Build and install run: | - python setup.py check -m -s - TORCH_CUDA_ARCH_LIST=7.0 pip install -e . + pip install -e . build_windows: runs-on: windows-2022 @@ -207,6 +205,8 @@ jobs: os: [windows-2022] python: [3.7] platform: [cpu, cu111] + torch: [1.8.1] + torchvision: [0.9.1] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python }} From 9c37c22361773ff90ffc8f4a6486cfb4526fd2e6 Mon Sep 17 00:00:00 2001 From: wxDai Date: Thu, 6 Apr 2023 11:23:45 +0800 Subject: [PATCH 25/36] [Docs] Add 20 Minutes Guide (#2325) --- docs/en/guide_to_framework.md | 760 ++++++++++++++++++++++++++++++++++ docs/en/index.rst | 1 + 2 files changed, 761 insertions(+) create mode 100644 docs/en/guide_to_framework.md diff --git a/docs/en/guide_to_framework.md b/docs/en/guide_to_framework.md new file mode 100644 index 0000000000..68f8bdfd41 --- /dev/null +++ b/docs/en/guide_to_framework.md @@ -0,0 +1,760 @@ +# A 20-Minute Guide to MMAction2 FrameWork + +In this tutorial, we will demonstrate the overall architecture of our `MMACTION2 1.0` through a step-by-step example of video action recognition. + +The structure of this tutorial is as follows: + +- [A 20-Minute Guide to MMAction2 FrameWork](#a-20-minute-guide-to-mmaction2-framework) + - [Step0: Prepare Data](#step0-prepare-data) + - [Step1: Build a Pipeline](#step1-build-a-pipeline) + - [Step2: Build a Dataset and DataLoader](#step2-build-a-dataset-and-dataloader) + - [Step3: Build a Recognizer](#step3-build-a-recognizer) + - [Step4: Build a Evaluation Metric](#step4-build-a-evaluation-metric) + - [Step5: Train and Test with Native PyTorch](#step5-train-and-test-with-native-pytorch) + - [Step6: Train and Test with MMEngine (Recommended)](#step6-train-and-test-with-mmengine-recommended) + +First, we need to initialize the `scope` for registry, to ensure that each module is registered under the scope of `mmaction`. For more detailed information about registry, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/registry.html). 
+ +```python +from mmaction.utils import register_all_modules + +register_all_modules(init_default_scope=True) +``` + +## Step0: Prepare Data + +Please download our self-made [kinetics400_tiny](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) dataset and extract it to the `$MMACTION2/data` directory. +The directory structure after extraction should be as follows: + +``` +mmaction2 +├── data +│ ├── kinetics400_tiny +│ │ ├── kinetics_tiny_train_video.txt +│ │ ├── kinetics_tiny_val_video.txt +│ │ ├── train +│ │ │ ├── 27_CSXByd3s.mp4 +│ │ │ ├── 34XczvTaRiI.mp4 +│ │ │ ├── A-wiliK50Zw.mp4 +│ │ │ ├── ... +│ │ └── val +│ │ ├── 0pVGiAU6XEA.mp4 +│ │ ├── AQrbRSnRt8M.mp4 +│ │ ├── ... +``` + +Here are some examples from the annotation file `kinetics_tiny_train_video.txt`: + +``` +D32_1gwq35E.mp4 0 +iRuyZSKhHRg.mp4 1 +oXy-e_P_cAI.mp4 0 +34XczvTaRiI.mp4 1 +h2YqqUhnR34.mp4 0 +``` + +Each line in the file represents the annotation of a video, where the first item denotes the video filename (e.g., `D32_1gwq35E.mp4`), and the second item represents the corresponding label (e.g., label `0` for `D32_1gwq35E.mp4`). In this dataset, there are only `two` categories. + +## Step1: Build a Pipeline + +In order to `decode`, `sample`, `resize`, `crop`, `format`, and `pack` the input video and corresponding annotation, we need to design a pipeline to handle these processes. Specifically, we design seven `Transform` classes to build this video processing pipeline. Note that all `Transform` classes in OpenMMLab must inherit from the `BaseTransform` class in `mmcv`, implement the abstract method `transform`, and be registered to the `TRANSFORMS` registry. For more detailed information about data transform, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/data_transform.html). 
+ +```python +import mmcv +import decord +import numpy as np +from mmcv.transforms import TRANSFORMS, BaseTransform, to_tensor +from mmaction.structures import ActionDataSample + + +@TRANSFORMS.register_module() +class VideoInit(BaseTransform): + def transform(self, results): + container = decord.VideoReader(results['filename']) + results['total_frames'] = len(container) + results['video_reader'] = container + return results + + +@TRANSFORMS.register_module() +class VideoSample(BaseTransform): + def __init__(self, clip_len, num_clips, test_mode=False): + self.clip_len = clip_len + self.num_clips = num_clips + self.test_mode = test_mode + + def transform(self, results): + total_frames = results['total_frames'] + interval = total_frames // self.clip_len + + if self.test_mode: + # Make the sampling during testing deterministic + np.random.seed(42) + + inds_of_all_clips = [] + for i in range(self.num_clips): + bids = np.arange(self.clip_len) * interval + offset = np.random.randint(interval, size=bids.shape) + inds = bids + offset + inds_of_all_clips.append(inds) + + results['frame_inds'] = np.concatenate(inds_of_all_clips) + results['clip_len'] = self.clip_len + results['num_clips'] = self.num_clips + return results + + +@TRANSFORMS.register_module() +class VideoDecode(BaseTransform): + def transform(self, results): + frame_inds = results['frame_inds'] + container = results['video_reader'] + + imgs = container.get_batch(frame_inds).asnumpy() + imgs = list(imgs) + + results['video_reader'] = None + del container + + results['imgs'] = imgs + results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class VideoResize(BaseTransform): + def __init__(self, r_size): + self.r_size = (np.inf, r_size) + + def transform(self, results): + img_h, img_w = results['img_shape'] + new_w, new_h = mmcv.rescale_size((img_w, img_h), self.r_size) + + imgs = [mmcv.imresize(img, (new_w, new_h)) + for img in results['imgs']] + results['imgs'] = imgs + results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class VideoCrop(BaseTransform): + def __init__(self, c_size): + self.c_size = c_size + + def transform(self, results): + img_h, img_w = results['img_shape'] + center_x, center_y = img_w // 2, img_h // 2 + x1, x2 = center_x - self.c_size // 2, center_x + self.c_size // 2 + y1, y2 = center_y - self.c_size // 2, center_y + self.c_size // 2 + imgs = [img[y1:y2, x1:x2] for img in results['imgs']] + results['imgs'] = imgs + results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class VideoFormat(BaseTransform): + def transform(self, results): + num_clips = results['num_clips'] + clip_len = results['clip_len'] + imgs = results['imgs'] + + # [num_clips*clip_len, H, W, C] + imgs = np.array(imgs) + # [num_clips, clip_len, H, W, C] + imgs = imgs.reshape((num_clips, clip_len) + imgs.shape[1:]) + # [num_clips, C, clip_len, H, W] + imgs = imgs.transpose(0, 4, 1, 2, 3) + + results['imgs'] = imgs + return results + + +@TRANSFORMS.register_module() +class VideoPack(BaseTransform): + def __init__(self, meta_keys=('img_shape', 'num_clips', 'clip_len')): + self.meta_keys = meta_keys + + def transform(self, results): + packed_results = dict() + inputs = to_tensor(results['imgs']) + data_sample = ActionDataSample().set_gt_labels(results['label']) + metainfo = {k: results[k] for k in self.meta_keys if k in results} + data_sample.set_metainfo(metainfo) + packed_results['inputs'] = inputs + packed_results['data_samples'] = 
data_sample + return packed_results +``` + +Below, we provide a code snippet (using `D32_1gwq35E.mp4 0` from the annotation file) to demonstrate how to use the pipeline. + +```python +import os.path as osp +from mmengine.dataset import Compose + +pipeline_cfg = [ + dict(type='VideoInit'), + dict(type='VideoSample', clip_len=16, num_clips=1, test_mode=False), + dict(type='VideoDecode'), + dict(type='VideoResize', r_size=256), + dict(type='VideoCrop', c_size=224), + dict(type='VideoFormat'), + dict(type='VideoPack') +] + +pipeline = Compose(pipeline_cfg) +data_prefix = 'data/kinetics400_tiny/train' +results = dict(filename=osp.join(data_prefix, 'D32_1gwq35E.mp4'), label=0) +packed_results = pipeline(results) + +inputs = packed_results['inputs'] +data_sample = packed_results['data_samples'] + +print('shape of the inputs: ', inputs.shape) + +# Get metainfo of the inputs +print('image_shape: ', data_sample.img_shape) +print('num_clips: ', data_sample.num_clips) +print('clip_len: ', data_sample.clip_len) + +# Get label of the inputs +print('label: ', data_sample.gt_labels.item) +``` + +``` +shape of the inputs: torch.Size([1, 3, 16, 224, 224]) +image_shape: (224, 224) +num_clips: 1 +clip_len: 16 +label: tensor([0]) +``` + +## Step2: Build a Dataset and DataLoader + +All `Dataset` classes in OpenMMLab must inherit from the `BaseDataset` class in `mmengine`. We can customize annotation loading process by overriding the `load_data_list` method. Additionally, we can add more information to the `results` dict that is passed as input to the `pipeline` by overriding the `get_data_info` method. For more detailed information about `BaseDataset` class, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html). + +```python +import os.path as osp +from mmengine.fileio import list_from_file +from mmengine.dataset import BaseDataset +from mmaction.registry import DATASETS + + +@DATASETS.register_module() +class DatasetZelda(BaseDataset): + def __init__(self, ann_file, pipeline, data_root, data_prefix=dict(video=''), + test_mode=False, modality='RGB', **kwargs): + self.modality = modality + super(DatasetZelda, self).__init__(ann_file=ann_file, pipeline=pipeline, data_root=data_root, + data_prefix=data_prefix, test_mode=test_mode, + **kwargs) + + def load_data_list(self): + data_list = [] + fin = list_from_file(self.ann_file) + for line in fin: + line_split = line.strip().split() + filename, label = line_split + label = int(label) + filename = osp.join(self.data_prefix['video'], filename) + data_list.append(dict(filename=filename, label=label)) + return data_list + + def get_data_info(self, idx: int) -> dict: + data_info = super().get_data_info(idx) + data_info['modality'] = self.modality + return data_info +``` + +Next, we will demonstrate how to use dataset and dataloader to index data. We will use the `Runner.build_dataloader` method to construct the dataloader. For more detailed information about dataloader, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/dataset.html#details-on-dataloader). 
+ +```python +from mmaction.registry import DATASETS + +train_pipeline_cfg = [ + dict(type='VideoInit'), + dict(type='VideoSample', clip_len=16, num_clips=1, test_mode=False), + dict(type='VideoDecode'), + dict(type='VideoResize', r_size=256), + dict(type='VideoCrop', c_size=224), + dict(type='VideoFormat'), + dict(type='VideoPack') +] + +val_pipeline_cfg = [ + dict(type='VideoInit'), + dict(type='VideoSample', clip_len=16, num_clips=5, test_mode=True), + dict(type='VideoDecode'), + dict(type='VideoResize', r_size=256), + dict(type='VideoCrop', c_size=224), + dict(type='VideoFormat'), + dict(type='VideoPack') +] + +train_dataset_cfg = dict( + type='DatasetZelda', + ann_file='kinetics_tiny_train_video.txt', + pipeline=train_pipeline_cfg, + data_root='data/kinetics400_tiny/', + data_prefix=dict(video='train')) + +val_dataset_cfg = dict( + type='DatasetZelda', + ann_file='kinetics_tiny_val_video.txt', + pipeline=val_pipeline_cfg, + data_root='data/kinetics400_tiny/', + data_prefix=dict(video='val')) + +train_dataset = DATASETS.build(train_dataset_cfg) + +packed_results = train_dataset[0] + +inputs = packed_results['inputs'] +data_sample = packed_results['data_samples'] + +print('shape of the inputs: ', inputs.shape) + +# Get metainfo of the inputs +print('image_shape: ', data_sample.img_shape) +print('num_clips: ', data_sample.num_clips) +print('clip_len: ', data_sample.clip_len) + +# Get label of the inputs +print('label: ', data_sample.gt_labels.item) + +from mmengine.runner import Runner + +BATCH_SIZE = 2 + +train_dataloader_cfg = dict( + batch_size=BATCH_SIZE, + num_workers=0, + persistent_workers=False, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=train_dataset_cfg) + +val_dataloader_cfg = dict( + batch_size=BATCH_SIZE, + num_workers=0, + persistent_workers=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=val_dataset_cfg) + +train_data_loader = Runner.build_dataloader(dataloader=train_dataloader_cfg) +val_data_loader = Runner.build_dataloader(dataloader=val_dataloader_cfg) + +batched_packed_results = next(iter(train_data_loader)) + +batched_inputs = batched_packed_results['inputs'] +batched_data_sample = batched_packed_results['data_samples'] + +assert len(batched_inputs) == BATCH_SIZE +assert len(batched_data_sample) == BATCH_SIZE +``` + +The terminal output should be the same as the one shown in the [Step1: Build a Pipeline](#step1-build-a-pipeline). + +## Step3: Build a Recognizer + +Next, we will construct the `recognizer`, which mainly consists of three parts: `data preprocessor` for batching and normalizing the data, `backbone` for feature extraction, and `cls_head` for classification. 
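+
+Before looking at each part, here is a small self-contained sketch (using a random tensor rather than real data) showing how the batched `[batch_size, num_clips, C, T, H, W]` tensor produced by the data preprocessor is flattened before entering the backbone. The recognizer implemented below performs exactly this reshape in its `extract_feat` method:
+
+```python
+import torch
+
+# Dummy stand-in for the preprocessed inputs:
+# [batch_size, num_clips, C, T, H, W]
+dummy_inputs = torch.randn(2, 1, 3, 16, 224, 224)
+
+# Merge the batch and clip dimensions so that every clip is processed by the
+# 3D backbone as an independent sample.
+flattened = dummy_inputs.view((-1, ) + dummy_inputs.shape[2:])
+print(flattened.shape)  # torch.Size([2, 3, 16, 224, 224])
+```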
+ +The implementation of `data_preprocessor` is as follows: + +```python +import torch +from mmengine.model import BaseDataPreprocessor, stack_batch +from mmaction.registry import MODELS + + +@MODELS.register_module() +class DataPreprocessorZelda(BaseDataPreprocessor): + def __init__(self, mean, std): + super().__init__() + + self.register_buffer( + 'mean', + torch.tensor(mean, dtype=torch.float32).view(-1, 1, 1, 1), + False) + self.register_buffer( + 'std', + torch.tensor(std, dtype=torch.float32).view(-1, 1, 1, 1), + False) + + def forward(self, data, training=False): + data = self.cast_data(data) + inputs = data['inputs'] + batch_inputs = stack_batch(inputs) # Batching + batch_inputs = (batch_inputs - self.mean) / self.std # Normalization + data['inputs'] = batch_inputs + return data +``` + +Here is the usage of data_preprocessor: feed the `batched_packed_results` obtained from the [Step2: Build a Dataset and DataLoader](#step2-build-a-dataset-and-dataloader) into the `data_preprocessor` for batching and normalization. + +```python +from mmaction.registry import MODELS + +data_preprocessor_cfg = dict( + type='DataPreprocessorZelda', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375]) + +data_preprocessor = MODELS.build(data_preprocessor_cfg) + +preprocessed_inputs = data_preprocessor(batched_packed_results) +print(preprocessed_inputs['inputs'].shape) +``` + +``` +torch.Size([2, 1, 3, 16, 224, 224]) +``` + +The implementations of `backbone`, `cls_head` and `recognizer` are as follows: + +```python +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModel, BaseModule, Sequential +from mmengine.structures import LabelData +from mmaction.registry import MODELS + + +@MODELS.register_module() +class BackBoneZelda(BaseModule): + def __init__(self, init_cfg=None): + if init_cfg is None: + init_cfg = [dict(type='Kaiming', layer='Conv3d', mode='fan_out', nonlinearity="relu"), + dict(type='Constant', layer='BatchNorm3d', val=1, bias=0)] + + super(BackBoneZelda, self).__init__(init_cfg=init_cfg) + + self.conv1 = Sequential(nn.Conv3d(3, 64, kernel_size=(3, 7, 7), + stride=(1, 2, 2), padding=(1, 3, 3)), + nn.BatchNorm3d(64), nn.ReLU()) + self.maxpool = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), + padding=(0, 1, 1)) + + self.conv = Sequential(nn.Conv3d(64, 128, kernel_size=3, stride=2, padding=1), + nn.BatchNorm3d(128), nn.ReLU()) + + def forward(self, imgs): + # imgs: [batch_size*num_views, 3, T, H, W] + # features: [batch_size*num_views, 128, T/2, H//8, W//8] + features = self.conv(self.maxpool(self.conv1(imgs))) + return features + + +@MODELS.register_module() +class ClsHeadZelda(BaseModule): + def __init__(self, num_classes, in_channels, dropout=0.5, average_clips='prob', init_cfg=None): + if init_cfg is None: + init_cfg = dict(type='Normal', layer='Linear', std=0.01) + + super(ClsHeadZelda, self).__init__(init_cfg=init_cfg) + + self.num_classes = num_classes + self.in_channels = in_channels + self.average_clips = average_clips + + if dropout != 0: + self.dropout = nn.Dropout(dropout) + else: + self.dropout = None + + self.fc = nn.Linear(self.in_channels, self.num_classes) + self.pool = nn.AdaptiveAvgPool3d(1) + self.loss_fn = nn.CrossEntropyLoss() + + def forward(self, x): + N, C, T, H, W = x.shape + x = self.pool(x) + x = x.view(N, C) + assert x.shape[1] == self.in_channels + + if self.dropout is not None: + x = self.dropout(x) + + cls_scores = self.fc(x) + return cls_scores + + def loss(self, feats, data_samples): + 
cls_scores = self(feats) + labels = torch.stack([x.gt_labels.item for x in data_samples]) + labels = labels.squeeze() + + if labels.shape == torch.Size([]): + labels = labels.unsqueeze(0) + + loss_cls = self.loss_fn(cls_scores, labels) + return dict(loss_cls=loss_cls) + + def predict(self, feats, data_samples): + cls_scores = self(feats) + num_views = cls_scores.shape[0] // len(data_samples) + # assert num_views == data_samples[0].num_clips + cls_scores = self.average_clip(cls_scores, num_views) + + for ds, sc in zip(data_samples, cls_scores): + pred = LabelData(item=sc) + ds.pred_scores = pred + return data_samples + + def average_clip(self, cls_scores, num_views): + if self.average_clips not in ['score', 'prob', None]: + raise ValueError(f'{self.average_clips} is not supported. ' + f'Currently supported ones are ' + f'["score", "prob", None]') + + total_views = cls_scores.shape[0] + cls_scores = cls_scores.view(total_views // num_views, num_views, -1) + + if self.average_clips is None: + return cls_scores + elif self.average_clips == 'prob': + cls_scores = F.softmax(cls_scores, dim=2).mean(dim=1) + elif self.average_clips == 'score': + cls_scores = cls_scores.mean(dim=1) + + return cls_scores + + +@MODELS.register_module() +class RecognizerZelda(BaseModel): + def __init__(self, backbone, cls_head, data_preprocessor): + super().__init__(data_preprocessor=data_preprocessor) + + self.backbone = MODELS.build(backbone) + self.cls_head = MODELS.build(cls_head) + + def extract_feat(self, inputs): + inputs = inputs.view((-1, ) + inputs.shape[2:]) + return self.backbone(inputs) + + def loss(self, inputs, data_samples): + feats = self.extract_feat(inputs) + loss = self.cls_head.loss(feats, data_samples) + return loss + + def predict(self, inputs, data_samples): + feats = self.extract_feat(inputs) + predictions = self.cls_head.predict(feats, data_samples) + return predictions + + def forward(self, inputs, data_samples=None, mode='tensor'): + if mode == 'tensor': + return self.extract_feat(inputs) + elif mode == 'loss': + return self.loss(inputs, data_samples) + elif mode == 'predict': + return self.predict(inputs, data_samples) + else: + raise RuntimeError(f'Invalid mode: {mode}') +``` + +The `init_cfg` is used for model weight initialization. For more information on model weight initialization, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/initialize.html). 
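+
+As a minimal illustration (the checkpoint path below is only a placeholder for this sketch, not a file shipped with this guide), `init_cfg` can also point to existing weights through the `Pretrained` initializer instead of sampling them from a random distribution:
+
+```python
+# Hypothetical sketch: initialize BackBoneZelda from a saved checkpoint
+# rather than with the Kaiming/Constant initializers defined above.
+backbone_cfg = dict(
+    type='BackBoneZelda',
+    init_cfg=dict(type='Pretrained', checkpoint='path/to/your_checkpoint.pth'))
+```
+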
The usage of the above modules is as follows: + +```python +import torch +import copy +from mmaction.registry import MODELS + +model_cfg = dict( + type='RecognizerZelda', + backbone=dict(type='BackBoneZelda'), + cls_head=dict( + type='ClsHeadZelda', + num_classes=2, + in_channels=128, + average_clips='prob'), + data_preprocessor = dict( + type='DataPreprocessorZelda', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375])) + +model = MODELS.build(model_cfg) + +# Train +model.train() +model.init_weights() +data_batch_train = copy.deepcopy(batched_packed_results) +data = model.data_preprocessor(data_batch_train, training=True) +loss = model(**data, mode='loss') +print('loss dict: ', loss) + +# Test +with torch.no_grad(): + model.eval() + data_batch_test = copy.deepcopy(batched_packed_results) + data = model.data_preprocessor(data_batch_test, training=False) + predictions = model(**data, mode='predict') +print('Label of Sample[0]', predictions[0].gt_labels.item) +print('Scores of Sample[0]', predictions[0].pred_scores.item) +``` + +```shell +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.0.weight - torch.Size([64, 3, 3, 7, 7]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.0.bias - torch.Size([64]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.1.weight - torch.Size([64]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.1.bias - torch.Size([64]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.0.weight - torch.Size([128, 64, 3, 3, 3]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.0.bias - torch.Size([128]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.1.weight - torch.Size([128]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.1.bias - torch.Size([128]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +cls_head.fc.weight - torch.Size([2, 128]): +NormalInit: mean=0, std=0.01, bias=0 + +04/03 23:28:01 - mmengine - INFO - +cls_head.fc.bias - torch.Size([2]): +NormalInit: mean=0, std=0.01, bias=0 + +loss dict: {'loss_cls': tensor(0.6853, grad_fn=)} +Label of Sample[0] tensor([0]) +Scores of Sample[0] tensor([0.5240, 0.4760]) +``` + +## Step4: Build a Evaluation Metric + +Note that all `Metric` classes in `OpenMMLab` must inherit from the `BaseMetric` class in `mmengine` and implement the abstract methods, `process` and `compute_metrics`. For more information on evaluation, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/evaluation.html). 
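+
+The metric below delegates the actual computation to `top_k_accuracy` from `mmaction.evaluation`. As a quick standalone illustration with made-up scores (two samples, two classes, following the same input format the metric uses):
+
+```python
+import numpy as np
+
+from mmaction.evaluation import top_k_accuracy
+
+# Each element of `scores` holds the per-class prediction scores of one
+# sample; `labels` holds the corresponding ground-truth class indices.
+scores = [np.array([0.7, 0.3]), np.array([0.2, 0.8])]
+labels = [0, 1]
+print(top_k_accuracy(scores, labels, (1, )))  # [1.0]
+```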
+ +```python +import copy +from collections import OrderedDict +from mmengine.evaluator import BaseMetric +from mmaction.evaluation import top_k_accuracy +from mmaction.registry import METRICS + + +@METRICS.register_module() +class AccuracyMetric(BaseMetric): + def __init__(self, topk=(1, 5), collect_device='cpu', prefix='acc'): + super().__init__(collect_device=collect_device, prefix=prefix) + self.topk = topk + + def process(self, data_batch, data_samples): + data_samples = copy.deepcopy(data_samples) + for data_sample in data_samples: + result = dict() + scores = data_sample['pred_scores']['item'].cpu().numpy() + label = data_sample['gt_labels']['item'].item() + result['scores'] = scores + result['label'] = label + self.results.append(result) + + def compute_metrics(self, results: list) -> dict: + eval_results = OrderedDict() + labels = [res['label'] for res in results] + scores = [res['scores'] for res in results] + topk_acc = top_k_accuracy(scores, labels, self.topk) + for k, acc in zip(self.topk, topk_acc): + eval_results[f'topk{k}'] = acc + return eval_results +``` + +```python +from mmaction.registry import METRICS + +metric_cfg = dict(type='AccuracyMetric', topk=(1, 5)) + +metric = METRICS.build(metric_cfg) + +data_samples = [d.to_dict() for d in predictions] + +metric.process(batched_packed_results, data_samples) +acc = metric.compute_metrics(metric.results) +print(acc) +``` + +```shell +OrderedDict([('topk1', 0.5), ('topk5', 1.0)]) +``` + +## Step5: Train and Test with Native PyTorch + +```python +import torch.optim as optim +from mmengine import track_iter_progress + + +device = 'cuda' # or 'cpu' +max_epochs = 10 + +optimizer = optim.Adam(model.parameters(), lr=0.01) + +for epoch in range(max_epochs): + model.train() + losses = [] + for data_batch in track_iter_progress(train_data_loader): + data = model.data_preprocessor(data_batch, training=True) + loss_dict = model(**data, mode='loss') + loss = loss_dict['loss_cls'] + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + losses.append(loss.item()) + + print(f'Epoch[{epoch}]: loss ', sum(losses) / len(train_data_loader)) + + with torch.no_grad(): + model.eval() + for data_batch in track_iter_progress(val_data_loader): + data = model.data_preprocessor(data_batch, training=False) + predictions = model(**data, mode='predict') + data_samples = [d.to_dict() for d in predictions] + metric.process(data_batch, data_samples) + + acc = metric.acc = metric.compute_metrics(metric.results) + for name, topk in acc.items(): + print(f'{name}: ', topk) +``` + +## Step6: Train and Test with MMEngine (Recommended) + +For more details on training and testing, you can refer to [MMAction2 Tutorial](https://mmaction2.readthedocs.io/en/1.x/user_guides/4_train_test.html). For more information on `Runner`, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html). 
+ +```python +from mmengine.runner import Runner + +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=1) +val_cfg = dict(type='ValLoop') + +optim_wrapper = dict(optimizer=dict(type='Adam', lr=0.01)) + +runner = Runner(model=model_cfg, work_dir='./work_dirs/guide', + train_dataloader=train_dataloader_cfg, + train_cfg=train_cfg, + val_dataloader=val_dataloader_cfg, + val_cfg=val_cfg, + optim_wrapper=optim_wrapper, + val_evaluator=[metric_cfg], + default_scope='mmaction') +runner.train() +``` diff --git a/docs/en/index.rst b/docs/en/index.rst index 59e3e49b53..392b64ef45 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -8,6 +8,7 @@ You can switch between Chinese and English documents in the lower-left corner of :caption: Get Started get_started.md + guide_to_framework.md .. toctree:: :maxdepth: 1 From 7754e85d95a1994eec44c1e9f3a0fb0291778e26 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 6 Apr 2023 11:36:41 +0800 Subject: [PATCH 26/36] [fix] fix channel order when show video (#2308) --- mmaction/registry.py | 5 +++++ mmaction/visualization/action_visualizer.py | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/mmaction/registry.py b/mmaction/registry.py index 28d237daa8..6d7d831db1 100644 --- a/mmaction/registry.py +++ b/mmaction/registry.py @@ -9,6 +9,7 @@ from mmengine.registry import DATA_SAMPLERS as MMENGINE_DATA_SAMPLERS from mmengine.registry import DATASETS as MMENGINE_DATASETS from mmengine.registry import EVALUATOR as MMENGINE_EVALUATOR +from mmengine.registry import FUNCTIONS as MMENGINE_FUNCTION from mmengine.registry import HOOKS as MMENGINE_HOOKS from mmengine.registry import INFERENCERS as MMENGINE_INFERENCERS from mmengine.registry import LOG_PROCESSORS as MMENGINE_LOG_PROCESSORS @@ -127,3 +128,7 @@ 'inferencer', parent=MMENGINE_INFERENCERS, locations=['mmaction.apis.inferencers']) + +# manage function +FUNCTION = Registry( + 'function', parent=MMENGINE_FUNCTION, locations=['mmaction.mmengine']) diff --git a/mmaction/visualization/action_visualizer.py b/mmaction/visualization/action_visualizer.py index 48c595fd5b..6fc5ae2123 100644 --- a/mmaction/visualization/action_visualizer.py +++ b/mmaction/visualization/action_visualizer.py @@ -268,7 +268,10 @@ def add_datasample(self, wait_time = frame_wait_time else: wait_time = wait_time - self.show(drawn_img, win_name=frame_name, wait_time=wait_time) + self.show( + drawn_img[:, :, ::-1], + win_name=frame_name, + wait_time=wait_time) resulted_video = np.array(resulted_video) if out_path is not None: From d8decfe78034174d78f5edf688ee0ac349abb3ee Mon Sep 17 00:00:00 2001 From: Kai Hu Date: Wed, 5 Apr 2023 23:37:26 -0400 Subject: [PATCH 27/36] [Refactor] speed up LFB training (#2294) --- configs/detection/lfb/README.md | 15 ++- configs/detection/lfb/metafile.yml | 2 +- .../lfb/slowonly-lfb-infer_r50_ava21-rgb.py | 114 ++++++++++++++++++ ...etrained-r50_8xb12-4x16x1-20e_ava21-rgb.py | 59 ++++++++- mmaction/models/roi_heads/shared_heads/lfb.py | 24 ++-- 5 files changed, 190 insertions(+), 24 deletions(-) create mode 100644 configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py diff --git a/configs/detection/lfb/README.md b/configs/detection/lfb/README.md index 1d33a7d7e9..dabb3a1b46 100644 --- a/configs/detection/lfb/README.md +++ b/configs/detection/lfb/README.md @@ -22,7 +22,7 @@ To understand the world, we humans constantly need to relate the present to the | frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | gpu_mem(M) 
| config | ckpt | log | | :---------------------: | :--------: | :--: | :----------------------------------: | :----------: | :---: | :--------: | :---------------------------------: | :-------------------------------: | :------------------------------: | -| 4x16x1 | raw | 8 | SlowOnly ResNet50 (with Nonlocal LFB) | Kinetics-400 | 24.05 | 8620 | [config](/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb_20220906-4c5b9f25.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.log) | +| 4x16x1 | raw | 8 | SlowOnly ResNet50 (with Nonlocal LFB) | Kinetics-400 | 24.11 | 8620 | [config](/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb_20220906-4c5b9f25.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.log) | | 4x16x1 | raw | 8 | SlowOnly ResNet50 (with Max LFB) | Kinetics-400 | 22.15 | 8425 | [config](/configs/detection/lfb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb_20220906-4963135b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.log) | Note: @@ -33,8 +33,7 @@ Note: 2. We use `slowonly_r50_4x16x1` instead of `I3D-R50-NL` in the original paper as the backbone of LFB, but we have achieved the similar improvement: (ours: 20.1 -> 24.05 vs. author: 22.1 -> 25.8). 3. Because the long-term features are randomly sampled in testing, the test accuracy may have some differences. 4. Before train or test lfb, you need to infer feature bank with the [slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py](/configs/detection/lfb/slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py). For more details on infer feature bank, you can refer to [Train](#Train) part. -5. You can also dowonload long-term feature bank from [AVA_train_val_float32_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float32_lfb.rar) or [AVA_train_val_float16_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float16_lfb.rar), and then put them on `lfb_prefix_path`. -6. The ROIHead now supports single-label classification (i.e. the network outputs at most +5. The ROIHead now supports single-label classification (i.e. the network outputs at most one-label per actor). This can be done by (a) setting multilabel=False during training and the test_cfg.rcnn.action_thr for testing. @@ -42,7 +41,7 @@ Note: ### a. 
Infer long-term feature bank for training

-Before train or test lfb, you need to infer long-term feature bank first.
+Before train or test lfb, you need to infer long-term feature bank first. You can also download long-term feature bank from [AVA_train_val_float32_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float32_lfb.rar) or [AVA_train_val_float16_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float16_lfb.rar), and then put them under `lfb_prefix_path`. In this case, you can skip this step.

 Specifically, run the test on the training, validation, testing dataset with the config file [slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py](/configs/detection/lfb/slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py) (The config file will only infer the feature bank of training dataset and you need set `dataset_mode = 'val'` to infer the feature bank of validation dataset in the config file.), and the shared head [LFBInferHead](/mmaction/models/roi_heads/shared_heads/lfb_infer_head.py) will generate the feature bank.
@@ -52,12 +51,12 @@ You can use the following command to infer feature bank of AVA training and vali

 ```shell
 # set `dataset_mode = 'train'` in lfb_slowonly_r50_ava_infer.py
-python tools/test.py slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py \
-    checkpoints/YOUR_BASELINE_CHECKPOINT.pth --eval mAP
+python tools/test.py configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py \
+    checkpoints/YOUR_BASELINE_CHECKPOINT.pth

 # set `dataset_mode = 'val'` in lfb_slowonly_r50_ava_infer.py
-python tools/test.py slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py \
-    checkpoints/YOUR_BASELINE_CHECKPOINT.pth --eval mAP
+python tools/test.py configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py \
+    checkpoints/YOUR_BASELINE_CHECKPOINT.pth
 ```

 We use [slowonly_r50_4x16x1 checkpoint](https://download.openmmlab.com/mmaction/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb_20201217-40061d5f.pth) from [slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb](/configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) to infer feature bank.
diff --git a/configs/detection/lfb/metafile.yml b/configs/detection/lfb/metafile.yml
index 055032ad18..c1de15768f 100644
--- a/configs/detection/lfb/metafile.yml
+++ b/configs/detection/lfb/metafile.yml
@@ -22,7 +22,7 @@ Models:
   - Dataset: AVA v2.1
     Task: Action Detection
     Metrics:
-      mAP: 24.05
+      mAP: 24.11
     Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.log
     Weights: https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb_20220906-4c5b9f25.pth
diff --git a/configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py b/configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py
new file mode 100644
index 0000000000..278d87c1e1
--- /dev/null
+++ b/configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py
@@ -0,0 +1,114 @@
+# This config is used to generate long-term feature bank.
+_base_ = '../../_base_/default_runtime.py' + +# model settings +lfb_prefix_path = 'data/ava/lfb_half' +dataset_mode = 'train' # ['train', 'val', 'test'] + +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-' + 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' + 'kinetics400-rgb_20220901-e7b65fad.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5), + shared_head=dict( + type='LFBInferHead', + lfb_prefix_path=lfb_prefix_path, + dataset_mode=dataset_mode, + use_half_precision=True)), + data_preprocessor=dict( + type='ActionDataPreprocessor', + _scope_='mmaction', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +# dataset settings +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_infer = f'{anno_root}/ava_{dataset_mode}_v2.1.csv' + +exclude_file_infer = ( + f'{anno_root}/ava_{dataset_mode}_excluded_timestamps_v2.1.csv') + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_infer = ( + f'{anno_root}/ava_dense_proposals_{dataset_mode}.FAIR.recall_93.9.pkl') + +infer_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_infer, + exclude_file=exclude_file_infer, + pipeline=infer_pipeline, + label_file=label_file, + proposal_file=proposal_file_infer, + data_prefix=dict(img=data_root), + person_det_score_thr=0.9, + test_mode=True)) + +test_cfg = dict(type='TestLoop') +test_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_infer, + label_file=label_file, + exclude_file=exclude_file_infer, + action_thr=0.0) diff --git a/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py b/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py index 2da2bd3a7c..9d323ad0e4 100644 --- a/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py +++ b/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py @@ -1,6 +1,4 @@ -_base_ = [ - '../../_base_/default_runtime.py', '../_base_/models/slowonly_r50.py' -] +_base_ = 
'../../_base_/default_runtime.py' # model settings lfb_prefix_path = 'data/ava/lfb_half' @@ -10,8 +8,39 @@ lfb_channels = 2048 dataset_modes = ('train', 'val') +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-' + 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' + 'kinetics400-rgb_20220901-e7b65fad.pth') + model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2560, + num_classes=81, + multilabel=True, + dropout_ratio=0.5), shared_head=dict( type='FBOHead', lfb_cfg=dict( @@ -31,8 +60,28 @@ num_non_local_layers=2, st_feat_dropout_ratio=0.2, lt_feat_dropout_ratio=0.2, - pre_activate=True)), - bbox_head=dict(in_channels=2560))) + pre_activate=True))), + data_preprocessor=dict( + type='ActionDataPreprocessor', + _scope_='mmaction', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) dataset_type = 'AVADataset' data_root = 'data/ava/rawframes' diff --git a/mmaction/models/roi_heads/shared_heads/lfb.py b/mmaction/models/roi_heads/shared_heads/lfb.py index e8e7afff2a..986c784403 100644 --- a/mmaction/models/roi_heads/shared_heads/lfb.py +++ b/mmaction/models/roi_heads/shared_heads/lfb.py @@ -4,7 +4,6 @@ import os.path as osp import warnings -import numpy as np import torch import torch.distributed as dist from mmengine.dist import get_dist_info @@ -130,6 +129,13 @@ def load_lfb(self, map_location): osp.join(self.lfb_prefix_path, f'lfb_{dataset_mode}.pkl')) print(f'Loading LFB from {lfb_path}...') self.lfb.update(torch.load(lfb_path, map_location=map_location)) + + for video_id in self.lfb: + video_features = self.lfb[video_id] + for sec in video_features: + if isinstance(video_features[sec], (list, tuple)): + video_features[sec] = torch.stack(video_features[sec]) + self.lfb[video_id] = video_features print(f'LFB has been loaded on {map_location}.') def load_lfb_on_lmdb(self): @@ -162,22 +168,20 @@ def sample_long_term_features(self, video_id, timestamp): # Sample long term features. window_size, K = self.window_size, self.max_num_sampled_feat start = timestamp - (window_size // 2) - lt_feats = torch.zeros(window_size * K, self.lfb_channels) + lt_feats = torch.zeros(window_size, K, self.lfb_channels) for idx, sec in enumerate(range(start, start + window_size)): if sec in video_features: # `num_feat` is the number of roi features in this second. - num_feat = len(video_features[sec]) - num_feat_sampled = min(num_feat, K) - # Sample some roi features randomly. 
- random_lfb_indices = np.random.choice( - range(num_feat), num_feat_sampled, replace=False) + feat = video_features[sec] + num_feat = feat.shape[0] - for k, rand_idx in enumerate(random_lfb_indices): - lt_feats[idx * K + k] = video_features[sec][rand_idx] + # Sample some roi features randomly. + random_lfb_indices = torch.randperm(num_feat)[:K] + lt_feats[idx, :num_feat] = feat[random_lfb_indices] # [window_size * max_num_sampled_feat, lfb_channels] - return lt_feats + return lt_feats.reshape(-1, self.lfb_channels) def __getitem__(self, img_key): """Sample long term features like `lfb['0f39OWEqJ24,0902']` where `lfb` From b9aa560875ee126df616caf4c458a339a5a5305b Mon Sep 17 00:00:00 2001 From: Haodong Duan Date: Thu, 6 Apr 2023 11:42:46 +0800 Subject: [PATCH 28/36] [Refactoring] Faster AVA Evaluation using multiprocessing (#2146) --- .../object_detection_evaluation.py | 574 ------------------ .../ava_evaluation/per_image_evaluation.py | 358 ----------- .../ava_evaluation/standard_fields.py | 115 ---- mmaction/evaluation/functional/ava_utils.py | 162 +++-- mmaction/evaluation/metrics/ava_metric.py | 1 + 5 files changed, 111 insertions(+), 1099 deletions(-) delete mode 100644 mmaction/evaluation/functional/ava_evaluation/object_detection_evaluation.py delete mode 100644 mmaction/evaluation/functional/ava_evaluation/per_image_evaluation.py delete mode 100644 mmaction/evaluation/functional/ava_evaluation/standard_fields.py diff --git a/mmaction/evaluation/functional/ava_evaluation/object_detection_evaluation.py b/mmaction/evaluation/functional/ava_evaluation/object_detection_evaluation.py deleted file mode 100644 index 1886521485..0000000000 --- a/mmaction/evaluation/functional/ava_evaluation/object_detection_evaluation.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================= -"""object_detection_evaluation module. - -ObjectDetectionEvaluation is a class which manages ground truth information of -a object detection dataset, and computes frequently used detection metrics such -as Precision, Recall, CorLoc of the provided detection results. -It supports the following operations: -1) Add ground truth information of images sequentially. -2) Add detection result of images sequentially. -3) Evaluate detection metrics on already inserted detection results. -4) Write evaluation result into a pickle file for future processing or - visualization. - -Note: This module operates on numpy boxes and box lists. -""" - -import collections -import logging -import warnings -from abc import ABCMeta, abstractmethod -from collections import defaultdict - -import numpy as np - -from . import metrics, per_image_evaluation, standard_fields - - -class DetectionEvaluator: - """Interface for object detection evaluation classes. 
- - Example usage of the Evaluator: - ------------------------------ - evaluator = DetectionEvaluator(categories) - - # Detections and groundtruth for image 1. - evaluator.add_single_groundtruth_image_info(...) - evaluator.add_single_detected_image_info(...) - - # Detections and groundtruth for image 2. - evaluator.add_single_groundtruth_image_info(...) - evaluator.add_single_detected_image_info(...) - - metrics_dict = evaluator.evaluate() - """ - - __metaclass__ = ABCMeta - - def __init__(self, categories): - """Constructor. - - Args: - categories: A list of dicts, each of which has the following keys - - 'id': (required) an integer id uniquely identifying this - category. - 'name': (required) string representing category name e.g., - 'cat', 'dog'. - """ - self._categories = categories - - @abstractmethod - def add_single_ground_truth_image_info(self, image_id, groundtruth_dict): - """Adds groundtruth for a single image to be used for evaluation. - - Args: - image_id: A unique string/integer identifier for the image. - groundtruth_dict: A dictionary of groundtruth numpy arrays required - for evaluations. - """ - - @abstractmethod - def add_single_detected_image_info(self, image_id, detections_dict): - """Adds detections for a single image to be used for evaluation. - - Args: - image_id: A unique string/integer identifier for the image. - detections_dict: A dictionary of detection numpy arrays required - for evaluation. - """ - - @abstractmethod - def evaluate(self): - """Evaluates detections and returns a dictionary of metrics.""" - - @abstractmethod - def clear(self): - """Clears the state to prepare for a fresh evaluation.""" - - -class ObjectDetectionEvaluator(DetectionEvaluator): - """A class to evaluate detections.""" - - def __init__(self, - categories, - matching_iou_threshold=0.5, - evaluate_corlocs=False, - metric_prefix=None, - use_weighted_mean_ap=False, - evaluate_masks=False): - """Constructor. - - Args: - categories: A list of dicts, each of which has the following keys - - 'id': (required) an integer id uniquely identifying this - category. - 'name': (required) string representing category name e.g., - 'cat', 'dog'. - matching_iou_threshold: IOU threshold to use for matching - groundtruth boxes to detection boxes. - evaluate_corlocs: (optional) boolean which determines if corloc - scores are to be returned or not. - metric_prefix: (optional) string prefix for metric name; if None, - no prefix is used. - use_weighted_mean_ap: (optional) boolean which determines if the - mean average precision is computed directly from the scores and - tp_fp_labels of all classes. - evaluate_masks: If False, evaluation will be performed based on - boxes. If True, mask evaluation will be performed instead. - - Raises: - ValueError: If the category ids are not 1-indexed. 
- """ - super(ObjectDetectionEvaluator, self).__init__(categories) - self._num_classes = max([cat['id'] for cat in categories]) - if min(cat['id'] for cat in categories) < 1: - raise ValueError('Classes should be 1-indexed.') - self._matching_iou_threshold = matching_iou_threshold - self._use_weighted_mean_ap = use_weighted_mean_ap - self._label_id_offset = 1 - self._evaluate_masks = evaluate_masks - self._evaluation = ObjectDetectionEvaluation( - num_groundtruth_classes=self._num_classes, - matching_iou_threshold=self._matching_iou_threshold, - use_weighted_mean_ap=self._use_weighted_mean_ap, - label_id_offset=self._label_id_offset, - ) - self._image_ids = set([]) - self._evaluate_corlocs = evaluate_corlocs - self._metric_prefix = (metric_prefix + '_') if metric_prefix else '' - - def add_single_ground_truth_image_info(self, image_id, groundtruth_dict): - """Adds groundtruth for a single image to be used for evaluation. - - Args: - image_id: A unique string/integer identifier for the image. - groundtruth_dict: A dictionary containing - - standard_fields.InputDataFields.groundtruth_boxes: float32 - numpy array of shape [num_boxes, 4] containing `num_boxes` - groundtruth boxes of the format [ymin, xmin, ymax, xmax] in - absolute image coordinates. - standard_fields.InputDataFields.groundtruth_classes: integer - numpy array of shape [num_boxes] containing 1-indexed - groundtruth classes for the boxes. - standard_fields.InputDataFields.groundtruth_instance_masks: - Optional numpy array of shape [num_boxes, height, width] - with values in {0, 1}. - - Raises: - ValueError: On adding groundtruth for an image more than once. Will - also raise error if instance masks are not in groundtruth - dictionary. - """ - if image_id in self._image_ids: - raise ValueError( - 'Image with id {} already added.'.format(image_id)) - - groundtruth_classes = ( - groundtruth_dict[ - standard_fields.InputDataFields.groundtruth_classes] - - self._label_id_offset) - - groundtruth_masks = None - if self._evaluate_masks: - if (standard_fields.InputDataFields.groundtruth_instance_masks - not in groundtruth_dict): - raise ValueError( - 'Instance masks not in groundtruth dictionary.') - groundtruth_masks = groundtruth_dict[ - standard_fields.InputDataFields.groundtruth_instance_masks] - self._evaluation.add_single_ground_truth_image_info( - image_key=image_id, - groundtruth_boxes=groundtruth_dict[ - standard_fields.InputDataFields.groundtruth_boxes], - groundtruth_class_labels=groundtruth_classes, - groundtruth_masks=groundtruth_masks, - ) - self._image_ids.update([image_id]) - - def add_single_detected_image_info(self, image_id, detections_dict): - """Adds detections for a single image to be used for evaluation. - - Args: - image_id: A unique string/integer identifier for the image. - detections_dict: A dictionary containing - - standard_fields.DetectionResultFields.detection_boxes: float32 - numpy array of shape [num_boxes, 4] containing `num_boxes` - detection boxes of the format [ymin, xmin, ymax, xmax] in - absolute image coordinates. - standard_fields.DetectionResultFields.detection_scores: float32 - numpy array of shape [num_boxes] containing detection - scores for the boxes. - standard_fields.DetectionResultFields.detection_classes: - integer numpy array of shape [num_boxes] containing - 1-indexed detection classes for the boxes. - standard_fields.DetectionResultFields.detection_masks: uint8 - numpy array of shape [num_boxes, height, width] containing - `num_boxes` masks of values ranging between 0 and 1. 
- - Raises: - ValueError: If detection masks are not in detections dictionary. - """ - detection_classes = ( - detections_dict[ - standard_fields.DetectionResultFields.detection_classes] - - self._label_id_offset) - detection_masks = None - if self._evaluate_masks: - if (standard_fields.DetectionResultFields.detection_masks - not in detections_dict): - raise ValueError( - 'Detection masks not in detections dictionary.') - detection_masks = detections_dict[ - standard_fields.DetectionResultFields.detection_masks] - self._evaluation.add_single_detected_image_info( - image_key=image_id, - detected_boxes=detections_dict[ - standard_fields.DetectionResultFields.detection_boxes], - detected_scores=detections_dict[ - standard_fields.DetectionResultFields.detection_scores], - detected_class_labels=detection_classes, - detected_masks=detection_masks, - ) - - @staticmethod - def create_category_index(categories): - """Creates dictionary of COCO compatible categories keyed by category - id. - - Args: - categories: a list of dicts, each of which has the following keys: - 'id': (required) an integer id uniquely identifying this - category. - 'name': (required) string representing category name - e.g., 'cat', 'dog', 'pizza'. - - Returns: - category_index: a dict containing the same entries as categories, - but keyed by the 'id' field of each category. - """ - category_index = {} - for cat in categories: - category_index[cat['id']] = cat - return category_index - - def evaluate(self): - """Compute evaluation result. - - Returns: - A dictionary of metrics with the following fields - - - 1. summary_metrics: - 'Precision/mAP@IOU': mean average - precision at the specified IOU threshold - - 2. per_category_ap: category specific results with keys of the form - 'PerformanceByCategory/mAP@IOU/category' - """ - (per_class_ap, mean_ap, _, _, per_class_corloc, - mean_corloc) = self._evaluation.evaluate() - - metric = f'mAP@{self._matching_iou_threshold}IOU' - pascal_metrics = {self._metric_prefix + metric: mean_ap} - if self._evaluate_corlocs: - pascal_metrics[self._metric_prefix + - 'Precision/meanCorLoc@{}IOU'.format( - self._matching_iou_threshold)] = mean_corloc - category_index = self.create_category_index(self._categories) - for idx in range(per_class_ap.size): - if idx + self._label_id_offset in category_index: - display_name = ( - self._metric_prefix + - 'PerformanceByCategory/AP@{}IOU/{}'.format( - self._matching_iou_threshold, - category_index[idx + self._label_id_offset]['name'], - )) - pascal_metrics[display_name] = per_class_ap[idx] - - # Optionally add CorLoc metrics.classes - if self._evaluate_corlocs: - display_name = ( - self._metric_prefix + - 'PerformanceByCategory/CorLoc@{}IOU/{}'.format( - self._matching_iou_threshold, - category_index[idx + - self._label_id_offset]['name'], - )) - pascal_metrics[display_name] = per_class_corloc[idx] - - return pascal_metrics - - def clear(self): - """Clears the state to prepare for a fresh evaluation.""" - self._evaluation = ObjectDetectionEvaluation( - num_groundtruth_classes=self._num_classes, - matching_iou_threshold=self._matching_iou_threshold, - use_weighted_mean_ap=self._use_weighted_mean_ap, - label_id_offset=self._label_id_offset, - ) - self._image_ids.clear() - - -class PascalDetectionEvaluator(ObjectDetectionEvaluator): - """A class to evaluate detections using PASCAL metrics.""" - - def __init__(self, categories, matching_iou_threshold=0.5): - super(PascalDetectionEvaluator, self).__init__( - categories, - 
matching_iou_threshold=matching_iou_threshold, - evaluate_corlocs=False, - use_weighted_mean_ap=False, - ) - - -ObjectDetectionEvalMetrics = collections.namedtuple( - 'ObjectDetectionEvalMetrics', - [ - 'average_precisions', - 'mean_ap', - 'precisions', - 'recalls', - 'corlocs', - 'mean_corloc', - ], -) - - -class ObjectDetectionEvaluation: - """Internal implementation of Pascal object detection metrics.""" - - def __init__(self, - num_groundtruth_classes, - matching_iou_threshold=0.5, - nms_iou_threshold=1.0, - nms_max_output_boxes=10000, - use_weighted_mean_ap=False, - label_id_offset=0): - if num_groundtruth_classes < 1: - raise ValueError( - 'Need at least 1 groundtruth class for evaluation.') - - self.per_image_eval = per_image_evaluation.PerImageEvaluation( - num_groundtruth_classes=num_groundtruth_classes, - matching_iou_threshold=matching_iou_threshold, - ) - self.num_class = num_groundtruth_classes - self.use_weighted_mean_ap = use_weighted_mean_ap - self.label_id_offset = label_id_offset - - self.groundtruth_boxes = {} - self.groundtruth_class_labels = {} - self.groundtruth_masks = {} - self.num_gt_instances_per_class = np.zeros(self.num_class, dtype=int) - self.num_gt_imgs_per_class = np.zeros(self.num_class, dtype=int) - - self._initialize_detections() - - def _initialize_detections(self): - self.detection_keys = set() - self.scores_per_class = [[] for _ in range(self.num_class)] - self.tp_fp_labels_per_class = [[] for _ in range(self.num_class)] - self.num_images_correctly_detected_per_class = np.zeros(self.num_class) - self.average_precision_per_class = np.empty( - self.num_class, dtype=float) - self.average_precision_per_class.fill(np.nan) - self.precisions_per_class = [] - self.recalls_per_class = [] - self.corloc_per_class = np.ones(self.num_class, dtype=float) - - def clear_detections(self): - self._initialize_detections() - - def add_single_ground_truth_image_info(self, - image_key, - groundtruth_boxes, - groundtruth_class_labels, - groundtruth_masks=None): - """Adds groundtruth for a single image to be used for evaluation. - - Args: - image_key: A unique string/integer identifier for the image. - groundtruth_boxes: float32 numpy array of shape [num_boxes, 4] - containing `num_boxes` groundtruth boxes of the format - [ymin, xmin, ymax, xmax] in absolute image coordinates. - groundtruth_class_labels: integer numpy array of shape [num_boxes] - containing 0-indexed groundtruth classes for the boxes. - groundtruth_masks: uint8 numpy array of shape - [num_boxes, height, width] containing `num_boxes` groundtruth - masks. The mask values range from 0 to 1. - """ - if image_key in self.groundtruth_boxes: - warnings.warn(('image %s has already been added to the ground ' - 'truth database.'), image_key) - return - - self.groundtruth_boxes[image_key] = groundtruth_boxes - self.groundtruth_class_labels[image_key] = groundtruth_class_labels - self.groundtruth_masks[image_key] = groundtruth_masks - - self._update_ground_truth_statistics(groundtruth_class_labels) - - def add_single_detected_image_info(self, - image_key, - detected_boxes, - detected_scores, - detected_class_labels, - detected_masks=None): - """Adds detections for a single image to be used for evaluation. - - Args: - image_key: A unique string/integer identifier for the image. - detected_boxes: float32 numpy array of shape [num_boxes, 4] - containing `num_boxes` detection boxes of the format - [ymin, xmin, ymax, xmax] in absolute image coordinates. 
- detected_scores: float32 numpy array of shape [num_boxes] - containing detection scores for the boxes. - detected_class_labels: integer numpy array of shape [num_boxes] - containing 0-indexed detection classes for the boxes. - detected_masks: np.uint8 numpy array of shape - [num_boxes, height, width] containing `num_boxes` detection - masks with values ranging between 0 and 1. - - Raises: - ValueError: if the number of boxes, scores and class labels differ - in length. - """ - if len(detected_boxes) != len(detected_scores) or len( - detected_boxes) != len(detected_class_labels): - raise ValueError( - 'detected_boxes, detected_scores and ' - 'detected_class_labels should all have same lengths. Got' - '[%d, %d, %d]' % len(detected_boxes), - len(detected_scores), - len(detected_class_labels), - ) - - if image_key in self.detection_keys: - warnings.warn(('image %s has already been added to the ground ' - 'truth database.'), image_key) - return - - self.detection_keys.add(image_key) - if image_key in self.groundtruth_boxes: - groundtruth_boxes = self.groundtruth_boxes[image_key] - groundtruth_class_labels = self.groundtruth_class_labels[image_key] - # Masks are popped instead of look up. The reason is that we do not - # want to keep all masks in memory which can cause memory overflow. - groundtruth_masks = self.groundtruth_masks.pop(image_key) - else: - groundtruth_boxes = np.empty(shape=[0, 4], dtype=float) - groundtruth_class_labels = np.array([], dtype=int) - if detected_masks is None: - groundtruth_masks = None - else: - groundtruth_masks = np.empty(shape=[0, 1, 1], dtype=float) - ( - scores, - tp_fp_labels, - ) = self.per_image_eval.compute_object_detection_metrics( - detected_boxes=detected_boxes, - detected_scores=detected_scores, - detected_class_labels=detected_class_labels, - groundtruth_boxes=groundtruth_boxes, - groundtruth_class_labels=groundtruth_class_labels, - detected_masks=detected_masks, - groundtruth_masks=groundtruth_masks, - ) - - for i in range(self.num_class): - if scores[i].shape[0] > 0: - self.scores_per_class[i].append(scores[i]) - self.tp_fp_labels_per_class[i].append(tp_fp_labels[i]) - - def _update_ground_truth_statistics(self, groundtruth_class_labels): - """Update grouth truth statitistics. - - Args: - groundtruth_class_labels: An integer numpy array of length M, - representing M class labels of object instances in ground truth - """ - count = defaultdict(lambda: 0) - for label in groundtruth_class_labels: - count[label] += 1 - for k in count: - self.num_gt_instances_per_class[k] += count[k] - self.num_gt_imgs_per_class[k] += 1 - - def evaluate(self): - """Compute evaluation result. - - Returns: - A named tuple with the following fields - - average_precision: float numpy array of average precision for - each class. 
- mean_ap: mean average precision of all classes, float scalar - precisions: List of precisions, each precision is a float numpy - array - recalls: List of recalls, each recall is a float numpy array - corloc: numpy float array - mean_corloc: Mean CorLoc score for each class, float scalar - """ - if (self.num_gt_instances_per_class == 0).any(): - logging.info( - 'The following classes have no ground truth examples: %s', - np.squeeze(np.argwhere(self.num_gt_instances_per_class == 0)) + - self.label_id_offset) - - if self.use_weighted_mean_ap: - all_scores = np.array([], dtype=float) - all_tp_fp_labels = np.array([], dtype=bool) - - for class_index in range(self.num_class): - if self.num_gt_instances_per_class[class_index] == 0: - continue - if not self.scores_per_class[class_index]: - scores = np.array([], dtype=float) - tp_fp_labels = np.array([], dtype=bool) - else: - scores = np.concatenate(self.scores_per_class[class_index]) - tp_fp_labels = np.concatenate( - self.tp_fp_labels_per_class[class_index]) - if self.use_weighted_mean_ap: - all_scores = np.append(all_scores, scores) - all_tp_fp_labels = np.append(all_tp_fp_labels, tp_fp_labels) - precision, recall = metrics.compute_precision_recall( - scores, tp_fp_labels, - self.num_gt_instances_per_class[class_index]) - self.precisions_per_class.append(precision) - self.recalls_per_class.append(recall) - average_precision = metrics.compute_average_precision( - precision, recall) - self.average_precision_per_class[class_index] = average_precision - - self.corloc_per_class = metrics.compute_cor_loc( - self.num_gt_imgs_per_class, - self.num_images_correctly_detected_per_class) - - if self.use_weighted_mean_ap: - num_gt_instances = np.sum(self.num_gt_instances_per_class) - precision, recall = metrics.compute_precision_recall( - all_scores, all_tp_fp_labels, num_gt_instances) - mean_ap = metrics.compute_average_precision(precision, recall) - else: - mean_ap = np.nanmean(self.average_precision_per_class) - mean_corloc = np.nanmean(self.corloc_per_class) - return ObjectDetectionEvalMetrics( - self.average_precision_per_class, - mean_ap, - self.precisions_per_class, - self.recalls_per_class, - self.corloc_per_class, - mean_corloc, - ) diff --git a/mmaction/evaluation/functional/ava_evaluation/per_image_evaluation.py b/mmaction/evaluation/functional/ava_evaluation/per_image_evaluation.py deleted file mode 100644 index 9a6e0d9e40..0000000000 --- a/mmaction/evaluation/functional/ava_evaluation/per_image_evaluation.py +++ /dev/null @@ -1,358 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================= -"""Evaluate Object Detection result on a single image. - -Annotate each detected result as true positives or false positive according to -a predefined IOU ratio. Non Maximum Suppression is used by default. Multi class -detection is supported by default. 
Based on the settings, per image evaluation -is either performed on boxes or on object masks. -""" - -import numpy as np - -from . import np_box_list, np_box_ops - - -class PerImageEvaluation: - """Evaluate detection result of a single image.""" - - def __init__(self, num_groundtruth_classes, matching_iou_threshold=0.5): - """Initialized PerImageEvaluation by evaluation parameters. - - Args: - num_groundtruth_classes: Number of ground truth object classes - matching_iou_threshold: A ratio of area intersection to union, - which is the threshold to consider whether a detection is true - positive or not - """ - self.matching_iou_threshold = matching_iou_threshold - self.num_groundtruth_classes = num_groundtruth_classes - - def compute_object_detection_metrics(self, - detected_boxes, - detected_scores, - detected_class_labels, - groundtruth_boxes, - groundtruth_class_labels, - detected_masks=None, - groundtruth_masks=None): - """Evaluates detections as being tp, fp or ignored from a single image. - - The evaluation is done in two stages: - 1. All detections are matched to non group-of boxes. - - Args: - detected_boxes: A float numpy array of shape [N, 4], representing N - regions of detected object regions. - Each row is of the format [y_min, x_min, y_max, x_max] - detected_scores: A float numpy array of shape [N, 1], representing - the confidence scores of the detected N object instances. - detected_class_labels: A integer numpy array of shape [N, 1], - repreneting the class labels of the detected N object - instances. - groundtruth_boxes: A float numpy array of shape [M, 4], - representing M regions of object instances in ground truth - groundtruth_class_labels: An integer numpy array of shape [M, 1], - representing M class labels of object instances in ground truth - detected_masks: (optional) A uint8 numpy array of shape - [N, height, width]. If not None, the metrics will be computed - based on masks. - groundtruth_masks: (optional) A uint8 numpy array of shape - [M, height, width]. - - Returns: - scores: A list of C float numpy arrays. Each numpy array is of - shape [K, 1], representing K scores detected with object class - label c - tp_fp_labels: A list of C boolean numpy arrays. Each numpy array - is of shape [K, 1], representing K True/False positive label of - object instances detected with class label c - """ - ( - detected_boxes, - detected_scores, - detected_class_labels, - detected_masks, - ) = self._remove_invalid_boxes( - detected_boxes, - detected_scores, - detected_class_labels, - detected_masks, - ) - scores, tp_fp_labels = self._compute_tp_fp( - detected_boxes=detected_boxes, - detected_scores=detected_scores, - detected_class_labels=detected_class_labels, - groundtruth_boxes=groundtruth_boxes, - groundtruth_class_labels=groundtruth_class_labels, - detected_masks=detected_masks, - groundtruth_masks=groundtruth_masks, - ) - - return scores, tp_fp_labels - - def _compute_tp_fp(self, - detected_boxes, - detected_scores, - detected_class_labels, - groundtruth_boxes, - groundtruth_class_labels, - detected_masks=None, - groundtruth_masks=None): - """Labels true/false positives of detections of an image across all - classes. - - Args: - detected_boxes: A float numpy array of shape [N, 4], representing N - regions of detected object regions. - Each row is of the format [y_min, x_min, y_max, x_max] - detected_scores: A float numpy array of shape [N, 1], representing - the confidence scores of the detected N object instances. 
- detected_class_labels: A integer numpy array of shape [N, 1], - repreneting the class labels of the detected N object - instances. - groundtruth_boxes: A float numpy array of shape [M, 4], - representing M regions of object instances in ground truth - groundtruth_class_labels: An integer numpy array of shape [M, 1], - representing M class labels of object instances in ground truth - detected_masks: (optional) A np.uint8 numpy array of shape - [N, height, width]. If not None, the scores will be computed - based on masks. - groundtruth_masks: (optional) A np.uint8 numpy array of shape - [M, height, width]. - - Returns: - result_scores: A list of float numpy arrays. Each numpy array is of - shape [K, 1], representing K scores detected with object class - label c - result_tp_fp_labels: A list of boolean numpy array. Each numpy - array is of shape [K, 1], representing K True/False positive - label of object instances detected with class label c - - Raises: - ValueError: If detected masks is not None but groundtruth masks are - None, or the other way around. - """ - if detected_masks is not None and groundtruth_masks is None: - raise ValueError( - 'Detected masks is available but groundtruth masks is not.') - if detected_masks is None and groundtruth_masks is not None: - raise ValueError( - 'Groundtruth masks is available but detected masks is not.') - - result_scores = [] - result_tp_fp_labels = [] - for i in range(self.num_groundtruth_classes): - (gt_boxes_at_ith_class, gt_masks_at_ith_class, - detected_boxes_at_ith_class, detected_scores_at_ith_class, - detected_masks_at_ith_class) = self._get_ith_class_arrays( - detected_boxes, detected_scores, detected_masks, - detected_class_labels, groundtruth_boxes, groundtruth_masks, - groundtruth_class_labels, i) - scores, tp_fp_labels = self._compute_tp_fp_for_single_class( - detected_boxes=detected_boxes_at_ith_class, - detected_scores=detected_scores_at_ith_class, - groundtruth_boxes=gt_boxes_at_ith_class, - detected_masks=detected_masks_at_ith_class, - groundtruth_masks=gt_masks_at_ith_class, - ) - result_scores.append(scores) - result_tp_fp_labels.append(tp_fp_labels) - return result_scores, result_tp_fp_labels - - @staticmethod - def _get_overlaps_and_scores_box_mode(detected_boxes, detected_scores, - groundtruth_boxes): - """Computes overlaps and scores between detected and groudntruth boxes. - - Args: - detected_boxes: A numpy array of shape [N, 4] representing detected - box coordinates - detected_scores: A 1-d numpy array of length N representing - classification score - groundtruth_boxes: A numpy array of shape [M, 4] representing - ground truth box coordinates - - Returns: - iou: A float numpy array of size [num_detected_boxes, - num_gt_boxes]. If gt_non_group_of_boxlist.num_boxes() == 0 it - will be None. - ioa: A float numpy array of size [num_detected_boxes, - num_gt_boxes]. If gt_group_of_boxlist.num_boxes() == 0 it will - be None. - scores: The score of the detected boxlist. - num_boxes: Number of non-maximum suppressed detected boxes. 
- """ - detected_boxlist = np_box_list.BoxList(detected_boxes) - detected_boxlist.add_field('scores', detected_scores) - gt_non_group_of_boxlist = np_box_list.BoxList(groundtruth_boxes) - - iou = np_box_ops.iou(detected_boxlist.get(), - gt_non_group_of_boxlist.get()) - scores = detected_boxlist.get_field('scores') - num_boxes = detected_boxlist.num_boxes() - return iou, None, scores, num_boxes - - def _compute_tp_fp_for_single_class(self, - detected_boxes, - detected_scores, - groundtruth_boxes, - detected_masks=None, - groundtruth_masks=None): - """Labels boxes detected with the same class from the same image as - tp/fp. - - Args: - detected_boxes: A numpy array of shape [N, 4] representing detected - box coordinates - detected_scores: A 1-d numpy array of length N representing - classification score - groundtruth_boxes: A numpy array of shape [M, 4] representing - groundtruth box coordinates - detected_masks: (optional) A uint8 numpy array of shape - [N, height, width]. If not None, the scores will be computed - based on masks. - groundtruth_masks: (optional) A uint8 numpy array of shape - [M, height, width]. - - Returns: - Two arrays of the same size, containing all boxes that were - evaluated as being true positives or false positives. - - scores: A numpy array representing the detection scores. - tp_fp_labels: a boolean numpy array indicating whether a detection - is a true positive. - """ - if detected_boxes.size == 0: - return np.array([], dtype=float), np.array([], dtype=bool) - - (iou, _, scores, - num_detected_boxes) = self._get_overlaps_and_scores_box_mode( - detected_boxes=detected_boxes, - detected_scores=detected_scores, - groundtruth_boxes=groundtruth_boxes) - - if groundtruth_boxes.size == 0: - return scores, np.zeros(num_detected_boxes, dtype=bool) - - tp_fp_labels = np.zeros(num_detected_boxes, dtype=bool) - - # The evaluation is done in two stages: - # 1. All detections are matched to non group-of boxes. - # 2. Detections that are determined as false positives are matched - # against group-of boxes and ignored if matched. - - # Tp-fp evaluation for non-group of boxes (if any). - if iou.shape[1] > 0: - max_overlap_gt_ids = np.argmax(iou, axis=1) - is_gt_box_detected = np.zeros(iou.shape[1], dtype=bool) - for i in range(num_detected_boxes): - gt_id = max_overlap_gt_ids[i] - if iou[i, gt_id] >= self.matching_iou_threshold: - if not is_gt_box_detected[gt_id]: - tp_fp_labels[i] = True - is_gt_box_detected[gt_id] = True - - return scores, tp_fp_labels - - @staticmethod - def _get_ith_class_arrays(detected_boxes, detected_scores, detected_masks, - detected_class_labels, groundtruth_boxes, - groundtruth_masks, groundtruth_class_labels, - class_index): - """Returns numpy arrays belonging to class with index `class_index`. - - Args: - detected_boxes: A numpy array containing detected boxes. - detected_scores: A numpy array containing detected scores. - detected_masks: A numpy array containing detected masks. - detected_class_labels: A numpy array containing detected class - labels. - groundtruth_boxes: A numpy array containing groundtruth boxes. - groundtruth_masks: A numpy array containing groundtruth masks. - groundtruth_class_labels: A numpy array containing groundtruth - class labels. - class_index: An integer index. - - Returns: - gt_boxes_at_ith_class: A numpy array containing groundtruth boxes - labeled as ith class. - gt_masks_at_ith_class: A numpy array containing groundtruth masks - labeled as ith class. 
- detected_boxes_at_ith_class: A numpy array containing detected - boxes corresponding to the ith class. - detected_scores_at_ith_class: A numpy array containing detected - scores corresponding to the ith class. - detected_masks_at_ith_class: A numpy array containing detected - masks corresponding to the ith class. - """ - selected_groundtruth = groundtruth_class_labels == class_index - gt_boxes_at_ith_class = groundtruth_boxes[selected_groundtruth] - if groundtruth_masks is not None: - gt_masks_at_ith_class = groundtruth_masks[selected_groundtruth] - else: - gt_masks_at_ith_class = None - selected_detections = detected_class_labels == class_index - detected_boxes_at_ith_class = detected_boxes[selected_detections] - detected_scores_at_ith_class = detected_scores[selected_detections] - if detected_masks is not None: - detected_masks_at_ith_class = detected_masks[selected_detections] - else: - detected_masks_at_ith_class = None - return (gt_boxes_at_ith_class, gt_masks_at_ith_class, - detected_boxes_at_ith_class, detected_scores_at_ith_class, - detected_masks_at_ith_class) - - @staticmethod - def _remove_invalid_boxes(detected_boxes, - detected_scores, - detected_class_labels, - detected_masks=None): - """Removes entries with invalid boxes. - - A box is invalid if either its xmax is smaller than its xmin, or its - ymax is smaller than its ymin. - - Args: - detected_boxes: A float numpy array of size [num_boxes, 4] - containing box coordinates in [ymin, xmin, ymax, xmax] format. - detected_scores: A float numpy array of size [num_boxes]. - detected_class_labels: A int32 numpy array of size [num_boxes]. - detected_masks: A uint8 numpy array of size - [num_boxes, height, width]. - - Returns: - valid_detected_boxes: A float numpy array of size - [num_valid_boxes, 4] containing box coordinates in - [ymin, xmin, ymax, xmax] format. - valid_detected_scores: A float numpy array of size - [num_valid_boxes]. - valid_detected_class_labels: A int32 numpy array of size - [num_valid_boxes]. - valid_detected_masks: A uint8 numpy array of size - [num_valid_boxes, height, width]. - """ - valid_indices = np.logical_and( - detected_boxes[:, 0] < detected_boxes[:, 2], - detected_boxes[:, 1] < detected_boxes[:, 3]) - detected_boxes = detected_boxes[valid_indices] - detected_scores = detected_scores[valid_indices] - detected_class_labels = detected_class_labels[valid_indices] - if detected_masks is not None: - detected_masks = detected_masks[valid_indices] - return [ - detected_boxes, detected_scores, detected_class_labels, - detected_masks - ] diff --git a/mmaction/evaluation/functional/ava_evaluation/standard_fields.py b/mmaction/evaluation/functional/ava_evaluation/standard_fields.py deleted file mode 100644 index 8edf46d081..0000000000 --- a/mmaction/evaluation/functional/ava_evaluation/standard_fields.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================= -"""Contains classes specifying naming conventions used for object detection. - -Specifies: - InputDataFields: standard fields used by reader/preprocessor/batcher. - DetectionResultFields: standard fields returned by object detector. -""" - - -class InputDataFields: - """Names for the input tensors. - - Holds the standard data field names to use for identifying input tensors. - This should be used by the decoder to identify keys for the returned - tensor_dict containing input tensors. And it should be used by the model to - identify the tensors it needs. - - Attributes: - image: image. - original_image: image in the original input size. - key: unique key corresponding to image. - source_id: source of the original image. - filename: original filename of the dataset (without common path). - groundtruth_image_classes: image-level class labels. - groundtruth_boxes: coordinates of the ground truth boxes in the image. - groundtruth_classes: box-level class labels. - groundtruth_label_types: box-level label types (e.g. explicit - negative). - groundtruth_is_crowd: [DEPRECATED, use groundtruth_group_of instead] - is the groundtruth a single object or a crowd. - groundtruth_area: area of a groundtruth segment. - groundtruth_difficult: is a `difficult` object - groundtruth_group_of: is a `group_of` objects, e.g. multiple objects of - the same class, forming a connected group, where instances are - heavily occluding each other. - proposal_boxes: coordinates of object proposal boxes. - proposal_objectness: objectness score of each proposal. - groundtruth_instance_masks: ground truth instance masks. - groundtruth_instance_boundaries: ground truth instance boundaries. - groundtruth_instance_classes: instance mask-level class labels. - groundtruth_keypoints: ground truth keypoints. - groundtruth_keypoint_visibilities: ground truth keypoint visibilities. - groundtruth_label_scores: groundtruth label scores. - groundtruth_weights: groundtruth weight factor for bounding boxes. - num_groundtruth_boxes: number of groundtruth boxes. - true_image_shapes: true shapes of images in the resized images, as - resized images can be padded with zeros. - """ - - image = 'image' - original_image = 'original_image' - key = 'key' - source_id = 'source_id' - filename = 'filename' - groundtruth_image_classes = 'groundtruth_image_classes' - groundtruth_boxes = 'groundtruth_boxes' - groundtruth_classes = 'groundtruth_classes' - groundtruth_label_types = 'groundtruth_label_types' - groundtruth_is_crowd = 'groundtruth_is_crowd' - groundtruth_area = 'groundtruth_area' - groundtruth_difficult = 'groundtruth_difficult' - groundtruth_group_of = 'groundtruth_group_of' - proposal_boxes = 'proposal_boxes' - proposal_objectness = 'proposal_objectness' - groundtruth_instance_masks = 'groundtruth_instance_masks' - groundtruth_instance_boundaries = 'groundtruth_instance_boundaries' - groundtruth_instance_classes = 'groundtruth_instance_classes' - groundtruth_keypoints = 'groundtruth_keypoints' - groundtruth_keypoint_visibilities = 'groundtruth_keypoint_visibilities' - groundtruth_label_scores = 'groundtruth_label_scores' - groundtruth_weights = 'groundtruth_weights' - num_groundtruth_boxes = 'num_groundtruth_boxes' - true_image_shape = 'true_image_shape' - - -class DetectionResultFields: - """Naming conventions for storing the output of the detector. - - Attributes: - source_id: source of the original image. - key: unique key corresponding to image. 
- detection_boxes: coordinates of the detection boxes in the image. - detection_scores: detection scores for the detection boxes in the - image. - detection_classes: detection-level class labels. - detection_masks: contains a segmentation mask for each detection box. - detection_boundaries: contains an object boundary for each detection - box. - detection_keypoints: contains detection keypoints for each detection - box. - num_detections: number of detections in the batch. - """ - - source_id = 'source_id' - key = 'key' - detection_boxes = 'detection_boxes' - detection_scores = 'detection_scores' - detection_classes = 'detection_classes' - detection_masks = 'detection_masks' - detection_boundaries = 'detection_boundaries' - detection_keypoints = 'detection_keypoints' - num_detections = 'num_detections' diff --git a/mmaction/evaluation/functional/ava_utils.py b/mmaction/evaluation/functional/ava_utils.py index cb739a4a9b..c15737632c 100644 --- a/mmaction/evaluation/functional/ava_utils.py +++ b/mmaction/evaluation/functional/ava_utils.py @@ -3,14 +3,13 @@ # https://github.com/activitynet/ActivityNet/blob/master/ # Evaluation/get_ava_performance.py. Some unused codes are removed. import csv -import logging +import multiprocessing import time from collections import defaultdict import numpy as np -from .ava_evaluation import object_detection_evaluation as det_eval -from .ava_evaluation import standard_fields +from .ava_evaluation import metrics, np_box_list, np_box_ops def det2csv(results, custom_classes): @@ -42,7 +41,7 @@ def results2csv(results, out_file, custom_classes=None): # save space for float def to_str(item): if isinstance(item, float): - return f'{item:.3f}' + return f'{item:.4f}' return str(item) with open(out_file, 'w') as f: @@ -80,7 +79,6 @@ def read_csv(csv_file, class_whitelist=None): of score values labels, matching the corresponding label in `labels`. If scores are not provided in the csv, then they will default to 1.0. 
""" - start = time.time() entries = defaultdict(list) boxes = defaultdict(list) labels = defaultdict(list) @@ -107,7 +105,6 @@ def read_csv(csv_file, class_whitelist=None): labels[image_key] = [x[1] for x in entry] scores[image_key] = [x[0] for x in entry] - print_time('read file ' + csv_file.name, start) return boxes, labels, scores @@ -157,6 +154,51 @@ def read_labelmap(labelmap_file): return labelmap, class_ids +def get_overlaps_and_scores_box_mode(detected_boxes, detected_scores, + groundtruth_boxes): + + detected_boxlist = np_box_list.BoxList(detected_boxes) + detected_boxlist.add_field('scores', detected_scores) + gt_non_group_of_boxlist = np_box_list.BoxList(groundtruth_boxes) + + iou = np_box_ops.iou(detected_boxlist.get(), gt_non_group_of_boxlist.get()) + scores = detected_boxlist.get_field('scores') + num_boxes = detected_boxlist.num_boxes() + return iou, scores, num_boxes + + +def tpfp_single(tup, threshold=0.5): + gt_bboxes, gt_labels, bboxes, labels, scores = tup + ret_scores, ret_tp_fp_labels = dict(), dict() + all_labels = list(set(labels)) + for label in all_labels: + gt_bbox = np.array( + [x for x, y in zip(gt_bboxes, gt_labels) if y == label], + dtype=np.float32).reshape(-1, 4) + bbox = np.array([x for x, y in zip(bboxes, labels) if y == label], + dtype=np.float32).reshape(-1, 4) + score = np.array([x for x, y in zip(scores, labels) if y == label], + dtype=np.float32).reshape(-1) + iou, score, num_boxes = get_overlaps_and_scores_box_mode( + bbox, score, gt_bbox) + if gt_bbox.size == 0: + ret_scores[label] = score + ret_tp_fp_labels[label] = np.zeros(num_boxes, dtype=bool) + continue + tp_fp_labels = np.zeros(num_boxes, dtype=bool) + if iou.shape[1] > 0: + max_overlap_gt_ids = np.argmax(iou, axis=1) + is_gt_box_detected = np.zeros(iou.shape[1], dtype=bool) + for i in range(num_boxes): + gt_id = max_overlap_gt_ids[i] + if iou[i, gt_id] >= threshold: + if not is_gt_box_detected[gt_id]: + tp_fp_labels[i] = True + is_gt_box_detected[gt_id] = True + ret_scores[label], ret_tp_fp_labels[label] = score, tp_fp_labels + return ret_scores, ret_tp_fp_labels + + # Seems there is at most 100 detections for each image def ava_eval(result_file, result_type, @@ -164,10 +206,11 @@ def ava_eval(result_file, ann_file, exclude_file, verbose=True, + ignore_empty_frames=True, custom_classes=None): """Perform ava evaluation.""" - assert result_type in ['mAP'] + assert result_type in ['mAP'] start = time.time() categories, class_whitelist = read_labelmap(open(label_file)) if custom_classes is not None: @@ -177,9 +220,9 @@ def ava_eval(result_file, categories = [cat for cat in categories if cat['id'] in custom_classes] # loading gt, do not need gt score - gt_boxes, gt_labels, _ = read_csv(open(ann_file), class_whitelist) + gt_bboxes, gt_labels, _ = read_csv(open(ann_file), class_whitelist) if verbose: - print_time('Reading detection results', start) + print_time('Reading GT results', start) if exclude_file is not None: excluded_keys = read_exclusions(open(exclude_file)) @@ -189,54 +232,69 @@ def ava_eval(result_file, start = time.time() boxes, labels, scores = read_csv(open(result_file), class_whitelist) if verbose: - print_time('Reading detection results', start) - - # Evaluation for mAP - pascal_evaluator = det_eval.PascalDetectionEvaluator(categories) + print_time('Reading Detection results', start) start = time.time() - for image_key in gt_boxes: - if verbose and image_key in excluded_keys: - logging.info( - 'Found excluded timestamp in detections: %s.' 
- 'It will be ignored.', image_key) - continue - pascal_evaluator.add_single_ground_truth_image_info( - image_key, { - standard_fields.InputDataFields.groundtruth_boxes: - np.array(gt_boxes[image_key], dtype=float), - standard_fields.InputDataFields.groundtruth_classes: - np.array(gt_labels[image_key], dtype=int) - }) + all_gt_labels = np.concatenate(list(gt_labels.values())) + gt_count = {k: np.sum(all_gt_labels == k) for k in class_whitelist} + + pool = multiprocessing.Pool(32) + if ignore_empty_frames: + tups = [(gt_bboxes[k], gt_labels[k], boxes[k], labels[k], scores[k]) + for k in gt_bboxes if k not in excluded_keys] + else: + tups = [(gt_bboxes.get(k, np.zeros((0, 4), dtype=np.float32)), + gt_labels.get(k, []), boxes[k], labels[k], scores[k]) + for k in boxes if k not in excluded_keys] + rets = pool.map(tpfp_single, tups) + if verbose: - print_time('Convert groundtruth', start) + print_time('Calculating TP/FP', start) start = time.time() - for image_key in boxes: - if verbose and image_key in excluded_keys: - logging.info( - 'Found excluded timestamp in detections: %s.' - 'It will be ignored.', image_key) - continue - pascal_evaluator.add_single_detected_image_info( - image_key, { - standard_fields.DetectionResultFields.detection_boxes: - np.array(boxes[image_key], dtype=float), - standard_fields.DetectionResultFields.detection_classes: - np.array(labels[image_key], dtype=int), - standard_fields.DetectionResultFields.detection_scores: - np.array(scores[image_key], dtype=float) - }) + scores, tpfps = defaultdict(list), defaultdict(list) + for score, tpfp in rets: + for k in score: + scores[k].append(score[k]) + tpfps[k].append(tpfp[k]) + + cls_AP = [] + for k in scores: + scores[k] = np.concatenate(scores[k]) + tpfps[k] = np.concatenate(tpfps[k]) + precision, recall = metrics.compute_precision_recall( + scores[k], tpfps[k], gt_count[k]) + ap = metrics.compute_average_precision(precision, recall) + class_name = [x['name'] for x in categories if x['id'] == k] + assert len(class_name) == 1 + class_name = class_name[0] + cls_AP.append((k, class_name, ap)) if verbose: - print_time('convert detections', start) + print_time('Run Evaluator', start) + + print('Per-class results: ', flush=True) + for k, class_name, ap in cls_AP: + print(f'Index: {k}, Action: {class_name}: AP: {ap:.4f};', flush=True) + + overall = np.nanmean([x[2] for x in cls_AP]) + person_movement = np.nanmean([x[2] for x in cls_AP if x[0] <= 14]) + object_manipulation = np.nanmean([x[2] for x in cls_AP if 14 < x[0] < 64]) + person_interaction = np.nanmean([x[2] for x in cls_AP if 64 <= x[0]]) + + print('Overall Results: ', flush=True) + print(f'Overall mAP: {overall:.4f}', flush=True) + print(f'Person Movement mAP: {person_movement:.4f}', flush=True) + print(f'Object Manipulation mAP: {object_manipulation:.4f}', flush=True) + print(f'Person Interaction mAP: {person_interaction:.4f}', flush=True) + + results = {} + results['overall'] = overall + results['person_movement'] = person_movement + results['object_manipulation'] = object_manipulation + results['person_interaction'] = person_interaction - start = time.time() - metrics = pascal_evaluator.evaluate() if verbose: - print_time('run_evaluator', start) - for display_name in metrics: - print(f'{display_name}=\t{metrics[display_name]}') - return { - display_name: metrics[display_name] - for display_name in metrics if 'ByCategory' not in display_name - } + for k, class_name, ap in cls_AP: + print(f'Class {class_name} AP: {ap:.4f}', flush=True) + + return results diff --git 
a/mmaction/evaluation/metrics/ava_metric.py b/mmaction/evaluation/metrics/ava_metric.py index 66e8fdcc4a..76cc83e6c5 100644 --- a/mmaction/evaluation/metrics/ava_metric.py +++ b/mmaction/evaluation/metrics/ava_metric.py @@ -81,6 +81,7 @@ def compute_metrics(self, results: list) -> dict: self.label_file, self.ann_file, self.exclude_file, + ignore_empty_frames=True, custom_classes=self.custom_classes) os.remove(temp_file) From d31224809e3df0b54170a0e60309369dbe9e7953 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 6 Apr 2023 14:46:56 +0800 Subject: [PATCH 29/36] [Fix] fix flip config of sthsth dataset (#2247) --- configs/recognition/tpn/README.md | 2 +- configs/recognition/tpn/metafile.yml | 6 ++-- ...retrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py | 4 ++- configs/recognition/tsm/README.md | 10 +++--- configs/recognition/tsm/metafile.yml | 34 +++++++++---------- ...etrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py | 2 +- ...etrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py | 5 +-- ...retrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py | 5 +-- configs/recognition/tsn/README.md | 6 ++-- configs/recognition/tsn/metafile.yml | 12 +++---- ...etrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py | 3 +- ...retrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py | 3 +- 12 files changed, 49 insertions(+), 43 deletions(-) diff --git a/configs/recognition/tpn/README.md b/configs/recognition/tpn/README.md index 972dbcbc7b..20a488ccb1 100644 --- a/configs/recognition/tpn/README.md +++ b/configs/recognition/tpn/README.md @@ -29,7 +29,7 @@ Visual tempo characterizes the dynamics and the temporal scale of an action. Mod | frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | inference time(video/s) | gpu_mem(M) | config | ckpt | log | | :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :----------------: | :----------------: | :--------------: | :---------------------: | :--------: | :---------------: | :-------------: | :------------: | -| 1x1x8 | height 100 | 8x6 | ResNet50 | TSM | 48.98 | 78.91 | x | x | 8 clips x 3 crop | x | 8828 | [config](/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb_20220913-d2f5c300.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.log) | +| 1x1x8 | height 100 | 8x6 | ResNet50 | TSM | 51.87 | 79.67 | x | x | 8 clips x 3 crop | x | 8828 | [config](/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb_20230221-940a3615.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.log) | :::{note} diff --git a/configs/recognition/tpn/metafile.yml b/configs/recognition/tpn/metafile.yml index 702da581e0..ce953f2e89 100644 --- a/configs/recognition/tpn/metafile.yml +++ b/configs/recognition/tpn/metafile.yml @@ -66,8 +66,8 @@ Models: Results: - Dataset: SthV1 
Metrics: - Top 1 Accuracy: 48.98 - Top 5 Accuracy: 78.91 + Top 1 Accuracy: 51.87 + Top 5 Accuracy: 79.67 Task: Action Recognition Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb_20220913-d2f5c300.pth + Weights: (https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb_20230221-940a3615.pth diff --git a/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py b/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py index d833687d6a..b614d725f7 100644 --- a/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py +++ b/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py @@ -8,12 +8,14 @@ ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' + +sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52} train_pipeline = [ dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), dict(type='RawFrameDecode'), dict(type='RandomResizedCrop'), dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv1_flip_label_map), dict(type='ColorJitter'), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') diff --git a/configs/recognition/tsm/README.md b/configs/recognition/tsm/README.md index ca490117c3..5e5162de83 100644 --- a/configs/recognition/tsm/README.md +++ b/configs/recognition/tsm/README.md @@ -32,11 +32,11 @@ The explosive growth in video streaming gives rise to challenges on performing v ### Something-something V2 -| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | -| :---------------------: | :--------: | :--: | :-------: | :------: | :------: | :------: | :----------------: | :----: | :----: | :---------------------------------: | :-------------------------------: | :------------------------------: | -| 1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 60.20 | 86.13 | 8 clips x 10 crop | 32.88G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20221122-446d261a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log) | -| 1x1x16 | 224x224 | 8 | ResNet50 | ImageNet | 62.46 | 87.75 | 16 clips x 10 crop | 65.75G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20221122-b1fb8264.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.log) | -| 1x1x8 | 224x224 | 8 | ResNet101 | ImageNet | 60.49 | 85.99 | 8 clips x 10 crop | 62.66G | 42.86M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20221122-cb2cc64e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.log) | +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :-------: | :------: | :------: | :------: | :---------------: | :----: | :----: | :---------------------------------: | :-------------------------------: | :-------------------------------: | +| 1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 62.72 | 87.70 | 8 clips x 3 crop | 32.88G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20230317-be0fc26e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log) | +| 1x1x16 | 224x224 | 8 | ResNet50 | ImageNet | 64.16 | 88.61 | 16 clips x 3 crop | 65.75G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20230317-ec6696ad.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.log) | +| 1x1x8 | 224x224 | 8 | ResNet101 | ImageNet | 63.70 | 88.28 | 8 clips x 3 crop | 62.66G | 42.86M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20230320-efcc0d1b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.log) | 1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). 
The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. diff --git a/configs/recognition/tsm/metafile.yml b/configs/recognition/tsm/metafile.yml index 5adafb069f..64d37461d4 100644 --- a/configs/recognition/tsm/metafile.yml +++ b/configs/recognition/tsm/metafile.yml @@ -178,17 +178,17 @@ Models: Parameters: 23.87M Pretrained: ImageNet Resolution: 224x224 - Training Data: Kinetics-400 + Training Data: SthV2 Training Resources: 8 GPUs Modality: RGB Results: - - Dataset: Kinetics-400 + - Dataset: SthV2 Task: Action Recognition Metrics: - Top 1 Accuracy: 60.20 - Top 5 Accuracy: 86.13 + Top 1 Accuracy: 62.72 + Top 5 Accuracy: 87.70 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20221122-446d261a.pth + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20230317-be0fc26e.pth - Name: tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb Config: configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py @@ -196,22 +196,22 @@ Models: Metadata: Architecture: ResNet50 Batch Size: 16 - Epochs: 100 + Epochs: 50 FLOPs: 65.75G Parameters: 23.87M Pretrained: ImageNet Resolution: 224x224 - Training Data: Kinetics-400 + Training Data: SthV2 Training Resources: 8 GPUs Modality: RGB Results: - - Dataset: Kinetics-400 + - Dataset: SthV2 Task: Action Recognition Metrics: - Top 1 Accuracy: 62.46 - Top 5 Accuracy: 87.75 + Top 1 Accuracy: 64.16 + Top 5 Accuracy: 88.61 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20221122-b1fb8264.pth + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20230317-ec6696ad.pth - Name: tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb Config: configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py @@ -219,19 +219,19 @@ Models: Metadata: Architecture: ResNet101 Batch Size: 16 - Epochs: 100 + Epochs: 50 FLOPs: 62.66G Parameters: 42.86M Pretrained: ImageNet Resolution: 224x224 - Training Data: Kinetics-400 + Training Data: SthV2 Training Resources: 8 GPUs Modality: RGB Results: - - Dataset: Kinetics-400 + - Dataset: SthV2 Task: Action Recognition Metrics: - Top 1 Accuracy: 60.49 - Top 5 Accuracy: 85.99 + Top 1 Accuracy: 63.70 + Top 5 Accuracy: 88.28 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.log - Weights: 
https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20221122-cb2cc64e.pth + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20230320-efcc0d1b.pth diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py index 9429730700..7cb4b48ac7 100644 --- a/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py @@ -1,6 +1,6 @@ _base_ = ['tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py'] # model settings -r101_checkpoint = 'https://download.pytorch.org/models/resnet101-cd907fc2.pth' +r101_checkpoint = 'torchvision://resnet101' model = dict(backbone=dict(pretrained=r101_checkpoint, depth=101)) diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py index 691e39c2b2..36b1eefcf0 100644 --- a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py @@ -4,6 +4,7 @@ file_client_args = dict(io_backend='disk') +sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166} train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16), @@ -17,7 +18,7 @@ max_wh_scale_gap=1, num_fixed_crops=13), dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') ] @@ -46,7 +47,7 @@ test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), - dict(type='TenCrop', crop_size=224), + dict(type='ThreeCrop', crop_size=256), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') ] diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py index ba9c393593..8248bcb02b 100644 --- a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py @@ -11,6 +11,7 @@ file_client_args = dict(io_backend='disk') +sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166} train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), @@ -24,7 +25,7 @@ max_wh_scale_gap=1, num_fixed_crops=13), dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') ] @@ -53,7 +54,7 @@ twice_sample=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), - dict(type='TenCrop', crop_size=224), + dict(type='ThreeCrop', crop_size=256), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') ] diff --git 
a/configs/recognition/tsn/README.md b/configs/recognition/tsn/README.md index d34d1ab433..1b6e34fdc1 100644 --- a/configs/recognition/tsn/README.md +++ b/configs/recognition/tsn/README.md @@ -32,8 +32,8 @@ Deep convolutional networks have achieved great success for visual recognition i | frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | | :---------------------: | :-------: | :--------: | :--: | :------: | :------: | :------: | :------: | :----------------: | :----: | :----: | :------------------------------: | -----------------------------: | -----------------------------: | -| 1x1x8 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 34.85 | 66.37 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb_20221122-ad2dbb37.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.log) | -| 1x1x16 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 36.55 | 68.00 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb_20221122-ee13c8e2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.log) | +| 1x1x8 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 35.51 | 67.09 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb_20230313-06ad7d03.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.log) | +| 1x1x16 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 36.91 | 68.77 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb_20230221-85bcc1c3.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.log) | ### Using backbones from 3rd-party in TSN @@ -49,7 +49,7 @@ It's possible and convenient to use a 3rd-party backbone for TSN under the frame | 1x1x3 | MultiStep | 224x224 | 8 | DenseNet161 | ImageNet | 72.07 | 90.15 | 25 clips x 10 crop | 194.6G | 27.36M | [config](/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb_20220906-5f4c0daf.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb.log) | | 1x1x3 | MultiStep | 224x224 | 8 | Swin Transformer | ImageNet | 77.03 | 92.61 | 25 clips x 10 crop | 386.7G | 87.15M | [config](/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb_20220906-65ed814e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.log) | -1. Note that some backbones in TIMM are not supported due to multiple reasons. Please refer to to [PR #880](https://github.com/open-mmlab/mmaction2/pull/880) for details. +1. Note that some backbones in TIMM are not supported due to multiple reasons. Please refer to [PR #880](https://github.com/open-mmlab/mmaction2/pull/880) for details. 2. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 3. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. 
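Editor's note on the flip fix in this patch: for the Something-Something datasets, horizontally flipping a clip turns actions whose labels encode a left/right direction into their mirrored counterparts, so the `Flip` transform in the configs above now receives a `flip_label_map` that swaps those class indices whenever a flip is actually applied. The snippet below is only an illustrative sketch of that behaviour using a hypothetical `flip_clip` helper (it is not the MMAction2 `Flip` transform itself); the mapping dict is the `sthv2_flip_label_map` introduced in the configs of this patch.

```python
# Illustrative sketch (hypothetical helper, not the MMAction2 `Flip` transform):
# shows what `flip_label_map` does during training-time augmentation.
import random

import numpy as np

# Mapping taken from the sthv2 configs added in this patch.
sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166}


def flip_clip(frames, label, flip_ratio=0.5, flip_label_map=None):
    """Horizontally flip a list of H x W x C frames with probability
    `flip_ratio`, remapping the label if the action is direction-sensitive."""
    if random.random() < flip_ratio:
        frames = [np.ascontiguousarray(f[:, ::-1, :]) for f in frames]
        if flip_label_map is not None:
            label = flip_label_map.get(label, label)
    return frames, label


# Example: a clip labelled 86 is relabelled 87 whenever it gets flipped.
frames = [np.zeros((224, 224, 3), dtype=np.uint8) for _ in range(8)]
_, new_label = flip_clip(
    frames, 86, flip_ratio=1.0, flip_label_map=sthv2_flip_label_map)
assert new_label == 87
```

Note also that, for the TSM configs in this patch, the test-time crop changed from `TenCrop(224)` to `ThreeCrop(256)`, which is why the reported testing protocol in the tables above moved from 10-crop to 3-crop alongside the flip fix.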
diff --git a/configs/recognition/tsn/metafile.yml b/configs/recognition/tsn/metafile.yml index e618ed71cc..37943e673b 100644 --- a/configs/recognition/tsn/metafile.yml +++ b/configs/recognition/tsn/metafile.yml @@ -210,10 +210,10 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 34.85 - Top 5 Accuracy: 66.37 + Top 1 Accuracy: 35.51 + Top 5 Accuracy: 67.09 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb_20221122-ad2dbb37.pth + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb_20230313-06ad7d03.pth - Name: tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb Config: configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py @@ -233,7 +233,7 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 36.55 - Top 5 Accuracy: 68.00 + Top 1 Accuracy: 36.91 + Top 5 Accuracy: 68.77 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb_20221122-ee13c8e2.pth + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb_20230221-85bcc1c3.pth diff --git a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py index 5797a6f596..15fde3ba79 100644 --- a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py +++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py @@ -2,6 +2,7 @@ file_client_args = dict(io_backend='disk') +sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166} train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16), @@ -15,7 +16,7 @@ max_wh_scale_gap=1, num_fixed_crops=13), dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') ] diff --git a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py index 39113ba5b3..a94f7b3b22 100644 --- a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py +++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py @@ -14,6 +14,7 @@ file_client_args = dict(io_backend='disk') +sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166} train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), @@ -26,7 
+27,7 @@ random_crop=False, max_wh_scale_gap=1), dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') ] From 5f3eb48234faa0d8cc6d51b8bb825beb17da35b0 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 6 Apr 2023 14:47:08 +0800 Subject: [PATCH 30/36] [Feat] support calculate confusion matrix (#2274) --- mmaction/evaluation/metrics/__init__.py | 4 +- mmaction/evaluation/metrics/acc_metric.py | 209 ++++++++++++++++++ mmaction/structures/action_data_sample.py | 102 ++++++++- tests/evaluation/metrics/test_acc_metric.py | 117 +++++++++- tests/models/recognizers/test_recognizer2d.py | 4 + tools/analysis_tools/confusion_matrix.py | 129 +++++++++++ 6 files changed, 551 insertions(+), 14 deletions(-) create mode 100644 tools/analysis_tools/confusion_matrix.py diff --git a/mmaction/evaluation/metrics/__init__.py b/mmaction/evaluation/metrics/__init__.py index 46988d39c1..0493dae036 100644 --- a/mmaction/evaluation/metrics/__init__.py +++ b/mmaction/evaluation/metrics/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .acc_metric import AccMetric +from .acc_metric import AccMetric, ConfusionMatrix from .anet_metric import ANetMetric from .ava_metric import AVAMetric -__all__ = ['AccMetric', 'AVAMetric', 'ANetMetric'] +__all__ = ['AccMetric', 'AVAMetric', 'ANetMetric', 'ConfusionMatrix'] diff --git a/mmaction/evaluation/metrics/acc_metric.py b/mmaction/evaluation/metrics/acc_metric.py index ca6b4623f8..512b089327 100644 --- a/mmaction/evaluation/metrics/acc_metric.py +++ b/mmaction/evaluation/metrics/acc_metric.py @@ -1,9 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy from collections import OrderedDict +from itertools import product from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +import mmengine import numpy as np +import torch from mmengine.evaluator import BaseMetric from mmaction.evaluation import (get_weighted_score, mean_average_precision, @@ -12,6 +15,17 @@ from mmaction.registry import METRICS +def to_tensor(value): + """Convert value to torch.Tensor.""" + if isinstance(value, np.ndarray): + value = torch.from_numpy(value) + elif isinstance(value, Sequence) and not mmengine.is_str(value): + value = torch.tensor(value) + elif not isinstance(value, torch.Tensor): + raise TypeError(f'{type(value)} is not an available argument.') + return value + + @METRICS.register_module() class AccMetric(BaseMetric): """Accuracy evaluation metric.""" @@ -183,3 +197,198 @@ def label2array(num, label): arr = np.zeros(num, dtype=np.float32) arr[label] = 1. return arr + + +@METRICS.register_module() +class ConfusionMatrix(BaseMetric): + r"""A metric to calculate confusion matrix for single-label tasks. + + Args: + num_classes (int, optional): The number of classes. Defaults to None. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. + + Examples: + + 1. The basic usage. 
+ + >>> import torch + >>> from mmaction.evaluation import ConfusionMatrix + >>> y_pred = [0, 1, 1, 3] + >>> y_true = [0, 2, 1, 3] + >>> ConfusionMatrix.calculate(y_pred, y_true, num_classes=4) + tensor([[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 1, 0, 0], + [0, 0, 0, 1]]) + >>> # plot the confusion matrix + >>> import matplotlib.pyplot as plt + >>> y_score = torch.rand((1000, 10)) + >>> y_true = torch.randint(10, (1000, )) + >>> matrix = ConfusionMatrix.calculate(y_score, y_true) + >>> ConfusionMatrix().plot(matrix) + >>> plt.show() + + 2. In the config file + + .. code:: python + + val_evaluator = dict(type='ConfusionMatrix') + test_evaluator = dict(type='ConfusionMatrix') + """ # noqa: E501 + default_prefix = 'confusion_matrix' + + def __init__(self, + num_classes: Optional[int] = None, + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + super().__init__(collect_device, prefix) + + self.num_classes = num_classes + + def process(self, data_batch, data_samples: Sequence[dict]) -> None: + for data_sample in data_samples: + pred_scores = data_sample.get('pred_scores') + gt_label = data_sample['gt_labels']['item'] + if pred_scores is not None: + pred_label = pred_scores['item'].argmax(dim=0, keepdim=True) + self.num_classes = pred_scores['item'].size(0) + else: + pred_label = data_sample['pred_labels']['item'] + + self.results.append({ + 'pred_label': pred_label, + 'gt_label': gt_label + }) + + def compute_metrics(self, results: list) -> dict: + pred_labels = [] + gt_labels = [] + for result in results: + pred_labels.append(result['pred_label']) + gt_labels.append(result['gt_label']) + confusion_matrix = ConfusionMatrix.calculate( + torch.cat(pred_labels), + torch.cat(gt_labels), + num_classes=self.num_classes) + return {'result': confusion_matrix} + + @staticmethod + def calculate(pred, target, num_classes=None) -> dict: + """Calculate the confusion matrix for single-label task. + + Args: + pred (torch.Tensor | np.ndarray | Sequence): The prediction + results. It can be labels (N, ), or scores of every + class (N, C). + target (torch.Tensor | np.ndarray | Sequence): The target of + each prediction with shape (N, ). + num_classes (Optional, int): The number of classes. If the ``pred`` + is label instead of scores, this argument is required. + Defaults to None. + + Returns: + torch.Tensor: The confusion matrix. + """ + pred = to_tensor(pred) + target_label = to_tensor(target).int() + + assert pred.size(0) == target_label.size(0), \ + f"The size of pred ({pred.size(0)}) doesn't match "\ + f'the target ({target_label.size(0)}).' + assert target_label.ndim == 1 + + if pred.ndim == 1: + assert num_classes is not None, \ + 'Please specify the `num_classes` if the `pred` is labels ' \ + 'intead of scores.' + pred_label = pred + else: + num_classes = num_classes or pred.size(1) + pred_label = torch.argmax(pred, dim=1).flatten() + + with torch.no_grad(): + indices = num_classes * target_label + pred_label + matrix = torch.bincount(indices, minlength=num_classes**2) + matrix = matrix.reshape(num_classes, num_classes) + + return matrix + + @staticmethod + def plot(confusion_matrix: torch.Tensor, + include_values: bool = False, + cmap: str = 'viridis', + classes: Optional[List[str]] = None, + colorbar: bool = True, + show: bool = True): + """Draw a confusion matrix by matplotlib. + + Modified from `Scikit-Learn + `_ + + Args: + confusion_matrix (torch.Tensor): The confusion matrix to draw. + include_values (bool): Whether to draw the values in the figure. + Defaults to False. 
+ cmap (str): The color map to use. Defaults to use "viridis". + classes (list[str], optional): The names of categories. + Defaults to None, which means to use index number. + colorbar (bool): Whether to show the colorbar. Defaults to True. + show (bool): Whether to show the figure immediately. + Defaults to True. + """ # noqa: E501 + import matplotlib.pyplot as plt + + fig, ax = plt.subplots(figsize=(10, 10)) + + num_classes = confusion_matrix.size(0) + + im_ = ax.imshow(confusion_matrix, interpolation='nearest', cmap=cmap) + text_ = None + cmap_min, cmap_max = im_.cmap(0), im_.cmap(1.0) + + if include_values: + text_ = np.empty_like(confusion_matrix, dtype=object) + + # print text with appropriate color depending on background + thresh = (confusion_matrix.max() + confusion_matrix.min()) / 2.0 + + for i, j in product(range(num_classes), range(num_classes)): + color = cmap_max if confusion_matrix[i, + j] < thresh else cmap_min + + text_cm = format(confusion_matrix[i, j], '.2g') + text_d = format(confusion_matrix[i, j], 'd') + if len(text_d) < len(text_cm): + text_cm = text_d + + text_[i, j] = ax.text( + j, i, text_cm, ha='center', va='center', color=color) + + display_labels = classes or np.arange(num_classes) + + if colorbar: + fig.colorbar(im_, ax=ax) + ax.set( + xticks=np.arange(num_classes), + yticks=np.arange(num_classes), + xticklabels=display_labels, + yticklabels=display_labels, + ylabel='True label', + xlabel='Predicted label', + ) + ax.invert_yaxis() + ax.xaxis.tick_top() + + ax.set_ylim((num_classes - 0.5, -0.5)) + # Automatically rotate the x labels. + fig.autofmt_xdate(ha='center') + + if show: + plt.show() + return fig diff --git a/mmaction/structures/action_data_sample.py b/mmaction/structures/action_data_sample.py index c75f6654a1..196b080136 100644 --- a/mmaction/structures/action_data_sample.py +++ b/mmaction/structures/action_data_sample.py @@ -1,25 +1,105 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Union +from numbers import Number +from typing import Sequence, Union import numpy as np import torch from mmengine.structures import BaseDataElement, InstanceData, LabelData +from mmengine.utils import is_str + + +def format_label(value: Union[torch.Tensor, np.ndarray, Sequence, + int]) -> torch.Tensor: + """Convert various python types to label-format tensor. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int`. + + Args: + value (torch.Tensor | numpy.ndarray | Sequence | int): Label value. + + Returns: + :obj:`torch.Tensor`: The foramtted label tensor. + """ + + # Handle single number + if isinstance(value, (torch.Tensor, np.ndarray)) and value.ndim == 0: + value = int(value.item()) + + if isinstance(value, np.ndarray): + value = torch.from_numpy(value).to(torch.long) + elif isinstance(value, Sequence) and not is_str(value): + value = torch.tensor(value).to(torch.long) + elif isinstance(value, int): + value = torch.LongTensor([value]) + elif not isinstance(value, torch.Tensor): + raise TypeError(f'Type {type(value)} is not an available label type.') + assert value.ndim == 1, \ + f'The dims of value should be 1, but got {value.ndim}.' + + return value + + +def format_score(value: Union[torch.Tensor, np.ndarray, + Sequence]) -> torch.Tensor: + """Convert various python types to score-format tensor. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`. + + Args: + value (torch.Tensor | numpy.ndarray | Sequence): Score values. 
+ + Returns: + :obj:`torch.Tensor`: The foramtted score tensor. + """ + + if isinstance(value, np.ndarray): + value = torch.from_numpy(value).float() + elif isinstance(value, Sequence) and not is_str(value): + value = torch.tensor(value).float() + elif not isinstance(value, torch.Tensor): + raise TypeError(f'Type {type(value)} is not an available label type.') + assert value.ndim == 1, \ + f'The dims of value should be 1, but got {value.ndim}.' + + return value class ActionDataSample(BaseDataElement): - def set_gt_labels(self, value: Union[int, - np.ndarray]) -> 'ActionDataSample': + def set_gt_labels( + self, value: Union[np.ndarray, torch.Tensor, Sequence[Number], Number] + ) -> 'ActionDataSample': """Set label of ``gt_labels``.""" - if isinstance(value, int): - value = torch.LongTensor([value]) - elif isinstance(value, np.ndarray): - value = torch.from_numpy(value) - else: - raise TypeError(f'Type {type(value)} is not an ' - f'available label type.') + label_data = getattr(self, '_gt_label', LabelData()) + label_data.item = format_label(value) + self.gt_labels = label_data + return self - self.gt_labels = LabelData(item=value) + def set_pred_label( + self, value: Union[np.ndarray, torch.Tensor, Sequence[Number], Number] + ) -> 'ActionDataSample': + """Set label of ``pred_label``.""" + label_data = getattr(self, '_pred_label', LabelData()) + label_data.item = format_label(value) + self.pred_labels = label_data + return self + + def set_pred_score(self, value: torch.Tensor) -> 'ActionDataSample': + """Set score of ``pred_label``.""" + label_data = getattr(self, '_pred_label', LabelData()) + label_data.item = format_score(value) + if hasattr(self, 'num_classes'): + assert len(label_data.item) == self.num_classes, \ + f'The length of score {len(label_data.item)} should be '\ + f'equal to the num_classes {self.num_classes}.' + else: + self.set_field( + name='num_classes', + value=len(label_data.item), + field_type='metainfo') + self.pred_scores = label_data return self @property diff --git a/tests/evaluation/metrics/test_acc_metric.py b/tests/evaluation/metrics/test_acc_metric.py index 273155858c..7c70adb7d6 100644 --- a/tests/evaluation/metrics/test_acc_metric.py +++ b/tests/evaluation/metrics/test_acc_metric.py @@ -1,7 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. 
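+# Note on ConfusionMatrix.calculate(), which the tests below exercise: it
+# flattens each (target, pred) pair into num_classes * target + pred, counts
+# the flat indices with torch.bincount(minlength=num_classes ** 2) and
+# reshapes the result to (num_classes, num_classes), so cell [i, j] counts
+# samples whose ground truth is i and whose prediction is j.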
+from unittest import TestCase + +import numpy as np import torch -from mmaction.evaluation import AccMetric +from mmaction.evaluation import AccMetric, ConfusionMatrix +from mmaction.registry import METRICS +from mmaction.structures import ActionDataSample def generate_data(num_classes=5, random_label=False): @@ -41,3 +46,113 @@ def test_accmetric(): assert eval_results['mean1'] == 1.0 assert eval_results['mmit_mean_average_precision'] == 1.0 return + + +class TestConfusionMatrix(TestCase): + + def test_evaluate(self): + """Test using the metric in the same way as Evalutor.""" + pred = [ + ActionDataSample().set_pred_score(i).set_pred_label( + j).set_gt_labels(k).to_dict() for i, j, k in zip([ + torch.tensor([0.7, 0.0, 0.3]), + torch.tensor([0.5, 0.2, 0.3]), + torch.tensor([0.4, 0.5, 0.1]), + torch.tensor([0.0, 0.0, 1.0]), + torch.tensor([0.0, 0.0, 1.0]), + torch.tensor([0.0, 0.0, 1.0]), + ], [0, 0, 1, 2, 2, 2], [0, 0, 1, 2, 1, 0]) + ] + + # Test with score (use score instead of label if score exists) + metric = METRICS.build(dict(type='ConfusionMatrix')) + metric.process(None, pred) + res = metric.evaluate(6) + self.assertIsInstance(res, dict) + self.assertTensorEqual( + res['confusion_matrix/result'], + torch.tensor([ + [2, 0, 1], + [0, 1, 1], + [0, 0, 1], + ])) + + # Test with label + for sample in pred: + del sample['pred_scores'] + metric = METRICS.build(dict(type='ConfusionMatrix')) + metric.process(None, pred) + with self.assertRaisesRegex(AssertionError, + 'Please specify the `num_classes`'): + metric.evaluate(6) + + metric = METRICS.build(dict(type='ConfusionMatrix', num_classes=3)) + metric.process(None, pred) + self.assertIsInstance(res, dict) + self.assertTensorEqual( + res['confusion_matrix/result'], + torch.tensor([ + [2, 0, 1], + [0, 1, 1], + [0, 0, 1], + ])) + + def test_calculate(self): + y_true = np.array([0, 0, 1, 2, 1, 0]) + y_label = torch.tensor([0, 0, 1, 2, 2, 2]) + y_score = [ + [0.7, 0.0, 0.3], + [0.5, 0.2, 0.3], + [0.4, 0.5, 0.1], + [0.0, 0.0, 1.0], + [0.0, 0.0, 1.0], + [0.0, 0.0, 1.0], + ] + + # Test with score + cm = ConfusionMatrix.calculate(y_score, y_true) + self.assertIsInstance(cm, torch.Tensor) + self.assertTensorEqual( + cm, torch.tensor([ + [2, 0, 1], + [0, 1, 1], + [0, 0, 1], + ])) + + # Test with label + with self.assertRaisesRegex(AssertionError, + 'Please specify the `num_classes`'): + ConfusionMatrix.calculate(y_label, y_true) + + cm = ConfusionMatrix.calculate(y_label, y_true, num_classes=3) + self.assertIsInstance(cm, torch.Tensor) + self.assertTensorEqual( + cm, torch.tensor([ + [2, 0, 1], + [0, 1, 1], + [0, 0, 1], + ])) + + # Test with invalid inputs + with self.assertRaisesRegex(TypeError, " is not"): + ConfusionMatrix.calculate(y_label, 'hi') + + def test_plot(self): + import matplotlib.pyplot as plt + + cm = torch.tensor([[2, 0, 1], [0, 1, 1], [0, 0, 1]]) + fig = ConfusionMatrix.plot(cm, include_values=True, show=False) + + self.assertIsInstance(fig, plt.Figure) + + def assertTensorEqual(self, + tensor: torch.Tensor, + value: float, + msg=None, + **kwarg): + tensor = tensor.to(torch.float32) + value = torch.tensor(value).float() + try: + torch.testing.assert_allclose(tensor, value, **kwarg) + except AssertionError as e: + self.fail(self._formatMessage(msg, str(e))) diff --git a/tests/models/recognizers/test_recognizer2d.py b/tests/models/recognizers/test_recognizer2d.py index 1acde7fc9c..773bc0806f 100644 --- a/tests/models/recognizers/test_recognizer2d.py +++ b/tests/models/recognizers/test_recognizer2d.py @@ -1,4 +1,7 @@ # Copyright (c) 
OpenMMLab. All rights reserved. +import platform + +import pytest import torch from mmaction.registry import MODELS @@ -191,6 +194,7 @@ def test_trn(): recognizer(one_img, gradcam=True) +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') def test_tpn(): register_all_modules() config = get_recognizer_cfg( diff --git a/tools/analysis_tools/confusion_matrix.py b/tools/analysis_tools/confusion_matrix.py new file mode 100644 index 0000000000..224b8364bc --- /dev/null +++ b/tools/analysis_tools/confusion_matrix.py @@ -0,0 +1,129 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import tempfile + +import torch +from mmengine import dump, list_from_file, load +from mmengine.config import Config, DictAction +from mmengine.evaluator import Evaluator +from mmengine.runner import Runner + +from mmaction.evaluation import ConfusionMatrix +from mmaction.registry import DATASETS +from mmaction.utils import register_all_modules + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Eval a checkpoint and draw the confusion matrix.') + parser.add_argument('config', help='test config file path') + parser.add_argument( + 'ckpt_or_result', + type=str, + help='The checkpoint file (.pth) or ' + 'dumpped predictions pickle file (.pkl).') + parser.add_argument('--out', help='the file to save the confusion matrix.') + parser.add_argument( + '--show', + action='store_true', + help='whether to display the metric result by matplotlib if supports.') + parser.add_argument( + '--show-path', type=str, help='Path to save the visualization image.') + parser.add_argument( + '--include-values', + action='store_true', + help='To draw the values in the figure.') + parser.add_argument('--label-file', default=None, help='Labelmap file') + parser.add_argument( + '--target-classes', + type=int, + nargs='+', + default=[], + help='Selected classes to evaluate, and remains will be neglected') + parser.add_argument( + '--cmap', + type=str, + default='viridis', + help='The color map to use. Defaults to "viridis".') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + # register all modules in mmaction into the registries + # do not init the default scope here because it will be init in the runner + register_all_modules(init_default_scope=False) + + # load config + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + if args.ckpt_or_result.endswith('.pth'): + # Set confusion matrix as the metric. 
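+        # A .pth checkpoint: override the configured test evaluator with
+        # ConfusionMatrix and run the full test loop; the other branch below
+        # instead evaluates a dumped prediction pickle offline via Evaluator.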
+ cfg.test_evaluator = dict(type='ConfusionMatrix') + + cfg.load_from = str(args.ckpt_or_result) + + with tempfile.TemporaryDirectory() as tmpdir: + cfg.work_dir = tmpdir + runner = Runner.from_cfg(cfg) + classes = runner.test_loop.dataloader.dataset.metainfo.get( + 'classes') + cm = runner.test()['confusion_matrix/result'] + else: + predictions = load(args.ckpt_or_result) + evaluator = Evaluator(ConfusionMatrix()) + metrics = evaluator.offline_evaluate(predictions, None) + cm = metrics['confusion_matrix/result'] + try: + # Try to build the dataset. + dataset = DATASETS.build({ + **cfg.test_dataloader.dataset, 'pipeline': [] + }) + classes = dataset.metainfo.get('classes') + except Exception: + classes = None + + if args.label_file is not None: + classes = list_from_file(args.label_file) + if classes is None: + num_classes = cm.shape[0] + classes = list(range(num_classes)) + + if args.target_classes: + assert len(args.target_classes) > 1, \ + 'please ensure select more than one class' + target_idx = torch.tensor(args.target_classes) + cm = cm[target_idx][:, target_idx] + classes = [classes[idx] for idx in target_idx] + + if args.out is not None: + dump(cm, args.out) + + if args.show or args.show_path is not None: + fig = ConfusionMatrix.plot( + cm, + show=args.show, + classes=classes, + include_values=args.include_values, + cmap=args.cmap) + if args.show_path is not None: + fig.savefig(args.show_path) + print(f'The confusion matrix is saved at {args.show_path}.') + + +if __name__ == '__main__': + main() From db11ac2c372f92887b42b63af73da195fb01618b Mon Sep 17 00:00:00 2001 From: Kai Hu Date: Thu, 6 Apr 2023 03:39:57 -0400 Subject: [PATCH 31/36] [Doc] for README, optimizers and data pipeline (#2341) --- README.md | 111 +++--- .../en/advanced_guides/customize_optimizer.md | 329 ++++++++++++++++++ docs/en/advanced_guides/customize_pipeline.md | 152 ++++++++ 3 files changed, 549 insertions(+), 43 deletions(-) create mode 100644 docs/en/advanced_guides/customize_optimizer.md create mode 100644 docs/en/advanced_guides/customize_pipeline.md diff --git a/README.md b/README.md index d08d49d2c3..25b703306c 100644 --- a/README.md +++ b/README.md @@ -56,33 +56,51 @@ English | [简体中文](/README_zh-CN.md) -## Introduction +## 📄 Table of Contents + +- [🥳 🚀 What's New](#--whats-new-) +- [📖 Introduction](#-introduction-) +- [🎁 Major Features](#-major-features-) +- [🛠️ Installation](#-installation-) +- [👀 Model Zoo](#-model-zoo-) +- [👨‍🏫 Get Started](#-get-started-) +- [🎫 License](#-license-) +- [🖊️ Citation](#️-citation-) +- [🙌 Contributing](#-contributing-) +- [🤝 Acknowledgement](#-acknowledgement-) +- [🏗️ Projects in OpenMMLab](#-projects-in-openmmlab-) + +## 🥳 🚀 What's New [🔝](#-table-of-contents) + +**The default branch has been switched to `1.x` from `master`, and we encourage users to migrate to the latest version with more supported models, stronger pre-training checkpoints and simpler coding. Please refer to [Migration Guide](https://mmaction2.readthedocs.io/en/1.x/migration.html) for more details.** + +**Release (2023.02.10)**: v1.0.0rc3 with the following new features: + +- Support Action Recognition model UniFormer V1(ICLR'2022), UniFormer V2(Arxiv'2022). +- Support training MViT V2(CVPR'2022), and MaskFeat(CVPR'2022) fine-tuning. +- Add a new handy interface for inference MMAction2 models ([demo](https://github.com/open-mmlab/mmaction2/blob/dev-1.x/demo/README.md#inferencer)) + +## 📖 Introduction [🔝](#-table-of-contents) MMAction2 is an open-source toolbox for video understanding based on PyTorch. 
It is a part of the [OpenMMLab](http://openmmlab.com/) project. -The 1.x branch works with **PyTorch 1.6+**. -
  [figure captions] Action Recognition Results on Kinetics-400; Skeleton-based Action Recognition Results on NTU-RGB+D-120; Action Recognition on Kinetics-400 (left) and Skeleton-based Action Recognition on NTU-RGB+D-120 (right); Skeleton-based Spatio-Temporal Action Detection and Action Recognition Results on Kinetics-400; Spatio-Temporal Action Detection Results on AVA-2.1

-## Major Features +## 🎁 Major Features [🔝](#-table-of-contents) - **Modular design**: We decompose a video understanding framework into different components. One can easily construct a customized video understanding framework by combining different modules. @@ -90,17 +108,14 @@ The 1.x branch works with **PyTorch 1.6+**. - **Well tested and documented**: We provide detailed documentation and API reference, as well as unit tests. -## What's New +## 🛠️ Installation [🔝](#-table-of-contents) -**Release (2023.02.10)**: v1.0.0rc3 with the following new features: +MMAction2 depends on [PyTorch](https://pytorch.org/), [MMCV](https://github.com/open-mmlab/mmcv), [MMEngine](https://github.com/open-mmlab/mmengine), [MMDetection](https://github.com/open-mmlab/mmdetection) (optional, for spatial-temporal detection tasks) and [MMPose](https://github.com/open-mmlab/mmpose) (optional, for skeleton based tasks). -- Support Action Recognition model UniFormer V1(ICLR'2022), UniFormer V2(Arxiv'2022). -- Support training MViT V2(CVPR'2022), and MaskFeat(CVPR'2022) fine-tuning. -- Add a new handy interface for inference MMAction2 models ([demo](https://github.com/open-mmlab/mmaction2/blob/dev-1.x/demo/README.md#inferencer)) +Please refer to [install.md](https://mmaction2.readthedocs.io/en/1.x/get_started.html) for detailed instructions. -## Installation - -Please refer to [install.md](https://mmaction2.readthedocs.io/en/1.x/get_started.html) for more detailed instructions. +
+Quick instructions ```shell conda create --name openmmlab python=3.8 -y @@ -116,7 +131,15 @@ git checkout 1.x pip3 install -e . ``` -## Supported Methods +
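Not part of the patch: a minimal sanity check you can run after the editable install above, assuming the environment created by those commands (printed versions are only illustrative).

```python
# Quick import check for the freshly installed environment.
import mmaction
import mmcv
import mmengine

print('mmaction2:', mmaction.__version__)  # e.g. a 1.0.0rcX release on the 1.x branch
print('mmcv:', mmcv.__version__)
print('mmengine:', mmengine.__version__)
```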
+ +## 👀 Model Zoo [🔝](#-table-of-contents) + +Results and models are available in the [model zoo](https://mmaction2.readthedocs.io/en/1.x/modelzoo.html). + +
+ +Supported model @@ -161,7 +184,6 @@ pip3 install -e . - @@ -185,17 +207,19 @@ pip3 install -e . + + + +
  [table cells] Action Localization: SSN (ICCV'2017), BSN (ECCV'2018), BMN (ICCV'2019); 2s-AGCN (CVPR'2019), PoseC3D (CVPR'2022), STGCN++ (ArXiv'2022), CTRGCN (CVPR'2021), MSG3D (CVPR'2020)
-Results and models are available in the *README.md* of each method's config directory. -A summary can be found on the [**model zoo**](https://mmaction2.readthedocs.io/en/1.x/modelzoo.html) page. +
-We will keep up with the latest progress of the community and support more popular algorithms and frameworks. -If you have any feature requests, please feel free to leave a comment in [Issues](https://github.com/open-mmlab/mmaction2/issues/19). +
-## Supported Datasets +Supported dataset @@ -254,31 +278,32 @@ If you have any feature requests, please feel free to leave a comment in [Issues
-Datasets marked with * are not fully supported yet, but related dataset preparation steps are provided. A summary can be found on the [**Supported Datasets**](https://mmaction2.readthedocs.io/en/latest/supported_datasets.html) page. - -## Data Preparation - -Please refer to [data_preparation.md](docs/en/user_guides/2_data_prepare.md) for a general knowledge of data preparation. +
-## FAQ +## 👨‍🏫 Get Started [🔝](#-table-of-contents) -Please refer to [FAQ](docs/en/notes/faq.md) for frequently asked questions. +For tutorials, we provide the following user guides for basic usage: -## Projects built on MMAction2 +- [Migration from MMAction2 0.X](https://mmaction2.readthedocs.io/en/1.x/migration.html) +- [Learn about Configs](https://mmaction2.readthedocs.io/en/1.x/user_guides/1_config.html#) +- [Prepare Datasets](https://mmaction2.readthedocs.io/en/1.x/user_guides/2_data_prepare.html) +- [Inference with Existing Models](https://mmaction2.readthedocs.io/en/1.x/user_guides/3_inference.html) +- [Training and Testing](https://mmaction2.readthedocs.io/en/1.x/user_guides/4_train_test.html) -Currently, there are many research works and projects built on MMAction2 by users from community, such as: +
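Not part of the patch: a minimal sketch of the high-level inference API covered by the user guides listed above; the config, checkpoint and video paths are placeholders.

```python
# Sketch only: CONFIG_FILE, CHECKPOINT_FILE and VIDEO_FILE are placeholders.
from mmaction.apis import inference_recognizer, init_recognizer

model = init_recognizer('CONFIG_FILE.py', 'CHECKPOINT_FILE.pth', device='cpu')
result = inference_recognizer(model, 'VIDEO_FILE.mp4')
print(result)  # an ActionDataSample holding the predicted scores
```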
+Research works built on MMAction2 by users from community - Video Swin Transformer. [\[paper\]](https://arxiv.org/abs/2106.13230)[\[github\]](https://github.com/SwinTransformer/Video-Swin-Transformer) - Evidential Deep Learning for Open Set Action Recognition, ICCV 2021 **Oral**. [\[paper\]](https://arxiv.org/abs/2107.10161)[\[github\]](https://github.com/Cogito2012/DEAR) - Rethinking Self-supervised Correspondence Learning: A Video Frame-level Similarity Perspective, ICCV 2021 **Oral**. [\[paper\]](https://arxiv.org/abs/2103.17263)[\[github\]](https://github.com/xvjiarui/VFS) -etc., check [projects.md](docs/en/notes/projects.md) to see all related projects. +
-## License +## 🎫 License [🔝](#-table-of-contents) This project is released under the [Apache 2.0 license](LICENSE). -## Citation +## 🖊️ Citation [🔝](#-table-of-contents) If you find this project useful in your research, please consider cite: @@ -291,17 +316,17 @@ If you find this project useful in your research, please consider cite: } ``` -## Contributing +## 🙌 Contributing [🔝](#-table-of-contents) We appreciate all contributions to improve MMAction2. Please refer to [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/2.x/CONTRIBUTING.md) in MMCV for more details about the contributing guideline. -## Acknowledgement +## 🤝 Acknowledgement [🔝](#-table-of-contents) MMAction2 is an open-source project that is contributed by researchers and engineers from various colleges and companies. We appreciate all the contributors who implement their methods or add new features and users who give valuable feedback. We wish that the toolbox and benchmark could serve the growing research community by providing a flexible toolkit to reimplement existing methods and develop their new models. -## Projects in OpenMMLab +## 🏗️ Projects in OpenMMLab [🔝](#-table-of-contents) - [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models. - [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision. diff --git a/docs/en/advanced_guides/customize_optimizer.md b/docs/en/advanced_guides/customize_optimizer.md new file mode 100644 index 0000000000..d69aa0ff90 --- /dev/null +++ b/docs/en/advanced_guides/customize_optimizer.md @@ -0,0 +1,329 @@ +# Customize Optimizer + +In this tutorial, we will introduce some methods about how to build the optimizer and learning rate scheduler for your tasks. + +- [Customize Optimizer](#customize-optimizer) + - [Build optimizers using optim_wrapper](#build-optimizers-using-optim_wrapper) + - [Customize parameter schedules](#customize-parameter-schedules) + - [Add new optimizers or constructors](#add-new-optimizers-or-constructors) + +## Build optimizers using optim_wrapper + +We use the `optim_wrapper` field to configure the strategies of optimization, which includes choices of the optimizer, parameter-wise configurations, gradient clipping and accumulation. A simple example can be: + +```python +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.0003, weight_decay=0.0001) +) +``` + +In the above example, a SGD optimizer with learning rate 0.0003 and weight decay 0.0001 is built. + +### Use optimizers supported by PyTorch + +We support all the optimizers implemented by PyTorch. To use a different optimizer, just need to change the `optimizer` field of config files. For example, if you want to use `torch.optim.Adam`, the modification in the config file could be as the following. + +```python +optim_wrapper = dict( + type='OptimWrapper', + optimizer = dict( + type='Adam', + lr=0.001, + betas=(0.9, 0.999), + eps=1e-08, + weight_decay=0, + amsgrad=False), +) +``` + +First we need to change the value of `type` to the desired optimizer name supported in `torch.optim`. Next we add necessary arguments of this optimizer to the `optimizer` field. 
The above config will build the following optimizer: + +```python +torch.optim.Adam(lr=0.001, + betas=(0.9, 0.999), + eps=1e-08, + weight_decay=0, + amsgrad=False) +``` + +### Parameter-wise finely configuration + +Some models may have parameter-specific settings for optimization, for example, no weight decay to the BatchNorm layers or using different learning rates for different network layers. +To finely configure them, we can use the `paramwise_cfg` argument in `optim_wrapper`. + +- **Set different hyper-parameter multipliers for different types of parameters.** + + For instance, we can set `norm_decay_mult=0.` in `paramwise_cfg` to change the weight decay of weight and bias of normalization layers to zero. + + ```python + optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.8, weight_decay=1e-4), + paramwise_cfg=dict(norm_decay_mult=0.)) + ``` + + More types of parameters are supported to configured, list as follow: + + - `lr_mult`: Multiplier for learning rate of all parameters. + - `decay_mult`: Multiplier for weight decay of all parameters. + - `bias_lr_mult`: Multiplier for learning rate of bias (Not include normalization layers' biases and deformable convolution layers' offsets). Defaults to 1. + - `bias_decay_mult`: Multiplier for weight decay of bias (Not include normalization layers' biases and deformable convolution layers' offsets). Defaults to 1. + - `norm_decay_mult`: Multiplier for weight decay of weigh and bias of normalization layers. Defaults to 1. + - `dwconv_decay_mult`: Multiplier for weight decay of depth-wise convolution layers. Defaults to 1. + - `bypass_duplicate`: Whether to bypass duplicated parameters. Defaults to `False`. + - `dcn_offset_lr_mult`: Multiplier for learning rate of deformable convolution layers. Defaults to 1. + +- **Set different hyper-parameter multipliers for specific parameters.** + + MMAction2 can use `custom_keys` in `paramwise_cfg` to specify different parameters to use different learning rates or weight decay. + + For example, to set all learning rates and weight decays of `backbone.layer0` to 0, the rest of `backbone` remains the same as the optimizer and the learning rate of `head` to 0.001, use the configs below. + + ```python + optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001), + paramwise_cfg=dict( + custom_keys={ + 'backbone.layer0': dict(lr_mult=0, decay_mult=0), + 'backbone': dict(lr_mult=1), + 'head': dict(lr_mult=0.1) + })) + ``` + +### Gradient clipping + +During the training process, the loss function may get close to a cliffy region and cause gradient explosion. And gradient clipping is helpful to stabilize the training process. More introduction can be found in [this page](https://paperswithcode.com/method/gradient-clipping). + +Currently we support `clip_grad` option in `optim_wrapper` for gradient clipping, refers to [PyTorch Documentation](torch.nn.utils.clip_grad_norm_). + +Here is an example: + +```python +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001), + # norm_type: type of the used p-norm, here norm_type is 2. + clip_grad=dict(max_norm=35, norm_type=2)) +``` + +### Gradient accumulation + +When computing resources are lacking, the batch size can only be set to a small value, which may affect the performance of models. Gradient accumulation can be used to solve this problem. We support `accumulative_counts` option in `optim_wrapper` for gradient accumulation. 
+ +Here is an example: + +```python +train_dataloader = dict(batch_size=64) +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001), + accumulative_counts=4) +``` + +Indicates that during training, back-propagation is performed every 4 iters. And the above is equivalent to: + +```python +train_dataloader = dict(batch_size=256) +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001)) +``` + +## Customize parameter schedules + +In training, the optimzation parameters such as learing rate, momentum, are usually not fixed but changing through iterations or epochs. PyTorch supports several learning rate schedulers, which are not sufficient for complex strategies. In MMAction2, we provide `param_scheduler` for better controls of different parameter schedules. + +### Customize learning rate schedules + +Learning rate schedulers are widely used to improve performance. We support most of the PyTorch schedulers, including `ExponentialLR`, `LinearLR`, `StepLR`, `MultiStepLR`, etc. + +All available learning rate scheduler can be found {external+mmengine:ref}`here `, and the +names of learning rate schedulers end with `LR`. + +- **Single learning rate schedule** + + In most cases, we use only one learning rate schedule for simplicity. For instance, [`MultiStepLR`](mmengine.optim.MultiStepLR) is used as the default learning rate schedule for ResNet. Here, `param_scheduler` is a dictionary. + + ```python + param_scheduler = dict( + type='MultiStepLR', + by_epoch=True, + milestones=[100, 150], + gamma=0.1) + ``` + + Or, we want to use the [`CosineAnnealingLR`](mmengine.optim.CosineAnnealingLR) scheduler to decay the learning rate: + + ```python + param_scheduler = dict( + type='CosineAnnealingLR', + by_epoch=True, + T_max=num_epochs) + ``` + +- **Multiple learning rate schedules** + + In some of the training cases, multiple learning rate schedules are applied for higher accuracy. For example ,in the early stage, training is easy to be volatile, and warmup is a technique to reduce volatility. + The learning rate will increase gradually from a minor value to the expected value by warmup and decay afterwards by other schedules. + + In MMAction2, simply combines desired schedules in `param_scheduler` as a list can achieve the warmup strategy. + + Here are some examples: + + 1. linear warmup during the first 50 iters. + + ```python + param_scheduler = [ + # linear warm-up by iters + dict(type='LinearLR', + start_factor=0.001, + by_epoch=False, # by iters + end=50), # only warm up for first 50 iters + # main learing rate schedule + dict(type='MultiStepLR', + by_epoch=True, + milestones=[8, 11], + gamma=0.1) + ] + ``` + + 2. linear warmup and update lr by iter during the first 10 epochs. + + ```python + param_scheduler = [ + # linear warm-up by epochs in [0, 10) epochs + dict(type='LinearLR', + start_factor=0.001, + by_epoch=True, + end=10, + convert_to_iter_based=True, # Update learning rate by iter. + ), + # use CosineAnnealing schedule after 10 epochs + dict(type='CosineAnnealingLR', by_epoch=True, begin=10) + ] + ``` + + Notice that, we use `begin` and `end` arguments here to assign the valid range, which is \[`begin`, `end`) for this schedule. And the range unit is defined by `by_epoch` argument. If not specified, the `begin` is 0 and the `end` is the max epochs or iterations. 
+ + If the ranges for all schedules are not continuous, the learning rate will stay constant in ignored range, otherwise all valid schedulers will be executed in order in a specific stage, which behaves the same as PyTorch [`ChainedScheduler`](torch.optim.lr_scheduler.ChainedScheduler). + +### Customize momentum schedules + +We support using momentum schedulers to modify the optimizer's momentum according to learning rate, which could make the loss converge in a faster way. The usage is the same as learning rate schedulers. + +All available learning rate scheduler can be found {external+mmengine:ref}`here `, and the +names of momentum rate schedulers end with `Momentum`. + +Here is an example: + +```python +param_scheduler = [ + # the lr scheduler + dict(type='LinearLR', ...), + # the momentum scheduler + dict(type='LinearMomentum', + start_factor=0.001, + by_epoch=False, + begin=0, + end=1000) +] +``` + +## Add new optimizers or constructors + +This part will modify the MMAction2 source code or add code to the MMAction2 framework, beginners can skip it. + +### Add new optimizers + +In academic research and industrial practice, it may be necessary to use optimization methods not implemented by MMAction2, and you can add them through the following methods. + +#### 1. Implement a new optimizer + +Assume you want to add an optimizer named `MyOptimizer`, which has arguments `a`, `b`, and `c`. +You need to create a new file under `mmaction/engine/optimizers`, and implement the new optimizer in the file, for example, in `mmaction/engine/optimizers/my_optimizer.py`: + +```python +from torch.optim import Optimizer +from mmaction.registry import OPTIMIZERS + + +@OPTIMIZERS.register_module() +class MyOptimizer(Optimizer): + + def __init__(self, a, b, c): + ... + + def step(self, closure=None): + ... +``` + +#### 2. Import the optimizer + +To find the above module defined above, this module should be imported during the running. First import it in the `mmaction/engine/optimizers/__init__.py` to add it into the `mmaction.engine` package. + +```python +# In mmaction/engine/optimizers/__init__.py +... +from .my_optimizer import MyOptimizer # MyOptimizer maybe other class name + +__all__ = [..., 'MyOptimizer'] +``` + +During running, we will automatically import the `mmaction.engine` package and register the `MyOptimizer` at the same time. + +#### 3. Specify the optimizer in the config file + +Then you can use `MyOptimizer` in the `optim_wrapper.optimizer` field of config files. + +```python +optim_wrapper = dict( + optimizer=dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value)) +``` + +### Add new optimizer constructors + +Some models may have some parameter-specific settings for optimization, like different weight decay rate for all `BatchNorm` layers. + +Although we already can use [the `optim_wrapper.paramwise_cfg` field](#parameter-wise-finely-configuration) to +configure various parameter-specific optimizer settings. It may still not cover your need. + +Of course, you can modify it. By default, we use the [`DefaultOptimWrapperConstructor`](mmengine.optim.DefaultOptimWrapperConstructor) +class to deal with the construction of optimizer. And during the construction, it fine-grainedly configures the optimizer settings of +different parameters according to the `paramwise_cfg`,which could also serve as a template for new optimizer constructor. + +You can overwrite these behaviors by add new optimizer constructors. 
+ +```python +# In mmaction/engine/optimizers/my_optim_constructor.py +from mmengine.optim import DefaultOptimWrapperConstructor +from mmaction.registry import OPTIM_WRAPPER_CONSTRUCTORS + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class MyOptimWrapperConstructor: + + def __init__(self, optim_wrapper_cfg, paramwise_cfg=None): + ... + + def __call__(self, model): + ... +``` + +And then, import it and use it almost like [the optimizer tutorial](#add-new-optimizers). + +1. Import it in the `mmaction/engine/optimizers/__init__.py` to add it into the `mmaction.engine` package. + + ```python + # In mmaction/engine/optimizers/__init__.py + ... + from .my_optim_constructor import MyOptimWrapperConstructor + + __all__ = [..., 'MyOptimWrapperConstructor'] + ``` + +2. Use `MyOptimWrapperConstructor` in the `optim_wrapper.constructor` field of config files. + + ```python + optim_wrapper = dict( + constructor=dict(type='MyOptimWrapperConstructor'), + optimizer=..., + paramwise_cfg=..., + ) + ``` diff --git a/docs/en/advanced_guides/customize_pipeline.md b/docs/en/advanced_guides/customize_pipeline.md new file mode 100644 index 0000000000..719f806d3f --- /dev/null +++ b/docs/en/advanced_guides/customize_pipeline.md @@ -0,0 +1,152 @@ +# Customize Data Pipeline + +In this tutorial, we will introduce some methods about how to build the data pipeline (i.e., data transformations)for your tasks. + +- [Customize Data Pipeline](#customize-data-pipeline) + - [Design of Dataset and Data pipelines](#design-of-dataset-and-data-pipelines) + - [Modify the training/test pipeline](#modify-the-training/test-pipeline) + - [Add new data transforms](#add-new-data-transforms) + +## Design of Data pipelines + +The data pipeline means how to process the sample dict when indexing a sample from the dataset. And it +consists of a sequence of data transforms. Each data transform takes a dict as input, processes it, and outputs a dict for the next data transform. + +Here is a data pipeline example for SlowFast training on Kinetics for `VideoDataset`. It first use [`decord`](https://github.com/dmlc/decord) to read the raw videos and randomly sample one video clip (the clip has 32 frames, and the interval between frames is 2). Next it applies the random resized crop and random horizontal flip to all frames. Finally the data shape is formatted as `NCTHW`. + +```python +train_pipeline = [ + dict(type='DecordInit',), + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +``` + +All available data transforms in MMAction2 can be found in the [data transforms docs](mmaction.datasets.transforms). + +## Modify the training/test pipeline + +The data pipeline in MMAction2 is pretty flexible. You can control almost every step of the data +preprocessing from the config file, but on the other hand, you may be confused facing so many options. + +Here is a common practice and guidance for action recognition tasks. + +### Loading + +At the beginning of a data pipeline, we usually need to load videos. 
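For video files, the loading step is the `DecordInit`/`DecordDecode` pair already used in the SlowFast example above; a minimal head of such a pipeline is sketched below (illustrative only, not part of the original file).

```python
train_pipeline = [
    dict(type='DecordInit'),    # open the video file with decord
    dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
    dict(type='DecordDecode'),  # decode only the sampled frame indices
    # ... cropping, resizing, formatting and packing follow as usual
]
```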
But if you already extract the frames, you should use `RawFrameDecode` and change the dataset type to `RawframeDataset`: + +```python +train_pipeline = [ + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +``` + +If you want to load data from files with special formats or special locations, you can [implement a new loading +transform](#add-new-data-transforms) and add it at the beginning of the data pipeline. + +### Sampling frames and other processing + +During training and testing, we may have different strategies to sample frames from the video. + +For example, during testing of SlowFast, we sample multiple clips uniformly: + +```python +test_pipeline = [ + ... + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + ... +] +``` + +In the above example, 10 clips of 32-frame video clips will be sampled for each video. We use `test_mode=True` to uniformly sample these clips (as opposed to randomly sample during training). + +Another example is that TSN/TSM models sample multiple segments from the video: + +```python +train_pipeline = [ + ... + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + ... +] +``` + +```{note} +Usually, the data augmentation part in the data pipeline handles only video-wise transforms, but not transforms +like video normalization or mixup/cutmix. It's because we can do image normalization and mixup/cutmix on batch data +to accelerate with GPUs. To configure video normalization and mixup/cutmix, please use the [data preprocessor] +(mmaction.models.utils.data_preprocessor). +``` + +### Formatting + +The formatting is to collect training data from the data information dict and convert these data to +model-friendly format. + +In most cases, you can simply use [`PackActionInputs`](mmaction.datasets.transforms.PackActionInputs), and it will +convert the image in NumPy array format to PyTorch tensor, and pack the ground truth categories information and +other meta information as a dict-like object [`ActionDataSample`](mmaction.structures.ActionDataSample). + +```python +train_pipeline = [ + ... + dict(type='PackActionInputs'), +] +``` + +## Add new data transforms + +1. Write a new data transform in any file, e.g., `my_transform.py`, and place it in + the folder `mmaction/datasets/transforms/`. The data transform class needs to inherit + the [`mmcv.transforms.BaseTransform`](mmcv.transforms.BaseTransform) class and override + the `transform` method which takes a dict as input and returns a dict. + + ```python + from mmcv.transforms import BaseTransform + from mmaction.datasets import TRANSFORMS + + @TRANSFORMS.register_module() + class MyTransform(BaseTransform): + + def transform(self, results): + # Modify the data information dict `results`. + return results + ``` + +2. Import the new class in the `mmaction/datasets/transforms/__init__.py`. + + ```python + ... + from .my_transform import MyTransform + + __all__ = [ + ..., 'MyTransform' + ] + ``` + +3. Use it in config files. + + ```python + train_pipeline = [ + ... + dict(type='MyTransform'), + ... 
+ ] + ``` From 6cc912ba414c1f3b6cdc8cb3ea310ac3bf44d004 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 6 Apr 2023 16:16:10 +0800 Subject: [PATCH 32/36] [doc] add doc for 1.x branch (#2312) --- .gitignore | 2 +- configs/detection/acrn/README.md | 4 +- configs/detection/lfb/README.md | 4 +- configs/detection/slowfast/README.md | 4 +- configs/detection/slowonly/README.md | 4 +- configs/localization/bmn/README.md | 4 +- configs/recognition/c2d/README.md | 4 +- configs/recognition/c3d/README.md | 4 +- configs/recognition/csn/README.md | 4 +- configs/recognition/i3d/README.md | 4 +- configs/recognition/mvit/README.md | 2 +- configs/recognition/omnisource/README.md | 4 +- configs/recognition/r2plus1d/README.md | 4 +- configs/recognition/slowfast/README.md | 4 +- configs/recognition/slowonly/README.md | 4 +- configs/recognition/swin/README.md | 4 +- configs/recognition/tanet/README.md | 4 +- configs/recognition/timesformer/README.md | 4 +- configs/recognition/tin/README.md | 4 +- configs/recognition/tpn/README.md | 4 +- configs/recognition/trn/README.md | 4 +- configs/recognition/tsm/README.md | 4 +- configs/recognition/tsn/README.md | 4 +- ...ed-r50_8xb32-1x1x3-100e_kinetics400-rgb.py | 15 +- configs/recognition/uniformer/README.md | 2 +- configs/recognition/uniformerv2/README.md | 2 +- configs/recognition/videomae/README.md | 2 +- configs/recognition/x3d/README.md | 2 +- configs/recognition_audio/resnet/README.md | 4 +- configs/skeleton/2s-agcn/README.md | 6 +- configs/skeleton/posec3d/README.md | 4 +- .../posec3d/custom_dataset_training.md | 2 +- configs/skeleton/stgcn/README.md | 6 +- configs/skeleton/stgcnpp/README.md | 6 +- docs/en/advanced_guides/customize_dataset.md | 122 ++++++++ docs/en/advanced_guides/customize_logging.md | 163 +++++++++++ docs/en/advanced_guides/customize_models.md | 1 + .../en/advanced_guides/customize_optimizer.md | 11 + docs/en/advanced_guides/customize_pipeline.md | 7 +- docs/en/advanced_guides/dataflow.md | 1 + docs/en/advanced_guides/depoly.md | 0 .../contribution_guide.md | 3 +- docs/en/{notes => get_started}/faq.md | 2 +- .../{ => get_started}/guide_to_framework.md | 0 .../installation.md} | 16 +- docs/en/get_started/overview.md | 97 +++++++ docs/en/get_started/quick_run.md | 221 +++++++++++++++ docs/en/index.rst | 60 ++-- docs/en/merge_docs.sh | 49 +++- docs/en/notes/{projects.md => ecosystem.md} | 2 +- docs/en/notes/pytorch2.0.md | 21 ++ docs/en/stat.py | 144 +++++----- docs/en/supported_datasets.md | 36 +++ docs/en/{user_guides => }/useful_tools.md | 4 +- docs/en/user_guides/2_data_prepare.md | 152 ---------- .../{3_inference.md => Inference.md} | 4 +- .../en/user_guides/{1_config.md => config.md} | 15 +- docs/en/user_guides/prepare_dataset.md | 263 ++++++++++++++++++ .../{4_train_test.md => train_test.md} | 2 +- docs/en/user_guides/visualization.md | 20 -- docs/zh_cn/index.rst | 2 +- docs/zh_cn/user_guides/3_inference.md | 2 +- src/pytorch-sphinx-theme | 1 + tools/visualizations/browse_dataset.py | 8 +- tools/visualizations/vis_scheduler.py | 115 ++++---- 65 files changed, 1264 insertions(+), 419 deletions(-) create mode 100644 docs/en/advanced_guides/customize_dataset.md create mode 100644 docs/en/advanced_guides/customize_logging.md create mode 100644 docs/en/advanced_guides/customize_models.md create mode 100644 docs/en/advanced_guides/dataflow.md create mode 100644 docs/en/advanced_guides/depoly.md rename docs/en/{notes => get_started}/contribution_guide.md (93%) rename docs/en/{notes => get_started}/faq.md (99%) 
rename docs/en/{ => get_started}/guide_to_framework.md (100%) rename docs/en/{get_started.md => get_started/installation.md} (95%) create mode 100644 docs/en/get_started/overview.md create mode 100644 docs/en/get_started/quick_run.md rename docs/en/notes/{projects.md => ecosystem.md} (98%) create mode 100644 docs/en/notes/pytorch2.0.md create mode 100644 docs/en/supported_datasets.md rename docs/en/{user_guides => }/useful_tools.md (98%) delete mode 100644 docs/en/user_guides/2_data_prepare.md rename docs/en/user_guides/{3_inference.md => Inference.md} (95%) rename docs/en/user_guides/{1_config.md => config.md} (98%) create mode 100644 docs/en/user_guides/prepare_dataset.md rename docs/en/user_guides/{4_train_test.md => train_test.md} (99%) delete mode 100644 docs/en/user_guides/visualization.md create mode 160000 src/pytorch-sphinx-theme diff --git a/.gitignore b/.gitignore index 3e40ace4d5..1d637fa156 100644 --- a/.gitignore +++ b/.gitignore @@ -65,7 +65,7 @@ instance/ .scrapy # Sphinx documentation -docs/_build/ +docs/*/_build/ # PyBuilder target/ diff --git a/configs/detection/acrn/README.md b/configs/detection/acrn/README.md index d08efb6d2d..054853c35a 100644 --- a/configs/detection/acrn/README.md +++ b/configs/detection/acrn/README.md @@ -49,7 +49,7 @@ python tools/train.py configs/detection/acrn/slowfast-acrn_kinetics400-pretraine --seed 0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -66,7 +66,7 @@ python tools/test.py configs/detection/acrn/slowfast-acrn_kinetics400-pretrained checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/detection/lfb/README.md b/configs/detection/lfb/README.md index dabb3a1b46..51af1377c8 100644 --- a/configs/detection/lfb/README.md +++ b/configs/detection/lfb/README.md @@ -76,7 +76,7 @@ python tools/train.py configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrain --validate --seed 0 --deterministic ``` -For more details and optional arguments infos, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details and optional arguments infos, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -101,7 +101,7 @@ python tools/test.py configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretraine checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
## Citation diff --git a/configs/detection/slowfast/README.md b/configs/detection/slowfast/README.md index bae71fd040..f82273adcc 100644 --- a/configs/detection/slowfast/README.md +++ b/configs/detection/slowfast/README.md @@ -54,7 +54,7 @@ python tools/train.py configs/detection/slowfast/slowfast_kinetics400-pretrained --seed 0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -71,7 +71,7 @@ python tools/test.py configs/detection/slowfast/slowfast_kinetics400-pretrained- checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/detection/slowonly/README.md b/configs/detection/slowonly/README.md index e8af3d84ea..ff0f7bf641 100644 --- a/configs/detection/slowonly/README.md +++ b/configs/detection/slowonly/README.md @@ -75,7 +75,7 @@ python tools/train.py configs/detection/slowonly/slowonly_kinetics400-pretrained --seed 0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -92,7 +92,7 @@ python tools/test.py configs/detection/slowonly/slowonly_kinetics400-pretrained- checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/localization/bmn/README.md b/configs/localization/bmn/README.md index 2f49330743..ec2f625a95 100644 --- a/configs/localization/bmn/README.md +++ b/configs/localization/bmn/README.md @@ -42,7 +42,7 @@ Train BMN model on ActivityNet features dataset. bash tools/dist_train.sh configs/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.py 2 ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -52,7 +52,7 @@ Test BMN on ActivityNet feature dataset. python3 tools/test.py configs/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.py CHECKPOINT.PTH ``` -For more details, you can refer to the **Testing** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Testing** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
## Citation diff --git a/configs/recognition/c2d/README.md b/configs/recognition/c2d/README.md index 651193dad2..a1b58493f7 100644 --- a/configs/recognition/c2d/README.md +++ b/configs/recognition/c2d/README.md @@ -49,7 +49,7 @@ python tools/train.py configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_ --seed 0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -66,7 +66,7 @@ python tools/test.py configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_k checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/c3d/README.md b/configs/recognition/c3d/README.md index 958119f048..9e2af4229e 100644 --- a/configs/recognition/c3d/README.md +++ b/configs/recognition/c3d/README.md @@ -44,7 +44,7 @@ python tools/train.py configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1 --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -61,7 +61,7 @@ python tools/test.py configs/recognition/c3d_sports1m-pretrained_8xb30-16x1x1-45 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/csn/README.md b/configs/recognition/csn/README.md index 77c3aaf900..b09e365829 100644 --- a/configs/recognition/csn/README.md +++ b/configs/recognition/csn/README.md @@ -52,7 +52,7 @@ python tools/train.py configs/recognition/csn/ircsn_ig65m-pretrained-r152_8xb12- --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -69,7 +69,7 @@ python tools/test.py configs/recognition/csn/ircsn_ig65m-pretrained-r152_8xb12-3 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/i3d/README.md b/configs/recognition/i3d/README.md index e181eaf195..a6e0aebccd 100644 --- a/configs/recognition/i3d/README.md +++ b/configs/recognition/i3d/README.md @@ -51,7 +51,7 @@ python tools/train.py configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-3 --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). 
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -68,7 +68,7 @@ python tools/test.py configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/mvit/README.md b/configs/recognition/mvit/README.md index 15f8723615..33527c8408 100644 --- a/configs/recognition/mvit/README.md +++ b/configs/recognition/mvit/README.md @@ -92,7 +92,7 @@ python tools/test.py configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/omnisource/README.md b/configs/recognition/omnisource/README.md index 64acf52c35..f3397d3bb1 100644 --- a/configs/recognition/omnisource/README.md +++ b/configs/recognition/omnisource/README.md @@ -47,7 +47,7 @@ python tools/train.py configs/recognition/omnisource/slowonly_r50_8xb16-8x8x1-25 We found that the training of this Omnisource model could crash for unknown reasons. If this happens, you can resume training by adding the `--cfg-options resume=True` to the training script. -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -64,7 +64,7 @@ python tools/test.py configs/recognition/omnisource/slowonly_r50_8xb16-8x8x1-256 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/r2plus1d/README.md b/configs/recognition/r2plus1d/README.md index 29a619e696..d9e216f41a 100644 --- a/configs/recognition/r2plus1d/README.md +++ b/configs/recognition/r2plus1d/README.md @@ -45,7 +45,7 @@ python tools/train.py configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_ --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -62,7 +62,7 @@ python tools/test.py configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_k checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
## Citation diff --git a/configs/recognition/slowfast/README.md b/configs/recognition/slowfast/README.md index 3bf1666152..0cd2ccd8d3 100644 --- a/configs/recognition/slowfast/README.md +++ b/configs/recognition/slowfast/README.md @@ -48,7 +48,7 @@ python tools/train.py configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -65,7 +65,7 @@ python tools/test.py configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_ checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/slowonly/README.md b/configs/recognition/slowonly/README.md index bf5ce3781d..78a3e043e3 100644 --- a/configs/recognition/slowonly/README.md +++ b/configs/recognition/slowonly/README.md @@ -57,7 +57,7 @@ python tools/train.py configs/recognition/slowonly/slowonly_r50_8xb16-4x16x1-256 --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -74,7 +74,7 @@ python tools/test.py configs/recognition/slowonly/slowonly_r50_8xb16-4x16x1-256e checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/swin/README.md b/configs/recognition/swin/README.md index 1e6074c4a9..1156c4a679 100644 --- a/configs/recognition/swin/README.md +++ b/configs/recognition/swin/README.md @@ -55,7 +55,7 @@ python tools/train.py configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8 --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -72,7 +72,7 @@ python tools/test.py configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8- checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
## Citation diff --git a/configs/recognition/tanet/README.md b/configs/recognition/tanet/README.md index 1a67a40aa0..a72a7bde4f 100644 --- a/configs/recognition/tanet/README.md +++ b/configs/recognition/tanet/README.md @@ -55,7 +55,7 @@ python tools/train.py configs/recognition/tanet/tanet_imagenet-pretrained-r50_8x --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -72,7 +72,7 @@ python tools/test.py configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/timesformer/README.md b/configs/recognition/timesformer/README.md index df197e0ba9..6d8e148bd8 100644 --- a/configs/recognition/timesformer/README.md +++ b/configs/recognition/timesformer/README.md @@ -47,7 +47,7 @@ python tools/train.py configs/recognition/timesformer/timesformer_divST_8xb8-8x3 --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -64,7 +64,7 @@ python tools/test.py configs/recognition/timesformer/timesformer_divST_8xb8-8x32 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/tin/README.md b/configs/recognition/tin/README.md index 17a30d7b03..abadd02f4f 100644 --- a/configs/recognition/tin/README.md +++ b/configs/recognition/tin/README.md @@ -67,7 +67,7 @@ python tools/train.py configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1 --work-dir work_dirs/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb randomness.seed=0 randomness.deterministic=True ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -84,7 +84,7 @@ python tools/test.py configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x checkpoints/SOME_CHECKPOINT.pth --dump result.json ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
## Citation diff --git a/configs/recognition/tpn/README.md b/configs/recognition/tpn/README.md index 20a488ccb1..cb1af4b6b2 100644 --- a/configs/recognition/tpn/README.md +++ b/configs/recognition/tpn/README.md @@ -58,7 +58,7 @@ python tools/train.py configs/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_k --work-dir work_dirs/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb [--validate --seed 0 --deterministic] ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -75,7 +75,7 @@ python tools/test.py configs/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_ki checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/trn/README.md b/configs/recognition/trn/README.md index 875207dd43..323398acb4 100644 --- a/configs/recognition/trn/README.md +++ b/configs/recognition/trn/README.md @@ -52,7 +52,7 @@ python tools/train.py configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16- --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -69,7 +69,7 @@ python tools/test.py configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/tsm/README.md b/configs/recognition/tsm/README.md index 5e5162de83..97c1b33e34 100644 --- a/configs/recognition/tsm/README.md +++ b/configs/recognition/tsm/README.md @@ -58,7 +58,7 @@ python tools/train.py configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16- --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -75,7 +75,7 @@ python tools/test.py configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
## Citation diff --git a/configs/recognition/tsn/README.md b/configs/recognition/tsn/README.md index 1b6e34fdc1..61a65ace30 100644 --- a/configs/recognition/tsn/README.md +++ b/configs/recognition/tsn/README.md @@ -73,7 +73,7 @@ python tools/train.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32- --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -90,7 +90,7 @@ python tools/test.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py index 3bea4b9ca7..d48b403c02 100644 --- a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py +++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py @@ -58,7 +58,7 @@ ] train_dataloader = dict( - batch_size=32, + batch_size=4, num_workers=8, persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=True), @@ -99,4 +99,15 @@ # - `enable` means enable scaling LR automatically # or not by default. # - `base_batch_size` = (8 GPUs) x (32 samples per GPU). -auto_scale_lr = dict(enable=False, base_batch_size=256) +auto_scale_lr = dict(enable=True, base_batch_size=256) + +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=3) +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=10, + by_epoch=True, + milestones=[4, 8], + gamma=0.1) +] diff --git a/configs/recognition/uniformer/README.md b/configs/recognition/uniformer/README.md index 65c224ecc3..ff19fb4fb9 100644 --- a/configs/recognition/uniformer/README.md +++ b/configs/recognition/uniformer/README.md @@ -51,7 +51,7 @@ python tools/test.py configs/recognition/uniformer/uniformer-small_imagenet1k-pr checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/uniformerv2/README.md b/configs/recognition/uniformerv2/README.md index c69b69a662..73855f13f0 100644 --- a/configs/recognition/uniformerv2/README.md +++ b/configs/recognition/uniformerv2/README.md @@ -93,7 +93,7 @@ python tools/test.py configs/recognition/uniformerv2/uniformerv2-base-p16-res224 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
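The TSN Kinetics-400 config modified earlier in this patch (batch size 32 to 4, `auto_scale_lr.enable=True`, a 10-epoch `MultiStepLR` schedule) can be sanity-checked with a little arithmetic. The sketch below is not part of the patch; it assumes the linear scaling rule applied by MMEngine's `auto_scale_lr`, and the base learning rate of 0.01 is an assumed value taken from the inherited base config:

```python
# Not part of the patch: back-of-the-envelope check of the modified TSN schedule.
# Assumptions: base optimizer lr = 0.01 (from the inherited base config) and the
# linear scaling rule lr = base_lr * actual_batch_size / base_batch_size.
base_lr = 0.01
base_batch_size = 256      # 8 GPUs x 32 samples, as stated in the config comment
actual_batch_size = 1 * 4  # e.g. a single GPU with the new batch_size=4

scaled_lr = base_lr * actual_batch_size / base_batch_size
print(f'scaled lr: {scaled_lr:.4e}')  # 1.5625e-04

# MultiStepLR(milestones=[4, 8], gamma=0.1) then decays the scaled lr by epoch:
for epoch in range(10):
    decay = 0.1 ** sum(epoch >= m for m in (4, 8))
    print(f'epoch {epoch}: lr = {scaled_lr * decay:.4e}')
```

This matches the `lr: 1.5625e-04` values that appear in the quick-run training logs added later in this patch.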
## Citation diff --git a/configs/recognition/videomae/README.md b/configs/recognition/videomae/README.md index 65b353aff1..16cffc4840 100644 --- a/configs/recognition/videomae/README.md +++ b/configs/recognition/videomae/README.md @@ -47,7 +47,7 @@ python tools/test.py configs/recognition/videomae/vit-base-p16_videomae-k400-pre checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/x3d/README.md b/configs/recognition/x3d/README.md index a0b9a6f3f4..88d4be33e5 100644 --- a/configs/recognition/x3d/README.md +++ b/configs/recognition/x3d/README.md @@ -47,7 +47,7 @@ python tools/test.py configs/recognition/x3d/x3d_s_13x6x1_facebook-kinetics400-r checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition_audio/resnet/README.md b/configs/recognition_audio/resnet/README.md index be036d149e..f74f5c6ccc 100644 --- a/configs/recognition_audio/resnet/README.md +++ b/configs/recognition_audio/resnet/README.md @@ -46,7 +46,7 @@ python tools/train.py configs/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100 --cfg-options randomness.seed=0 randomness.deterministic=True ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -63,7 +63,7 @@ python tools/test.py configs/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/skeleton/2s-agcn/README.md b/configs/skeleton/2s-agcn/README.md index c61b6fe4e3..69ac0d5526 100644 --- a/configs/skeleton/2s-agcn/README.md +++ b/configs/skeleton/2s-agcn/README.md @@ -41,7 +41,7 @@ In skeleton-based action recognition, graph convolutional networks (GCNs), which | | four-stream | | | 90.89 | | | | | | | 1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size, and the original batch size. -2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/user_guides/useful_tools.md#multi-stream-fusion). +2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. 
For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/advanced_guides/useful_tools.md#multi-stream-fusion). ## Train @@ -58,7 +58,7 @@ python tools/train.py configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu6 --seed 0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -75,7 +75,7 @@ python tools/test.py configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/skeleton/posec3d/README.md b/configs/skeleton/posec3d/README.md index 0e45528345..93b526e5ac 100644 --- a/configs/skeleton/posec3d/README.md +++ b/configs/skeleton/posec3d/README.md @@ -101,7 +101,7 @@ python tools/train.py configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-k For training with your custom dataset, you can refer to [Custom Dataset Training](/configs/skeleton/posec3d/custom_dataset_training.md). -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -118,7 +118,7 @@ python tools/test.py configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-ke checkpoints/SOME_CHECKPOINT.pth ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/skeleton/posec3d/custom_dataset_training.md b/configs/skeleton/posec3d/custom_dataset_training.md index cb5b2f647f..81fc1cb3e1 100644 --- a/configs/skeleton/posec3d/custom_dataset_training.md +++ b/configs/skeleton/posec3d/custom_dataset_training.md @@ -2,7 +2,7 @@ We provide a step-by-step tutorial on how to train your custom dataset with PoseC3D. -1. First, you should know that action recognition with PoseC3D requires skeleton information only and for that you need to prepare your custom annotation files (for training and validation). To start with, you need to replace the placeholder `mmdet_root` and `mmpose_root` in `ntu_pose_extraction.py` with your installation path. Then you need to take advantage of [ntu_pose_extraction.py](https://github.com/open-mmlab/mmaction2/blob/90fc8440961987b7fe3ee99109e2c633c4e30158/tools/data/skeleton/ntu_pose_extraction.py) as shown in [Prepare Annotations](https://github.com/open-mmlab/mmaction2/blob/master/tools/data/skeleton/README.md#prepare-annotations) to extract 2D keypoints for each video in your custom dataset. The command looks like (assuming the name of your video is `some_video_from_my_dataset.mp4`): +1. First, you should know that action recognition with PoseC3D requires skeleton information only and for that you need to prepare your custom annotation files (for training and validation). To start with, you need to install MMDetection and MMPose. 
Then you need to take advantage of [ntu_pose_extraction.py](https://github.com/open-mmlab/mmaction2/blob/90fc8440961987b7fe3ee99109e2c633c4e30158/tools/data/skeleton/ntu_pose_extraction.py) as shown in [Prepare Annotations](https://github.com/open-mmlab/mmaction2/blob/master/tools/data/skeleton/README.md#prepare-annotations) to extract 2D keypoints for each video in your custom dataset. The command looks like (assuming the name of your video is `some_video_from_my_dataset.mp4`): ```shell # You can use the above command to generate pickle files for all of your training and validation videos. diff --git a/configs/skeleton/stgcn/README.md b/configs/skeleton/stgcn/README.md index dee9f46dfb..c8d23a1a05 100644 --- a/configs/skeleton/stgcn/README.md +++ b/configs/skeleton/stgcn/README.md @@ -63,7 +63,7 @@ Dynamics of human body skeletons convey significant information for human action | | four-stream | | | 86.19 | | | | | | | 1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size, and the original batch size. -2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/user_guides/useful_tools.md#multi-stream-fusion). +2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/advanced_guides/useful_tools.md#multi-stream-fusion). ## Train @@ -80,7 +80,7 @@ python tools/train.py configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xs --seed 0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -97,7 +97,7 @@ python tools/test.py configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsu checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/skeleton/stgcnpp/README.md b/configs/skeleton/stgcnpp/README.md index 655b067a60..3eec28036c 100644 --- a/configs/skeleton/stgcnpp/README.md +++ b/configs/skeleton/stgcnpp/README.md @@ -35,7 +35,7 @@ We present PYSKL: an open-source toolbox for skeleton-based action recognition b | | four-stream | | | 91.87 | | | | | | | 1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size, and the original batch size. -2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. 
For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/user_guides/useful_tools.md#multi-stream-fusion). +2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/advanced_guides/useful_tools.md#multi-stream-fusion). ## Train @@ -52,7 +52,7 @@ python tools/train.py configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu6 --seed 0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -69,7 +69,7 @@ python tools/test.py configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/docs/en/advanced_guides/customize_dataset.md b/docs/en/advanced_guides/customize_dataset.md new file mode 100644 index 0000000000..31a6e16b2b --- /dev/null +++ b/docs/en/advanced_guides/customize_dataset.md @@ -0,0 +1,122 @@ +# Customize Datasets + +In this tutorial, we will introduce some methods about how to customize your own dataset by online conversion. + +- [Customize Datasets](#customize-datasets) + - [General understanding of the Dataset in MMAction2](#general-understanding-of-the-dataset-in-mmaction2) + - [Customize new datasets](#customize-new-datasets) + - [Customize keypoint format for PoseDataset](#customize-keypoint-format-for-posedataset) + +## General understanding of the Dataset in MMAction2 + +MMAction2 provides specific Dataset class according to the task, e.g. `VideoDataset`/`RawframeDataset` for action recognition, `AVADataset` for spatio-temporal action detection, `PoseDataset` for skeleton-based action recognition. All these specific datasets only need to implement `get_data_info(self, idx)` to build a data list from the annotation file, while other functions are handled by the superclass. The following table shows the inherent relationship and the main function of the modules. + +| Class Name | Functions | +| ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| MMAction2::VideoDataset | `load_data_list(self)`
Build data list from the annotation file. | +| MMAction2::BaseActionDataset | `get_data_info(self, idx)`
Given the `idx`, return the corresponding data sample from data list | +| MMEngine::BaseDataset | `__getitem__(self, idx)`
Given the `idx`, call `get_data_info` to get data sample, then call the `pipeline` to perform transforms and augmentation in `train_pipeline` or `val_pipeline` | + +## Customize new datasets + +For most scenarios, we don't need to customize a new dataset class, offline conversion is recommended way to use your data. But customizing a new dataset class is also easy in MMAction2. As above mentioned, a dataset for a specific task usually only needs to implement `load_data_list(self)` to generate the data list from the annotation file. It is worth noting that elements in the `data_list` are `dict` with fields required in the following pipeline. + +Take `VideoDataset` as an example, `train_pipeline`/`val_pipeline` requires `'filename'` in `DecordInit` and `'label'` in `PackActionInput`, so data samples in the data list have 2 fields: `'filename'` and `'label'`. +you can refer to [customize pipeline](customize_pipeline.md) for more details about the pipeline. + +``` +data_list.append(dict(filename=filename, label=label)) +``` + +While `AVADataset` is more complex, elements in the data list consist of several fields about video data, and it further overwrites `get_data_info(self, idx)` to convert keys, which are required in spatio-temporal action detection pipeline. + +```python + +class AVADataset(BaseActionDataset): + ... + + def load_data_list(self) -> List[dict]: + ... + video_info = dict( + frame_dir=frame_dir, + video_id=video_id, + timestamp=int(timestamp), + img_key=img_key, + shot_info=shot_info, + fps=self._FPS, + ann=ann) + data_list.append(video_info) + data_list.append(video_info) + return data_list + + def get_data_info(self, idx: int) -> dict: + ... + ann = data_info.pop('ann') + data_info['gt_bboxes'] = ann['gt_bboxes'] + data_info['gt_labels'] = ann['gt_labels'] + data_info['entity_ids'] = ann['entity_ids'] + return data_info +``` + +## Customize keypoint format for PoseDataset + +MMAction2 currently supports three kinds of keypoint formats: `coco`, `nturgb+d` and `openpose`. If your use one of them, just specify the corresponding format in the following modules: + +For Graph Convolutional Networks, such as AAGCN, STGCN... + +- transform: argument `dataset` in `JointToBone`. +- backbone: argument `graph_cfg` in Graph Convolutional Networks. + +And for PoseC3D: + +- transform: In `Flip`, specify `left_kp` and `right_kp` according to the keypoint symmetrical relationship, or remove the transform for asymmetric keypoints structure. +- transform: In `GeneratePoseTarget`, specify `skeletons`, `left_limb`, `right_limb` if `with_limb` is `true`, and `left_kp`, `right_kp` if `with_kp` is `true`. + +For a custom format, you need to add a new graph layout into models and transforms, which defines the keypoints and their connection relationship. + +Take the coco dataset as an example, we define a layout named `coco` in `Graph`, and set its `inward` as followed, which includes all connections between nodes, each connection is a pair of nodes from far to near. The order of connections does not matter. Other settings about coco are to set the number of nodes to 17, and set node 0 as the center node. + +```python + +self.num_node = 17 +self.inward = [(15, 13), (13, 11), (16, 14), (14, 12), (11, 5), + (12, 6), (9, 7), (7, 5), (10, 8), (8, 6), (5, 0), + (6, 0), (1, 0), (3, 1), (2, 0), (4, 2)] +self.center = 0 +``` + +Similarly, we define the `pairs` in `JointToBone`, adding a bone of `(0, 0)` to align the number of bones to the nodes. 
The `pairs` of coco dataset is as followed, same as above mentioned, the order of pairs does not matter. + +```python + +self.pairs = ((0, 0), (1, 0), (2, 0), (3, 1), (4, 2), (5, 0), + (6, 0), (7, 5), (8, 6), (9, 7), (10, 8), (11, 0), + (12, 0), (13, 11), (14, 12), (15, 13), (16, 14)) +``` + +For your custom format, just define the above setting as your graph structure, and specify in your config file as followed, we take `STGCN` as an example, assuming you already define a `custom_dataset` in `Graph` and `JointToBone`, and num_classes is n. + +```python + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='STGCN', graph_cfg=dict(layout='custom_dataset', mode='stgcn_spatial')), + cls_head=dict(type='GCNHead', num_classes=n, in_channels=256)) + +train_pipeline = [ + ... + dict(type='GenSkeFeat', dataset='custom_dataset'), + ...] + +val_pipeline = [ + ... + dict(type='GenSkeFeat', dataset='custom_dataset'), + ...] + +test_pipeline = [ + ... + dict(type='GenSkeFeat', dataset='custom_dataset'), + ...] + +``` diff --git a/docs/en/advanced_guides/customize_logging.md b/docs/en/advanced_guides/customize_logging.md new file mode 100644 index 0000000000..aabaad949f --- /dev/null +++ b/docs/en/advanced_guides/customize_logging.md @@ -0,0 +1,163 @@ +# Customize Logging + +MMAction2 produces a lot of logs during the running process, such as loss, iteration time, learning rate, etc. In this section, we will introduce you how to output custom log. More details about the logging system, please refer to [MMEngine](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/logging.html). + +- [Customize Logging](#customize-logging) + - [Flexible Logging System](#flexible-logging-system) + - [Customize log](#customize-log) + - [Export the debug log](#export-the-debug-log) + +## Flexible Logging System + +MMAction2 configures the logging system by LogProcessor in [default_runtime](/configs/_base_/default_runtime.py) in default, which is equivalent to: + +```python +log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True) +``` + +Defaultly, LogProcessor catches all filed start with `loss` return by `model.forward`. For example in the following model, `loss1` and `loss2` will be logged automatically without additional configuration. + +```python +from mmengine.model import BaseModel + +class ToyModel(BaseModel): + def __init__(self) -> None: + super().__init__() + self.linear = nn.Linear(1, 1) + + def forward(self, img, label, mode): + feat = self.linear(img) + loss1 = (feat - label).pow(2) + loss2 = (feat - label).abs() + return dict(loss1=loss1, loss2=loss2) +``` + +The format of the output log is as followed: + +``` +08/21 02:58:41 - mmengine - INFO - Epoch(train) [1][10/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0019 data_time: 0.0004 loss1: 0.8381 loss2: 0.9007 loss: 1.7388 +08/21 02:58:41 - mmengine - INFO - Epoch(train) [1][20/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0029 data_time: 0.0010 loss1: 0.1978 loss2: 0.4312 loss: 0.6290 +``` + +LogProcessor will output the log in the following format: + +- The prefix of the log: + - epoch mode(`by_epoch=True`): `Epoch(train) [{current_epoch}/{current_iteration}]/{dataloader_length}` + - iteration mode(`by_epoch=False`): `Iter(train) [{current_iteration}/{max_iteration}]` +- Learning rate (`lr`): The learning rate of the last iteration. +- Time: + - `time`: The averaged time for inference of the last `window_size` iterations. + - `data_time`: The averaged time for loading data of the last `window_size` iterations. 
+ - `eta`: The estimated time of arrival to finish the training. +- Loss: The averaged loss output by model of the last `window_size` iterations. + +```{warning} +log_processor outputs the epoch based log by default(`by_epoch=True`). To get an expected log matched with the `train_cfg`, we should set the same value for `by_epoch` in `train_cfg` and `log_processor`. +``` + +Based on the rules above, the code snippet will count the average value of the loss1 and the loss2 every 20 iterations. More types of statistical methods, please refer to [MMEngine.LogProcessor](mmengine.runner.LogProcessor). + +## Customize log + +The logging system could not only log the loss, lr, .etc but also collect and output the custom log. For example, if we want to statistic the intermediate loss: + +The `ToyModel` calculate `loss_tmp` in forward, but don't save it into the return dict. + +```python +from mmengine.logging import MessageHub + +class ToyModel(BaseModel): + + def __init__(self) -> None: + super().__init__() + self.linear = nn.Linear(1, 1) + + def forward(self, img, label, mode): + feat = self.linear(img) + loss_tmp = (feat - label).abs() + loss = loss_tmp.pow(2) + + message_hub = MessageHub.get_current_instance() + # update the intermediate `loss_tmp` in the message hub + message_hub.update_scalar('train/loss_tmp', loss_tmp.sum()) + return dict(loss=loss) +``` + +Add the `loss_tmp` into the config: + +```python +log_processor = dict( + type='LogProcessor', + window_size=20, + by_epoch=True, + custom_cfg=[ + # statistic the loss_tmp with the averaged value + dict( + data_src='loss_tmp', + window_size=20, + method_name='mean') + ]) +``` + +The `loss_tmp` will be added to the output log: + +``` +08/21 03:40:31 - mmengine - INFO - Epoch(train) [1][10/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0026 data_time: 0.0008 loss_tmp: 0.0097 loss: 0.0000 +08/21 03:40:31 - mmengine - INFO - Epoch(train) [1][20/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0028 data_time: 0.0013 loss_tmp: 0.0065 loss: 0.0000 +``` + +## Export the debug log + +To export the debug log to the `work_dir`, you can set log_level in config file as followed: + +``` +log_level='DEBUG' +``` + +``` +08/21 18:16:22 - mmengine - DEBUG - Get class `LocalVisBackend` from "vis_backend" registry in "mmengine" +08/21 18:16:22 - mmengine - DEBUG - An `LocalVisBackend` instance is built from registry, its implementation can be found in mmengine.visualization.vis_backend +08/21 18:16:22 - mmengine - DEBUG - Get class `RuntimeInfoHook` from "hook" registry in "mmengine" +08/21 18:16:22 - mmengine - DEBUG - An `RuntimeInfoHook` instance is built from registry, its implementation can be found in mmengine.hooks.runtime_info_hook +08/21 18:16:22 - mmengine - DEBUG - Get class `IterTimerHook` from "hook" registry in "mmengine" +... +``` + +Besides, logs of different ranks will be saved in `debug` mode if you are training your model with the shared storage. The hierarchy of the log is as follows: + +```text +./tmp +├── tmp.log +├── tmp_rank1.log +├── tmp_rank2.log +├── tmp_rank3.log +├── tmp_rank4.log +├── tmp_rank5.log +├── tmp_rank6.log +└── tmp_rank7.log +... +└── tmp_rank63.log +``` + +The log of Multiple machines with independent storage: + +```text +# device: 0: +work_dir/ +└── exp_name_logs + ├── exp_name.log + ├── exp_name_rank1.log + ├── exp_name_rank2.log + ├── exp_name_rank3.log + ... + └── exp_name_rank7.log + +# device: 7: +work_dir/ +└── exp_name_logs + ├── exp_name_rank56.log + ├── exp_name_rank57.log + ├── exp_name_rank58.log + ... 
+ └── exp_name_rank63.log +``` diff --git a/docs/en/advanced_guides/customize_models.md b/docs/en/advanced_guides/customize_models.md new file mode 100644 index 0000000000..3d8c0e1d4e --- /dev/null +++ b/docs/en/advanced_guides/customize_models.md @@ -0,0 +1 @@ +# Customize Models diff --git a/docs/en/advanced_guides/customize_optimizer.md b/docs/en/advanced_guides/customize_optimizer.md index d69aa0ff90..d862b9632c 100644 --- a/docs/en/advanced_guides/customize_optimizer.md +++ b/docs/en/advanced_guides/customize_optimizer.md @@ -4,8 +4,19 @@ In this tutorial, we will introduce some methods about how to build the optimize - [Customize Optimizer](#customize-optimizer) - [Build optimizers using optim_wrapper](#build-optimizers-using-optim_wrapper) + - [Use optimizers supported by PyTorch](#use-optimizers-supported-by-pytorch) + - [Parameter-wise finely configuration](#parameter-wise-finely-configuration) + - [Gradient clipping](#gradient-clipping) + - [Gradient accumulation](#gradient-accumulation) - [Customize parameter schedules](#customize-parameter-schedules) + - [Customize learning rate schedules](#customize-learning-rate-schedules) + - [Customize momentum schedules](#customize-momentum-schedules) - [Add new optimizers or constructors](#add-new-optimizers-or-constructors) + - [Add new optimizers](#add-new-optimizers) + - [1. Implement a new optimizer](#1-implement-a-new-optimizer) + - [2. Import the optimizer](#2-import-the-optimizer) + - [3. Specify the optimizer in the config file](#3-specify-the-optimizer-in-the-config-file) + - [Add new optimizer constructors](#add-new-optimizer-constructors) ## Build optimizers using optim_wrapper diff --git a/docs/en/advanced_guides/customize_pipeline.md b/docs/en/advanced_guides/customize_pipeline.md index 719f806d3f..632216ba10 100644 --- a/docs/en/advanced_guides/customize_pipeline.md +++ b/docs/en/advanced_guides/customize_pipeline.md @@ -3,8 +3,11 @@ In this tutorial, we will introduce some methods about how to build the data pipeline (i.e., data transformations)for your tasks. - [Customize Data Pipeline](#customize-data-pipeline) - - [Design of Dataset and Data pipelines](#design-of-dataset-and-data-pipelines) - - [Modify the training/test pipeline](#modify-the-training/test-pipeline) + - [Design of Data pipelines](#design-of-data-pipelines) + - [Modify the training/test pipeline](#modify-the-trainingtest-pipeline) + - [Loading](#loading) + - [Sampling frames and other processing](#sampling-frames-and-other-processing) + - [Formatting](#formatting) - [Add new data transforms](#add-new-data-transforms) ## Design of Data pipelines diff --git a/docs/en/advanced_guides/dataflow.md b/docs/en/advanced_guides/dataflow.md new file mode 100644 index 0000000000..0cc136162a --- /dev/null +++ b/docs/en/advanced_guides/dataflow.md @@ -0,0 +1 @@ +# Dataflow in MMAction2 diff --git a/docs/en/advanced_guides/depoly.md b/docs/en/advanced_guides/depoly.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/en/notes/contribution_guide.md b/docs/en/get_started/contribution_guide.md similarity index 93% rename from docs/en/notes/contribution_guide.md rename to docs/en/get_started/contribution_guide.md index f9d96c75a5..02f2aa35d4 100644 --- a/docs/en/notes/contribution_guide.md +++ b/docs/en/get_started/contribution_guide.md @@ -1,10 +1,11 @@ -# Contributing to MMAction2 +# How to contribute to MMAction2 All kinds of contributions are welcome, including but not limited to the following. 
- Fixes (typo, bugs) - New features and components - Add documentation or translate the documentation into other languages +- Add new project (Recommended) about video understanding algorithm with less restriction, refer to [here](/projects/README.md) for details ## Workflow diff --git a/docs/en/notes/faq.md b/docs/en/get_started/faq.md similarity index 99% rename from docs/en/notes/faq.md rename to docs/en/get_started/faq.md index 4f028d5b4c..2cbe7787b3 100644 --- a/docs/en/notes/faq.md +++ b/docs/en/get_started/faq.md @@ -88,7 +88,7 @@ If the contents here do not cover your issue, please create an issue using the [ - **How to set `load_from` value in config files to finetune models?** - In MMAction2, We set `load_from=None` as default in `configs/_base_/default_runtime.py` and owing to [inheritance design](/docs/en/user_guides/1_config.md), + In MMAction2, We set `load_from=None` as default in `configs/_base_/default_runtime.py` and owing to [inheritance design](/docs/en/user_guides/config.md), users can directly change it by setting `load_from` in their configs. ## Testing diff --git a/docs/en/guide_to_framework.md b/docs/en/get_started/guide_to_framework.md similarity index 100% rename from docs/en/guide_to_framework.md rename to docs/en/get_started/guide_to_framework.md diff --git a/docs/en/get_started.md b/docs/en/get_started/installation.md similarity index 95% rename from docs/en/get_started.md rename to docs/en/get_started/installation.md index 0f0ac1c5ec..9d48be6030 100644 --- a/docs/en/get_started.md +++ b/docs/en/get_started/installation.md @@ -1,4 +1,6 @@ -# Prerequisites +# Installation + +## Prerequisites In this section we demonstrate how to prepare an environment with PyTorch. @@ -35,12 +37,10 @@ On CPU platforms: conda install pytorch torchvision cpuonly -c pytorch ``` -# Installation +## Best Practices We recommend that users follow our best practices to install MMAction2. However, the whole process is highly customizable. See [Customize Installation](#customize-installation) section for more information. -## Best Practices - **Step 1.** Install [MMEngine](https://github.com/open-mmlab/mmengine) and [MMCV](https://github.com/open-mmlab/mmcv) using [MIM](https://github.com/open-mmlab/mim). ```shell @@ -52,10 +52,10 @@ mim install mmengine 'mmcv>=2.0.0rc1' According to your needs, we support two install modes: -- [Install from source (Recommended)](#install-from-source): You want to develop your own action recognition task or new features on MMAction2 framework. For example, adding new dataset or new models. Thus, you can use all tools we provided. +- [Install from source (Recommended)](#build-mmaction2-from-source): You want to develop your own action recognition task or new features on MMAction2 framework. For example, adding new dataset or new models. Thus, you can use all tools we provided. - [Install as a Python package](#install-as-a-python-package): You just want to call MMAction2's APIs or import MMAction2's modules in your project. -### Install from source +### Build MMAction2 from source In this case, install mmaction2 from source: @@ -193,3 +193,7 @@ Run it with ```shell docker run --gpus all --shm-size=8g -it -v {DATA_DIR}:/mmaction2/data mmaction2 ``` + +## Troubleshooting + +coming soon... 
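Since the Troubleshooting section is still a placeholder, a quick post-installation sanity check can be handy. The snippet below is an illustrative aside (not part of this patch) that only assumes the packages installed above:

```python
# Illustrative post-installation check: print library versions and CUDA visibility.
import torch
import mmcv
import mmengine
import mmaction

print('mmaction2:', mmaction.__version__)
print('mmcv     :', mmcv.__version__)
print('mmengine :', mmengine.__version__)
print('torch    :', torch.__version__, '| CUDA available:', torch.cuda.is_available())
```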
diff --git a/docs/en/get_started/overview.md b/docs/en/get_started/overview.md new file mode 100644 index 0000000000..4857991711 --- /dev/null +++ b/docs/en/get_started/overview.md @@ -0,0 +1,97 @@ +# Overview + +## What is MMAction2 + +MMAction2 is an open source toolkit based on PyTorch, supporting numerous video understanding models, including action recognition, skeleton-based action recognition, spatio-temporal action detection and temporal action localization. In addition, it supports widely-used academic datasets and provides many useful tools, assisting users in exploring various aspects of models and datasets and implementing high-quality algorithms. Generally, it has the following features. + +One-stop, Multi-model: MMAction2 supports various video understanding tasks and implements the latest models for action recognition, localization, detection. + +Modular Design: MMAction2’s modular design allows users to define and reuse modules in the model on demand. + +Various Useful Tools: MMAction2 provides many analysis tools, including visualizers, validation scripts, evaluators, etc., to help users troubleshoot, finetune or compare models. + +Powered by OpenMMLab: Like other algorithm libraries in OpenMMLab family, MMAction2 follows OpenMMLab’s rigorous development guidelines and interface conventions, significantly reducing the learning cost of users familiar with other projects in OpenMMLab family. In addition, benefiting from the unified interfaces among OpenMMLab, you can easily call the models implemented in other OpenMMLab projects (e.g. MMClassification) in MMAction2, facilitating cross-domain research and real-world applications. + + + + +
+ *(Demo GIFs: Action Recognition, Skeleton-based Action Recognition, Spatio-Temporal Action Detection, Spatio-Temporal Action Detection)*
+ +## How to use the documentation + +We have prepared a wealth of documents to meet your various needs: + +
+For the basic usage of MMAction2 + +- [Installation](docs/en/get_started/installation.md) +- [Quick Run](docs/en/get_started/quick_run.md) +- [Inference](docs/en/user_guides/Inference.md) + +
+ +
+For training on supported datasets + - [Learn about configs](docs/en/user_guides/config.md) + - [Prepare dataset](docs/en/user_guides/prepare_dataset.md) + - [Training and testing](docs/en/user_guides/train_test.md) + +
+ +
+For looking for some common issues + +- [FAQs](docs/en/get_started/faq.md) +- [Useful tools](docs/en/useful_tools.md) + +
+ +
+For a general understanding about MMAction2 + +- [20-minute tour to MMAction2](docs/en/get_started/20-minute_tour.md) +- [Data flow in MMAction2](docs/en/advanced_guides/dataflow.md) + +
+ +
+For advanced usage about custom training + +- [Customize models](docs/en/advanced_guides/customize_models.md) +- [Customize datasets](docs/en/advanced_guides/customize_dataset.md) +- [Customize data transformation and augmentation](docs/en/advanced_guides/customize_pipeline.md) +- [Customize optimizer and scheduler](docs/en/advanced_guides/customize_optimizer.md) +- [Customize logging](docs/en/advanced_guides/customize_logging.md) + +
+ +
+For supported model zoo and dataset zoo + +- [Model Zoo](model_zoo/modelzoo.md) +- [Dataset Zoo](datasetzoo.md) + +
+ +
+For migration from MMAction2 0.x + +- [Migration](migration.md) + +
+ +
+For researchers and developers who are willing to contribute to MMAction2 + +- [Contribution Guide](get_started/contribution_guide.md) + +
diff --git a/docs/en/get_started/quick_run.md b/docs/en/get_started/quick_run.md new file mode 100644 index 0000000000..84ae5b985f --- /dev/null +++ b/docs/en/get_started/quick_run.md @@ -0,0 +1,221 @@ +# Quick Run + +This chapter will take you through the basic functions of MMAction2. And we assume you [installed MMAction2 from source](../installation#best-practices). + +- [Quick Run](#quick-run) + - [Inference](#inference) + - [Prepare a Dataset](#prepare-a-dataset) + - [Modify the Config](#modify-the-config) + - [Modify Dataset](#modify-dataset) + - [Modify Runtime Config](#modify-runtime-config) + - [Modify Model Config](#modify-model-config) + - [Browse the Dataset](#browse-the-dataset) + - [Training](#training) + - [Testing](#testing) + +## Inference + +Run the following in MMAction2's root directory: + +```shell +python demo/demo_inferencer.py demo/demo.mp4 \ + --rec tsn --print-result \ + --label-file tools/data/kinetics/label_map_k400.txt +``` + +You should be able to see a pop-up video and the inference result printed out in the console. + +
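For readers who prefer the Python API to the CLI demo above, roughly the same prediction can be obtained with the recognizer helpers in `mmaction.apis`. This is a sketch, not part of the patch; the checkpoint path is a placeholder, and any recognizer config/checkpoint pair from the model zoo should work:

```python
# Sketch only: a Python-API counterpart of the CLI demo (paths are placeholders).
from mmaction.apis import inference_recognizer, init_recognizer

config = 'configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py'
checkpoint = 'checkpoints/SOME_CHECKPOINT.pth'  # placeholder

model = init_recognizer(config, checkpoint, device='cuda:0')  # or 'cpu'
result = inference_recognizer(model, 'demo/demo.mp4')
print(result)  # a data sample holding the predicted scores and label
```

For reference, the result printed to the console by the CLI demo above looks like the following block.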
+ +```bash +# Inference result +{'predictions': [{'rec_labels': [[6]], 'rec_scores': [[...]]}]} +``` + +```{note} +If you are running MMAction2 on a server without GUI or via SSH tunnel with X11 forwarding disabled, you may not see the pop-up window. +``` + +A detailed description of MMAction2's inference interface can be found [here](/demo/README#inferencer) + +In addition to using our well-provided pre-trained models, you can also train models on your own datasets. In the next section, we will take you through the basic functions of MMAction2 by training TSN on the tiny [Kinetics](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) dataset as an example. + +## Prepare a Dataset + +Since the variety of video dataset formats are not conducive to switching datasets, MMAction2 proposes a uniform [data format](../user_guides/2_data_prepare.md), and provides [dataset preparer](../user_guides/data_prepare/dataset_preparer.md) for commonly used video datasets. Usually, to use those datasets in MMAction2, you just need to follow the steps to get them ready for use. + +```{note} +But here, efficiency means everything. +``` + +Here, we have prepared a lite version of Kinetics dataset for demonstration purposes. Download our pre-prepared [zip](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) and extract it to the `data/` directory under mmaction2 to get our prepared video and annotation file. + +```Bash +wget https://download.openmmlab.com/mmaction/kinetics400_tiny.zip +mkdir -p data/ +unzip kinetics400_tiny.zip -d data/ +``` + +## Modify the Config + +Once the dataset is prepared, we will then specify the location of the training set and the training parameters by modifying the config file. + +In this example, we will train a TSN using resnet50 as its backbone. Since MMAction2 already has a config file for the full Kinetics400 dataset (`configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py`), we just need to make some modifications on top of it. + +### Modify Dataset + +We first need to modify the path to the dataset. Open `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` and replace keys as followed: + +```Python +data_root = 'data/kinetics400_tiny/train' +data_root_val = 'data/kinetics400_tiny/val' +ann_file_train = 'data/kinetics400_tiny/kinetics_tiny_train_video.txt' +ann_file_val = 'data/kinetics400_tiny/kinetics_tiny_val_video.txt' +``` + +### Modify Runtime Config + +Also, because of the reduced dataset size, we'd better reduce training batchsize to 4 and the number of training epochs to 10 accordingly, shorten the validation interval as well as the weight storage interval to 1 rounds, and modify the learning rate decay strategy. Modify corresponding keys in `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` as following lines to take effect. 
+
+```Python
+# set the training batch size to 4
+train_dataloader['batch_size'] = 4
+
+# Save a checkpoint every 3 epochs, and only keep the latest checkpoint
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', interval=3, max_keep_ckpts=1,),
+    )
+# Set the maximum number of epochs to 10, and validate the model every 3 epochs
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=3)
+# adjust the learning rate schedule for the 10-epoch run
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=10,
+        by_epoch=True,
+        milestones=[4, 8],
+        gamma=0.1)
+]
+```
+
+### Modify Model Config
+
+Further, due to the small size of the tiny Kinetics dataset, we had better load a model pre-trained on the original Kinetics dataset. We also need to modify the model according to the actual number of classes. Just directly put the following lines into `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py`.
+
+```Python
+
+model = dict(
+    cls_head=dict(num_classes=2))
+load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth'
+```
+
+Here, we have rewritten the corresponding parameters in the base configuration directly through the inheritance mechanism of the config ({external+mmengine:doc}`MMEngine: Config `). The original fields are distributed in `configs/_base_/models/tsn_r50.py`, `configs/_base_/schedules/sgd_100e.py` and `configs/_base_/default_runtime.py`.
+
+```{note}
+For a more detailed description of config, please refer to [here](../user_guides/config.md).
+```
+
+## Browse the Dataset
+
+Before we start the training, we can also visualize the frames processed by training-time [data transforms](<>). It's quite simple: pass the config file we want to visualize into the [browse_dataset.py](/tools/visualizations/browse_dataset.py) script.
+
+```Bash
+python tools/visualizations/browse_dataset.py \
+    configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \
+    browse_out --mode pipeline
+```
+
+The transformed videos will be saved to the `browse_out` folder.
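+
+If you also want to inspect the raw frames before any transform is applied, the same script can usually be pointed at a different display mode. A minimal sketch, assuming the script supports the `original` mode offered by similar browse tools in other OpenMMLab repositories:
+
+```Bash
+# Save the untransformed frames for comparison
+# (assumes `--mode original` is available in this script).
+python tools/visualizations/browse_dataset.py \
+    configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \
+    browse_out_original --mode original
+```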
+ +
+ +```{note} +For details on the parameters and usage of this script, please refer to [here](../user_guides/useful_tools.md). +``` + +```{tip} +In addition to satisfying our curiosity, visualization can also help us check the parts that may affect the model's performance before training, such as problems in configs, datasets and data transforms. +``` + +we can further visualize the learning rate schedule to make sure that the config is as expected by following script: + +```Bash +python tools/visualizations/vis_scheduler.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py +``` + +The training learning rate schedule will be displayed in a pop-up window. + +
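+
+On a machine without a display (see the earlier note about headless servers), the plot can be written to disk instead of shown in a window. A minimal sketch, assuming the script exposes a `--save-path` option like its counterparts in other OpenMMLab toolboxes:
+
+```Bash
+# Save the learning rate curve to an image file instead of opening a window
+# (assumes `--save-path` is available in this script).
+python tools/visualizations/vis_scheduler.py \
+    configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \
+    --save-path lr_schedule.png
+```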
+ +
+
+```{note}
+The learning rate is automatically scaled according to the actual batch size.
+```
+
+## Training
+
+Start the training by running the following command:
+
+```Bash
+python tools/train.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py
+```
+
+Depending on the system environment, MMAction2 will automatically use the best device for training. If a GPU is available, single-GPU training is started by default. When you start to see the output of the losses, you have successfully started the training.
+
+```Bash
+03/24 16:36:15 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608
+03/24 16:36:15 - mmengine - INFO - Epoch(train) [1][8/8] lr: 1.5625e-04 eta: 0:00:15 time: 0.2151 data_time: 0.0845 memory: 1314 grad_norm: 8.5647 loss: 0.7267 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.7267
+03/24 16:36:16 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608
+03/24 16:36:16 - mmengine - INFO - Epoch(train) [2][8/8] lr: 1.5625e-04 eta: 0:00:12 time: 0.1979 data_time: 0.0717 memory: 1314 grad_norm: 8.4709 loss: 0.7130 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.7130
+03/24 16:36:18 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608
+03/24 16:36:18 - mmengine - INFO - Epoch(train) [3][8/8] lr: 1.5625e-04 eta: 0:00:10 time: 0.1691 data_time: 0.0478 memory: 1314 grad_norm: 8.2910 loss: 0.6900 top1_acc: 0.5000 top5_acc: 1.0000 loss_cls: 0.6900
+03/24 16:36:18 - mmengine - INFO - Saving checkpoint at 3 epochs
+03/24 16:36:19 - mmengine - INFO - Epoch(val) [3][1/1] acc/top1: 0.9000 acc/top5: 1.0000 acc/mean1: 0.9000data_time: 1.2716 time: 1.3658
+03/24 16:36:20 - mmengine - INFO - The best checkpoint with 0.9000 acc/top1 at 3 epoch is saved to best_acc/top1_epoch_3.pth.
+```
+
+Without extra configurations, model weights will be saved to `work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/`, while the logs will be stored in `work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/TIMESTAMP/`. Next, we just need to wait with some patience for training to finish.
+
+```{note}
+For advanced usage of training, such as CPU training, multi-GPU training, and cluster training, please refer to [Training and Testing](../user_guides/train_test.md).
+```
+
+## Testing
+
+After 10 epochs, we observe that TSN performs best in the 6th epoch, with `acc/top1` reaching 1.0000:
+
+```Bash
+03/24 16:36:25 - mmengine - INFO - Epoch(val) [6][1/1] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 1.0000data_time: 1.0210 time: 1.1091
+```
+
+```{note}
+The result is quite high because the model was pre-trained on the original Kinetics400 dataset; you may see a different result.
+```
+
+However, this value only reflects the validation performance of TSN on the mini Kinetics dataset, while test results are usually higher due to the additional augmentation in the test pipeline.
+
+Start testing:
+
+```Bash
+python tools/test.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \
+    work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/best_acc/top1_epoch_6.pth
+```
+
+And you will get outputs like:
+
+```Bash
+03/24 17:00:59 - mmengine - INFO - Epoch(test) [10/10] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 0.9000data_time: 0.0420 time: 1.0795
+```
+
+The model achieves a top-1 accuracy of 1.0000 on this dataset.
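+
+If several GPUs are available, the same evaluation can be launched with the distributed test script. A minimal sketch, assuming 8 GPUs and the standard `tools/dist_test.sh` launcher:
+
+```Bash
+# Distributed testing on 8 GPUs (adjust the last argument to your GPU count).
+bash tools/dist_test.sh configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \
+    work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/best_acc/top1_epoch_6.pth 8
+```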
+ +```{note} +For advanced usage of testing, such as CPU testing, multi-GPU testing, and cluster testing, please refer to [Training and Testing](../user_guides/train_test.md). +``` diff --git a/docs/en/index.rst b/docs/en/index.rst index 392b64ef45..73a4590f00 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -7,24 +7,38 @@ You can switch between Chinese and English documents in the lower-left corner of :maxdepth: 1 :caption: Get Started - get_started.md - guide_to_framework.md + get_started/overview.md + get_started/installation.md + get_started/quick_run.md + get_started/guide_to_framework.md + get_started/contribution_guide.md + get_started/faq.md .. toctree:: :maxdepth: 1 :caption: User Guides - user_guides/1_config.md - user_guides/2_data_prepare.md - user_guides/3_inference.md - user_guides/4_train_test.md + user_guides/Inference.md + user_guides/config.md + user_guides/train_test.md + user_guides/prepare_dataset.md .. toctree:: :maxdepth: 1 - :caption: Useful Tools + :caption: Advanced Guides - user_guides/useful_tools.md - user_guides/visualization.md + advanced_guides/dataflow.md + advanced_guides/customize_models.md + advanced_guides/customize_dataset.md + advanced_guides/customize_pipeline.md + advanced_guides/customize_optimizer.md + advanced_guides/customize_logging.md + advanced_guides/deploy.md + +.. toctree:: + :maxdepth: 1 + :caption: Advanced Guides + useful_tools.md .. toctree:: :maxdepth: 1 @@ -36,20 +50,32 @@ You can switch between Chinese and English documents in the lower-left corner of :maxdepth: 1 :caption: Model Zoo - modelzoo.md - recognition_models.md - detection_models.md - skeleton_models.md - localization_models.md + model_zoo/modelzoo.md + model_zoo/recognition_models.md + model_zoo/detection_models.md + model_zoo/skeleton_models.md + model_zoo/localization_models.md + + +.. toctree:: + :maxdepth: 1 + :caption: Dataset Zoo + + datasetzoo_overview.md + datasetzoo.md + +.. toctree:: + :maxdepth: 1 + :caption: Projects + + projectzoo.md .. toctree:: :maxdepth: 1 :caption: Notes - notes/contribution_guide.md - notes/projects.md + notes/ecosystem.md notes/changelog.md - notes/faq.md .. 
toctree:: :caption: Switch Language diff --git a/docs/en/merge_docs.sh b/docs/en/merge_docs.sh index aa2a9bebfd..5a3c86b7ac 100644 --- a/docs/en/merge_docs.sh +++ b/docs/en/merge_docs.sh @@ -1,8 +1,45 @@ #!/usr/bin/env bash -## gather models -cat ../../configs/localization/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Localization Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > localization_models.md -cat ../../configs/recognition/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Recognition Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > recognition_models.md -cat ../../configs/recognition_audio/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" >> recognition_models.md -cat ../../configs/detection/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Spatio Temporal Action Detection Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > detection_models.md -cat ../../configs/skeleton/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Skeleton-based Action Recognition Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > skeleton_models.md +# gather models +mkdir -p model_zoo +cat ../../configs/localization/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Localization Models' | sed 's/](\/docs\/en/](../g' |sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/localization_models.md +cat ../../configs/recognition/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Recognition Models' | sed 's/](\/docs\/en/](../g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/recognition_models.md +cat ../../configs/recognition_audio/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed 's/](\/docs\/en/](../g' | sed "s/getting_started.html##t/getting_started.html#t/g" >> model_zoo/recognition_models.md +cat ../../configs/detection/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Spatio Temporal Action Detection Models' | sed 's/](\/docs\/en/](../g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/detection_models.md +cat ../../configs/skeleton/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Skeleton-based Action Recognition Models' | sed 's/](\/docs\/en/](../g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/skeleton_models.md + +# gather projects +# TODO: generate table of contents for project zoo +cat ../../projects/README.md > projectzoo.md +cat ../../projects/*/README.md >> projectzoo.md + +# gather datasets +cat supported_datasets.md > datasetzoo.md +cat ../../tools/data/*/README.md | sed 's/# Preparing/# /g' | sed 's/#/#&/' >> datasetzoo.md + +sed -i 's/(\/tools\/data\/activitynet\/README.md/(#activitynet/g' datasetzoo.md +sed -i 's/(\/tools\/data\/kinetics\/README.md/(#kinetics-400600700/g' datasetzoo.md +sed -i 's/(\/tools\/data\/mit\/README.md/(#moments-in-time/g' 
datasetzoo.md
+sed -i 's/(\/tools\/data\/mmit\/README.md/(#multi-moments-in-time/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/sthv1\/README.md/(#something-something-v1/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/sthv2\/README.md/(#something-something-v2/g' datasetzoo.md
+sed -i "s/(\/tools\/data\/thumos14\/README.md/(#thumos14/g" datasetzoo.md
+sed -i 's/(\/tools\/data\/ucf101\/README.md/(#ucf-101/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/ucf101_24\/README.md/(#ucf101-24/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/jhmdb\/README.md/(#jhmdb/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/hvu\/README.md/(#hvu/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/hmdb51\/README.md/(#hmdb51/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/jester\/README.md/(#jester/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/ava\/README.md/(#ava/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/gym\/README.md/(#gym/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/omnisource\/README.md/(#omnisource/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/diving48\/README.md/(#diving48/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/skeleton\/README.md/(#skeleton-dataset/g' datasetzoo.md
+
+cat prepare_data.md >> datasetzoo.md
+
+sed -i 's/](\/docs\/en\//](/g' datasetzoo.md
+sed -i 's/](\/docs\/en\//](/g' changelog.md
+
+sed -i 's/](\/docs\/en\//](..\//g' ./get_started/*.md
+sed -i 's/](\/docs\/en\//](..\//g' ./tutorials/*.md
diff --git a/docs/en/notes/projects.md b/docs/en/notes/ecosystem.md
similarity index 98%
rename from docs/en/notes/projects.md
rename to docs/en/notes/ecosystem.md
index f4bc5ac9e6..73b0fd6aaf 100644
--- a/docs/en/notes/projects.md
+++ b/docs/en/notes/ecosystem.md
@@ -1,4 +1,4 @@
-# Projects based on MMAction2
+# Ecosystem Projects based on MMAction2
 
 There are many research works and projects built on MMAction2. We list some of them as examples of how to extend MMAction2 for your own projects.
diff --git a/docs/en/notes/pytorch2.0.md b/docs/en/notes/pytorch2.0.md
new file mode 100644
index 0000000000..d50101490b
--- /dev/null
+++ b/docs/en/notes/pytorch2.0.md
@@ -0,0 +1,21 @@
+# PyTorch 2.0 Compatibility and Benchmark
+
+PyTorch introduced `torch.compile` in its 2.0 release. It compiles your model to speed up training and validation. We provide benchmark results and compatibility notes for typical models in MMAction2. Except for one model (MViT) that fails to compile, the performance of the other models remains consistent before and after compilation.
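+
+The table below compares runs with and without compilation. Compilation requires no model code change; with a recent MMEngine (version 0.7.0 or later, an assumption of this sketch) it can typically be switched on from the command line:
+
+```Bash
+# Enable torch.compile for a training run (sketch; requires PyTorch >= 2.0
+# and an MMEngine version whose Runner understands the `compile` option).
+python tools/train.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \
+    --cfg-options compile=True
+```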
+ +| Config | compiled | Train time / iter (s) | GPU memory (M) | test metric | +| ------------------------------------------------------------------------- | -------- | --------------------- | -------------- | ------------ | +| tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb | False | 0.50 | 42537 | 36.55 | +| tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb | True | 0.61 | 53149 | 36.72 | +| timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb | False | 0.688 | 14263 | 77.69 | +| timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb | True | 0.691 | 13863 | 77.57 | +| stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d | False | 0.0305 | 1184 | 91.69 | +| stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d | True | 0.0298 | 1273 | 91.64 | +| slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint | False | 0.498 | 9581 | 93.6 | +| slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint | True | 0.505 | 11968 | 93.49 | +| slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb | False | 0.17 | 8278 | 20.76 | +| slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb | True | 0.1835 | 12004 | 21.67 | +| swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb | False | 0.323 | 21651 | 78.90 | +| swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb | True | 0.262 | 20905 | 78.70 | +| slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb | False | 0.098 | 5777 | 75.12 | +| slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb | True | 0.0942 | 7095 | 75.15 | +| mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb | Fail | incompatible | incompatible | incompatible | diff --git a/docs/en/stat.py b/docs/en/stat.py index 80263653dc..b07d123fa8 100644 --- a/docs/en/stat.py +++ b/docs/en/stat.py @@ -16,7 +16,7 @@ def anchor(name): # Count algorithms -files = sorted(glob.glob('*_models.md')) +files = sorted(glob.glob('model_zoo/*_models.md')) # files = sorted(glob.glob('docs/*_models.md')) stats = [] @@ -99,76 +99,76 @@ def anchor(name): {msglist} """ -with open('modelzoo.md', 'w') as f: +with open('model_zoo/modelzoo.md', 'w') as f: f.write(modelzoo) -# # Count datasets -# -# files = ['supported_datasets.md'] -# # files = sorted(glob.glob('docs/tasks/*.md')) -# -# datastats = [] -# -# for f in files: -# with open(f, 'r') as content_file: -# content = content_file.read() -# -# # title -# title = content.split('\n')[0].replace('#', '') -# -# # count papers -# papers = set( -# (papertype, titlecase.titlecase(paper.lower().strip())) -# for (papertype, paper) in re.findall( -# r'\s*\n.*?\btitle\s*=\s*{(.*?)}', -# content, re.DOTALL)) -# # paper links -# revcontent = '\n'.join(list(reversed(content.splitlines()))) -# paperlinks = {} -# for _, p in papers: -# print(p) -# q = p.replace('\\', '\\\\').replace('?', '\\?') -# paperlinks[p] = ', '.join( -# (f'[{p.strip()} ->]({splitext(basename(f))[0]}.html#{anchor(p)})' -# for p in re.findall( -# rf'\btitle\s*=\s*{{\s*{q}\s*}}.*?\n## (.*?)\s*[,;]?\s*\n', -# revcontent, re.DOTALL | re.IGNORECASE))) -# print(' ', paperlinks[p]) -# paperlist = '\n'.join( -# sorted(f' - [{t}] {x} ({paperlinks[x]})' for t, x in papers)) -# -# statsmsg = f""" -# ## [{title}]({f}) -# -# * Number of papers: {len(papers)} -# {paperlist} -# -# """ -# -# datastats.append((papers, configs, ckpts, statsmsg)) -# -# alldatapapers = func.reduce(lambda a, b: a.union(b), -# [p for p, _, _, _ in datastats]) -# -# # Summarize -# -# msglist = '\n'.join(x for _, _, _, x in stats) -# datamsglist = '\n'.join(x for _, _, _, x in datastats) -# 
papertypes, papercounts = np.unique([t for t, _ in alldatapapers], -# return_counts=True) -# countstr = '\n'.join( -# [f' - {t}: {c}' for t, c in zip(papertypes, papercounts)]) -# -# modelzoo = f""" -# # Overview -# -# * Number of papers: {len(alldatapapers)} -# {countstr} -# -# For supported action algorithms, see [modelzoo overview](modelzoo.md). -# -# {datamsglist} -# """ -# -# with open('datasets.md', 'w') as f: -# f.write(modelzoo) +# Count datasets + +files = ['datasetzoo.md'] +# files = sorted(glob.glob('docs/tasks/*.md')) + +datastats = [] + +for f in files: + with open(f, 'r') as content_file: + content = content_file.read() + + # title + title = content.split('\n')[0].replace('#', '') + + # count papers + papers = set( + (papertype, titlecase.titlecase(paper.lower().strip())) + for (papertype, paper) in re.findall( + r'\s*\n.*?\btitle\s*=\s*{(.*?)}', + content, re.DOTALL)) + # paper links + revcontent = '\n'.join(list(reversed(content.splitlines()))) + paperlinks = {} + for _, p in papers: + print(p) + q = p.replace('\\', '\\\\').replace('?', '\\?') + paperlinks[p] = ', '.join( + (f'[{p.strip()} ->]({splitext(basename(f))[0]}.html#{anchor(p)})' + for p in re.findall( + rf'\btitle\s*=\s*{{\s*{q}\s*}}.*?\n## (.*?)\s*[,;]?\s*\n', + revcontent, re.DOTALL | re.IGNORECASE))) + print(' ', paperlinks[p]) + paperlist = '\n'.join( + sorted(f' - [{t}] {x} ({paperlinks[x]})' for t, x in papers)) + + statsmsg = f""" +## [{title}]({f}) + +* Number of papers: {len(papers)} +{paperlist} + + """ + + datastats.append((papers, configs, ckpts, statsmsg)) + +alldatapapers = func.reduce(lambda a, b: a.union(b), + [p for p, _, _, _ in datastats]) + +# Summarize + +msglist = '\n'.join(x for _, _, _, x in stats) +datamsglist = '\n'.join(x for _, _, _, x in datastats) +papertypes, papercounts = np.unique([t for t, _ in alldatapapers], + return_counts=True) +countstr = '\n'.join( + [f' - {t}: {c}' for t, c in zip(papertypes, papercounts)]) + +datasetzoo = f""" +# Overview + +* Number of papers: {len(alldatapapers)} +{countstr} + +For supported action algorithms, see [modelzoo overview](modelzoo.md). + +{datamsglist} +""" + +with open('datasetzoo_overview.md', 'w') as f: + f.write(datasetzoo) diff --git a/docs/en/supported_datasets.md b/docs/en/supported_datasets.md new file mode 100644 index 0000000000..42911fc8ff --- /dev/null +++ b/docs/en/supported_datasets.md @@ -0,0 +1,36 @@ +# Supported Datasets + +- Action Recognition + + - [UCF101](/tools/data/ucf101/README.md) \[ [Homepage](https://www.crcv.ucf.edu/research/data-sets/ucf101/) \]. + - [HMDB51](/tools/data/hmdb51/README.md) \[ [Homepage](https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/) \]. 
+ - [Kinetics-\[400/600/700\]](/tools/data/kinetics/README.md) \[ [Homepage](https://deepmind.com/research/open-source/kinetics) \] + - [Something-Something V1](/tools/data/sthv1/README.md) \[ [Homepage](https://20bn.com/datasets/something-something/v1) \] + - [Something-Something V2](/tools/data/sthv2/README.md) \[ [Homepage](https://20bn.com/datasets/something-something) \] + - [Moments in Time](/tools/data/mit/README.md) \[ [Homepage](http://moments.csail.mit.edu/) \] + - [Multi-Moments in Time](/tools/data/mmit/README.md) \[ [Homepage](http://moments.csail.mit.edu/challenge_iccv_2019.html) \] + - [HVU](/tools/data/hvu/README.md) \[ [Homepage](https://github.com/holistic-video-understanding/HVU-Dataset) \] + - [Jester](/tools/data/jester/README.md) \[ [Homepage](https://developer.qualcomm.com/software/ai-datasets/jester) \] + - [GYM](/tools/data/gym/README.md) \[ [Homepage](https://sdolivia.github.io/FineGym/) \] + - [ActivityNet](/tools/data/activitynet/README.md) \[ [Homepage](http://activity-net.org/) \] + - [Diving48](/tools/data/diving48/README.md) \[ [Homepage](http://www.svcl.ucsd.edu/projects/resound/dataset.html) \] + - [OmniSource](/tools/data/omnisource/README.md) \[ [Homepage](https://kennymckormick.github.io/omnisource/) \] + +- Temporal Action Detection + + - [ActivityNet](/tools/data/activitynet/README.md) \[ [Homepage](http://activity-net.org/) \] + - [THUMOS14](/tools/data/thumos14/README.md) \[ [Homepage](https://www.crcv.ucf.edu/THUMOS14/download.html) \] + +- Spatial Temporal Action Detection + + - [AVA](/tools/data/ava/README.md) \[ [Homepage](https://research.google.com/ava/index.html) \] + - [UCF101-24](/tools/data/ucf101_24/README.md) \[ [Homepage](http://www.thumos.info/download.html) \] + - [JHMDB](/tools/data/jhmdb/README.md) \[ [Homepage](http://jhmdb.is.tue.mpg.de/) \] + +- Skeleton-based Action Recognition + + - [PoseC3D Skeleton Dataset](/tools/data/skeleton/README.md) \[ [Homepage](https://kennymckormick.github.io/posec3d/) \] + +The supported datasets are listed above. +We provide shell scripts for data preparation under the path `$MMACTION2/tools/data/`. +Below is the detailed tutorials of data deployment for each dataset. diff --git a/docs/en/user_guides/useful_tools.md b/docs/en/useful_tools.md similarity index 98% rename from docs/en/user_guides/useful_tools.md rename to docs/en/useful_tools.md index 2fe3b1977a..943303b82c 100644 --- a/docs/en/user_guides/useful_tools.md +++ b/docs/en/useful_tools.md @@ -1,4 +1,4 @@ -# Other Useful Tools +# Useful Tools Apart from training/testing scripts, We provide lots of useful tools under the `tools/` directory. @@ -6,7 +6,7 @@ Apart from training/testing scripts, We provide lots of useful tools under the ` -- [Other Useful Tools](#other-useful-tools) +- [Useful Tools](#useful-tools) - [Useful Tools Link](#useful-tools-link) - [Model Conversion](#model-conversion) - [Prepare a model for publishing](#prepare-a-model-for-publishing) diff --git a/docs/en/user_guides/2_data_prepare.md b/docs/en/user_guides/2_data_prepare.md deleted file mode 100644 index e3bcc9f0e0..0000000000 --- a/docs/en/user_guides/2_data_prepare.md +++ /dev/null @@ -1,152 +0,0 @@ -# Tutorial 2: Prepare Datasets - -We provide some tips for MMAction2 data preparation in this file. 
- - - -- [Notes on Video Data Format](#notes-on-video-data-format) -- [Getting Data](#getting-data) - - [Prepare videos](#prepare-videos) - - [Extract frames](#extract-frames) - - [Alternative to denseflow](#alternative-to-denseflow) - - [Generate file list](#generate-file-list) - - [Prepare audio](#prepare-audio) - - - -## Notes on Video Data Format - -MMAction2 supports two types of data format: raw frames and video. The former is widely used in previous projects such as [TSN](https://github.com/yjxiong/temporal-segment-networks). -This is fast when SSD is available but fails to scale to the fast-growing datasets. -(For example, the newest edition of [Kinetics](https://www.deepmind.com/open-source/kinetics) has 650K videos and the total frames will take up several TBs.) -The latter saves much space but has to do the computation intensive video decoding at execution time. -To make video decoding faster, we support several efficient video loading libraries, such as [decord](https://github.com/zhreshold/decord), [PyAV](https://github.com/PyAV-Org/PyAV), etc. - -## Getting Data - -The following guide is helpful when you want to experiment with custom dataset. -Similar to the datasets stated above, it is recommended organizing in `$MMACTION2/data/$DATASET`. - -### Prepare videos - -Please refer to the official website and/or the official script to prepare the videos. -Note that the videos should be arranged in either - -- A two-level directory organized by `${CLASS_NAME}/${VIDEO_ID}`, which is recommended to be used for action recognition datasets (such as UCF101 and Kinetics) - -- A single-level directory, which is recommended to be used for action detection datasets or those with multiple annotations per video (such as THUMOS14). - -### Extract frames - -To extract both frames and optical flow, you can use the tool [denseflow](https://github.com/open-mmlab/denseflow) we wrote. -Since different frame extraction tools produce different number of frames, -it is beneficial to use the same tool to do both frame extraction and the flow computation, to avoid mismatching of frame counts. - -```shell -python build_rawframes.py ${SRC_FOLDER} ${OUT_FOLDER} [--task ${TASK}] [--level ${LEVEL}] \ - [--num-worker ${NUM_WORKER}] [--flow-type ${FLOW_TYPE}] [--out-format ${OUT_FORMAT}] \ - [--ext ${EXT}] [--new-width ${NEW_WIDTH}] [--new-height ${NEW_HEIGHT}] [--new-short ${NEW_SHORT}] \ - [--resume] [--use-opencv] [--mixed-ext] -``` - -- `SRC_FOLDER`: Folder of the original video. -- `OUT_FOLDER`: Root folder where the extracted frames and optical flow store. -- `TASK`: Extraction task indicating which kind of frames to extract. Allowed choices are `rgb`, `flow`, `both`. -- `LEVEL`: Directory level. 1 for the single-level directory or 2 for the two-level directory. -- `NUM_WORKER`: Number of workers to build rawframes. -- `FLOW_TYPE`: Flow type to extract, e.g., `None`, `tvl1`, `warp_tvl1`, `farn`, `brox`. -- `OUT_FORMAT`: Output format for extracted frames, e.g., `jpg`, `h5`, `png`. -- `EXT`: Video file extension, e.g., `avi`, `mp4`. -- `NEW_WIDTH`: Resized image width of output. -- `NEW_HEIGHT`: Resized image height of output. -- `NEW_SHORT`: Resized image short side length keeping ratio. -- `--resume`: Whether to resume optical flow extraction instead of overwriting. -- `--use-opencv`: Whether to use OpenCV to extract rgb frames. -- `--mixed-ext`: Indicate whether process video files with mixed extensions. - -The recommended practice is - -1. set `$OUT_FOLDER` to be a folder located in SSD. -2. 
symlink the link `$OUT_FOLDER` to `$MMACTION2/data/$DATASET/rawframes`. -3. set `new-short` instead of using `new-width` and `new-height`. - -```shell -ln -s ${YOUR_FOLDER} $MMACTION2/data/$DATASET/rawframes -``` - -#### Alternative to denseflow - -In case your device doesn't fulfill the installation requirement of [denseflow](https://github.com/open-mmlab/denseflow)(like Nvidia driver version), or you just want to see some quick demos about flow extraction, we provide a python script `tools/misc/flow_extraction.py` as an alternative to denseflow. You can use it for rgb frames and optical flow extraction from one or several videos. Note that the speed of the script is much slower than denseflow, since it runs optical flow algorithms on CPU. - -```shell -python tools/misc/flow_extraction.py --input ${INPUT} [--prefix ${PREFIX}] [--dest ${DEST}] [--rgb-tmpl ${RGB_TMPL}] \ - [--flow-tmpl ${FLOW_TMPL}] [--start-idx ${START_IDX}] [--method ${METHOD}] [--bound ${BOUND}] [--save-rgb] -``` - -- `INPUT`: Videos for frame extraction, can be single video or a video list, the video list should be a txt file and just consists of filenames without directories. -- `PREFIX`: The prefix of input videos, used when input is a video list. -- `DEST`: The destination to save extracted frames. -- `RGB_TMPL`: The template filename of rgb frames. -- `FLOW_TMPL`: The template filename of flow frames. -- `START_IDX`: The start index of extracted frames. -- `METHOD`: The method used to generate flow. -- `BOUND`: The maximum of optical flow. -- `SAVE_RGB`: Also save extracted rgb frames. - -### Generate file list - -We provide a convenient script to generate annotation file list. You can use the following command to generate file lists given extracted frames / downloaded videos. - -```shell -cd $MMACTION2 -python tools/data/build_file_list.py ${DATASET} ${SRC_FOLDER} [--rgb-prefix ${RGB_PREFIX}] \ - [--flow-x-prefix ${FLOW_X_PREFIX}] [--flow-y-prefix ${FLOW_Y_PREFIX}] [--num-split ${NUM_SPLIT}] \ - [--subset ${SUBSET}] [--level ${LEVEL}] [--format ${FORMAT}] [--out-root-path ${OUT_ROOT_PATH}] \ - [--seed ${SEED}] [--shuffle] -``` - -- `DATASET`: Dataset to be prepared, e.g., `ucf101`, `kinetics400`, `thumos14`, `sthv1`, `sthv2`, etc. -- `SRC_FOLDER`: Folder of the corresponding data format: - - "$MMACTION2/data/$DATASET/rawframes" if `--format rawframes`. - - "$MMACTION2/data/$DATASET/videos" if `--format videos`. -- `RGB_PREFIX`: Name prefix of rgb frames. -- `FLOW_X_PREFIX`: Name prefix of x flow frames. -- `FLOW_Y_PREFIX`: Name prefix of y flow frames. -- `NUM_SPLIT`: Number of split to file list. -- `SUBSET`: Subset to generate file list. Allowed choice are `train`, `val`, `test`. -- `LEVEL`: Directory level. 1 for the single-level directory or 2 for the two-level directory. -- `FORMAT`: Source data format to generate file list. Allowed choices are `rawframes`, `videos`. -- `OUT_ROOT_PATH`: Root path for output -- `SEED`: Random seed. -- `--shuffle`: Whether to shuffle the file list. - -### Prepare audio - -We also provide a simple script for audio waveform extraction and mel-spectrogram generation. - -```shell -cd $MMACTION2 -python tools/data/extract_audio.py ${ROOT} ${DST_ROOT} [--ext ${EXT}] [--num-workers ${N_WORKERS}] \ - [--level ${LEVEL}] -``` - -- `ROOT`: The root directory of the videos. -- `DST_ROOT`: The destination root directory of the audios. -- `EXT`: Extension of the video files. e.g., `mp4`. -- `N_WORKERS`: Number of processes to be used. 
- -After extracting audios, you are free to decode and generate the spectrogram on-the-fly such as [this](/configs/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio.py). As for the annotations, you can directly use those of the rawframes as long as you keep the relative position of audio files same as the rawframes directory. However, extracting spectrogram on-the-fly is slow and bad for prototype iteration. Therefore, we also provide a script (and many useful tools to play with) for you to generation spectrogram off-line. - -```shell -cd $MMACTION2 -python tools/data/build_audio_features.py ${AUDIO_HOME_PATH} ${SPECTROGRAM_SAVE_PATH} [--level ${LEVEL}] \ - [--ext $EXT] [--num-workers $N_WORKERS] [--part $PART] -``` - -- `AUDIO_HOME_PATH`: The root directory of the audio files. -- `SPECTROGRAM_SAVE_PATH`: The destination root directory of the audio features. -- `EXT`: Extension of the audio files. e.g., `m4a`. -- `N_WORKERS`: Number of processes to be used. -- `PART`: Determines how many parts to be splited and which part to run. e.g., `2/5` means splitting all files into 5-fold and executing the 2nd part. This is useful if you have several machines. - -The annotations for audio spectrogram features are identical to those of rawframes. You can simply make a copy of `dataset_[train/val]_list_rawframes.txt` and rename it as `dataset_[train/val]_list_audio_feature.txt` diff --git a/docs/en/user_guides/3_inference.md b/docs/en/user_guides/Inference.md similarity index 95% rename from docs/en/user_guides/3_inference.md rename to docs/en/user_guides/Inference.md index 11b07f0519..20e14b4ee0 100644 --- a/docs/en/user_guides/3_inference.md +++ b/docs/en/user_guides/Inference.md @@ -1,9 +1,9 @@ -# Tutorial 3: Inference with existing models +# Inference with existing models MMAction2 provides pre-trained models for video understanding in [Model Zoo](../modelzoo.md). This note will show **how to use existing models to inference on given video**. -As for how to test existing models on standard datasets, please see this [guide](./4_train_test.md#test) +As for how to test existing models on standard datasets, please see this [guide](./train_test.md#test) ## Inference on a given video diff --git a/docs/en/user_guides/1_config.md b/docs/en/user_guides/config.md similarity index 98% rename from docs/en/user_guides/1_config.md rename to docs/en/user_guides/config.md index 308ec70f17..d847ae9557 100644 --- a/docs/en/user_guides/1_config.md +++ b/docs/en/user_guides/config.md @@ -1,4 +1,4 @@ -# Tutorial 1: Learn about Configs +# Learn about Configs We use python files as configs, incorporate modular and inheritance design into our config system, which is convenient to conduct various experiments. You can find all the provided configs under `$MMAction2/configs`. 
If you wish to inspect the config file, @@ -6,12 +6,13 @@ you may run `python tools/analysis_tools/print_config.py /PATH/TO/CONFIG` to see -- [Modify config through script arguments](#modify-config-through-script-arguments) -- [Config File Structure](#config-file-structure) -- [Config File Naming Convention](#config-file-naming-convention) - - [Config System for Action Recognition](#config-system-for-action-recognition) - - [Config System for Spatio-Temporal Action Detection](#config-system-for-spatio-temporal-action-detection) - - [Config System for Action localization](#config-system-for-action-localization) +- [Learn about Configs](#learn-about-configs) + - [Modify config through script arguments](#modify-config-through-script-arguments) + - [Config File Structure](#config-file-structure) + - [Config File Naming Convention](#config-file-naming-convention) + - [Config System for Action Recognition](#config-system-for-action-recognition) + - [Config System for Spatio-Temporal Action Detection](#config-system-for-spatio-temporal-action-detection) + - [Config System for Action localization](#config-system-for-action-localization) diff --git a/docs/en/user_guides/prepare_dataset.md b/docs/en/user_guides/prepare_dataset.md new file mode 100644 index 0000000000..cd4225aaa0 --- /dev/null +++ b/docs/en/user_guides/prepare_dataset.md @@ -0,0 +1,263 @@ +# Prepare Dataset + +MMAction2 supports many existing datasets. In this chapter, we will lead you to prepare datasets for MMAction2. + +- [Prepare Dataset](#prepare-dataset) + - [Notes on Video Data Format](#notes-on-video-data-format) + - [Use built-in datasets](#use-built-in-datasets) + - [Use a custom dataset](#use-a-custom-dataset) + - [Action Recognition](#action-recognition) + - [Skeleton-based Action Recognition](#skeleton-based-action-recognition) + - [Spatio-temporal Action Detection](#spatio-temporal-action-detection) + - [Temporal Action Localization](#temporal-action-localization) + - [Use mixed datasets for training](#use-mixed-datasets-for-training) + - [Repeat dataset](#repeat-dataset) + - [Browse dataset](#browse-dataset) + +## Notes on Video Data Format + +MMAction2 supports two types of data formats: raw frames and video. The former is widely used in previous projects such as [TSN](https://github.com/yjxiong/temporal-segment-networks). +This is fast when SSD is available but fails to scale to the fast-growing datasets. +(For example, the newest edition of [Kinetics](https://www.deepmind.com/open-source/kinetics) has 650K videos and the total frames will take up several TBs.) +The latter saves much space but has to do the computation intensive video decoding at execution time. +To make video decoding faster, we support several efficient video loading libraries, such as [decord](https://github.com/zhreshold/decord), [PyAV](https://github.com/PyAV-Org/PyAV), etc. + +## Use built-in datasets + +MMAction2 already supports many datasets, we provide shell scripts for data preparation under the path `$MMACTION2/tools/data/`, please refer to [supported datasets](../supported_datasets.md) for details to prepare specific datasets. 
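+
+For example, preparing UCF101 usually boils down to running the scripts shipped under `tools/data/ucf101/`. A minimal sketch, assuming the script names keep their current form; check the dataset's README for the authoritative steps:
+
+```shell
+# Download annotations and videos, then generate the file lists
+# (sketch; see tools/data/ucf101/README.md for details).
+bash tools/data/ucf101/download_annotations.sh
+bash tools/data/ucf101/download_videos.sh
+bash tools/data/ucf101/generate_videos_filelist.sh
+```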
+ +## Use a custom dataset + +The simplest way is to convert your dataset to existing dataset formats: + +- `RawFrameDataset` and `VideoDataset` for [Action Recognition](#action-recognition) +- `PoseDataset` for [Skeleton-based Action Recognition](#skeleton-based-action-recognition) +- `AVADataset` for [Spatio-temporal Action Detection](#spatio-temporal-action-detection) +- `ActivityNetDataset` for [Temporal Action Localization](#temporal-action-localization) + +After the data pre-processing, the users need to further modify the config files to use the dataset. +Here is an example of using a custom dataset in rawframe format. + +In `configs/task/method/my_custom_config.py`: + +```python +... +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'path/to/your/root' +data_root_val = 'path/to/your/root_val' +ann_file_train = 'data/custom/custom_train_list.txt' +ann_file_val = 'data/custom/custom_val_list.txt' +ann_file_test = 'data/custom/custom_val_list.txt' +... +data = dict( + videos_per_gpu=32, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + ...), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + ...), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + ...)) +... +``` + +### Action Recognition + +There are two kinds of annotation files for action recognition. + +- rawframe annotaiton for `RawFrameDataset` + + The annotation of a rawframe dataset is a text file with multiple lines, + and each line indicates `frame_directory` (relative path) of a video, + `total_frames` of a video and the `label` of a video, which are split by a whitespace. + + Here is an example. + + ``` + some/directory-1 163 1 + some/directory-2 122 1 + some/directory-3 258 2 + some/directory-4 234 2 + some/directory-5 295 3 + some/directory-6 121 3 + ``` + +- video annotation for `VideoDataset` + + The annotation of a video dataset is a text file with multiple lines, + and each line indicates a sample video with the `filepath` (relative path) and `label`, + which are split by a whitespace. + + Here is an example. + + ``` + some/path/000.mp4 1 + some/path/001.mp4 1 + some/path/002.mp4 2 + some/path/003.mp4 2 + some/path/004.mp4 3 + some/path/005.mp4 3 + ``` + +### Skeleton-based Action Recognition + +The task recognizes the action class based on the skeleton sequence (time sequence of keypoints). We provide some methods to build your custom skeleton dataset. + +- Build from RGB video data + + You need to extract keypoints data from video and convert it to a supported format, we provide a [tutorial](/configs/skeleton/posec3d/custom_dataset_training.md) with detailed instructions. + +- Build from existing keypoint data + + Assuming that you already have keypoint data in coco formats, you can gather them into a pickle file. + + Each pickle file corresponds to an action recognition dataset. The content of a pickle file is a dictionary with two fields: `split` and `annotations` + + 1. Split: The value of the `split` field is a dictionary: the keys are the split names, while the values are lists of video identifiers that belong to the specific clip. + 2. Annotations: The value of the `annotations` field is a list of skeleton annotations, each skeleton annotation is a dictionary, containing the following fields: + - `frame_dir` (str): The identifier of the corresponding video. + - `total_frames` (int): The number of frames in this video. + - `img_shape` (tuple\[int\]): The shape of a video frame, a tuple with two elements, in the format of `(height, width)`. 
Only required for 2D skeletons. + - `original_shape` (tuple\[int\]): Same as `img_shape`. + - `label` (int): The action label. + - `keypoint` (np.ndarray, with shape `[M x T x V x C]`): The keypoint annotation. + - M: number of persons; + - T: number of frames (same as `total_frames`); + - V: number of keypoints (25 for NTURGB+D 3D skeleton, 17 for CoCo, 18 for OpenPose, etc. ); + - C: number of dimensions for keypoint coordinates (C=2 for 2D keypoint, C=3 for 3D keypoint). + - `keypoint_score` (np.ndarray, with shape `[M x T x V]`): The confidence score of keypoints. Only required for 2D skeletons. + + Here is an example: + + ``` + { + "split": + { + 'xsub_train': + ['S001C001P001R001A001', ...], + 'xsub_val': + ['S001C001P003R001A001', ...], + ... + } + + "annotations: + [ + { + { + 'frame_dir': 'S001C001P001R001A001', + 'label': 0, + 'img_shape': (1080, 1920), + 'original_shape': (1080, 1920), + 'total_frames': 103, + 'keypoint': array([[[[1032. , 334.8], ...]]]) + 'keypoint_score': array([[[0.934 , 0.9766, ...]]]) + }, + { + 'frame_dir': 'S001C001P003R001A001', + ... + }, + ... + + } + ] + } + ``` + + Support other keypoint formats needs further modification, please refer to [customize dataset](../advanced_guides/customize_dataset.md). + +### Spatio-temporal Action Detection + +MMAction2 supports the task based on `AVADataset`. The annotation contains groundtruth bbox and proposal bbox. + +- groundtruth bbox + groundtruth bbox is a csv file with multiple lines, and each line is a detection sample of one frame, with following formats: + + video_identifier, time_stamp, lt_x, lt_y, rb_x, rb_y, label, entity_id + each field means: + `video_identifier` : The identifier of the corresponding video + `time_stamp`: The time stamp of current frame + `lt_x`: The normalized x-coordinate of the left top point of bounding box + `lt_y`: The normalized y-coordinate of the left top point of bounding box + `rb_y`: The normalized x-coordinate of the right bottom point of bounding box + `rb_y`: The normalized y-coordinate of the right bottom point of bounding box + `label`: The action label + `entity_id`: a unique integer allowing this box to be linked to other boxes depicting the same person in adjacent frames of this video + + Here is an example. + + ``` + _-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,12,0 + _-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,74,0 + ... + ``` + +- proposal bbox + proposal bbox is a pickle file generated by a person detector, and usually needs to be fine-tuned on the target dataset. The pickle file contains a dict with below data structure: + + `{'video_identifier,time_stamp': bbox_info}` + + video_identifier (str): The identifier of the corresponding video + time_stamp (int): The time stamp of current frame + bbox_info (np.ndarray, with shape `[n, 5]`): Detected bbox, \ \ \ \ \. x1, x2, y1, y2 are normalized with respect to frame size, which are between 0.0-1.0. + +### Temporal Action Localization + +We support Temporal Action Localization based on `ActivityNetDataset`. The annotation of ActivityNet dataset is a json file. Each key is a video name and the corresponding value is the meta data and annotation for the video. + +Here is an example. + +``` +{ + "video1": { + "duration_second": 211.53, + "duration_frame": 6337, + "annotations": [ + { + "segment": [ + 30.025882995319815, + 205.2318595943838 + ], + "label": "Rock climbing" + } + ], + "feature_frame": 6336, + "fps": 30.0, + "rfps": 29.9579255898 + }, + "video2": {... + } + ... 
+} +``` + +## Use mixed datasets for training + +MMAction2 also supports to mix dataset for training. Currently it supports to repeat dataset. + +### Repeat dataset + +We use `RepeatDataset` as wrapper to repeat the dataset. For example, suppose the original dataset as `Dataset_A`, +to repeat it, the config looks like the following + +```python +dataset_A_train = dict( + type='RepeatDataset', + times=N, + dataset=dict( # This is the original config of Dataset_A + type='Dataset_A', + ... + pipeline=train_pipeline + ) + ) +``` + +## Browse dataset + +coming soon... diff --git a/docs/en/user_guides/4_train_test.md b/docs/en/user_guides/train_test.md similarity index 99% rename from docs/en/user_guides/4_train_test.md rename to docs/en/user_guides/train_test.md index a67448fde3..653fccdc34 100644 --- a/docs/en/user_guides/4_train_test.md +++ b/docs/en/user_guides/train_test.md @@ -1,4 +1,4 @@ -# Tutorial 4: Training and Test +# Training and Test ## Training diff --git a/docs/en/user_guides/visualization.md b/docs/en/user_guides/visualization.md deleted file mode 100644 index 2d4518bcdb..0000000000 --- a/docs/en/user_guides/visualization.md +++ /dev/null @@ -1,20 +0,0 @@ -# Visualization Tools - -## Visualize dataset - -You can use `tools/analysis_tools/browse_dataset.py` to visualize video datasets: - -```bash -python tools/analysis_tools/browse_dataset.py ${CONFIG_FILE} [ARGS] -``` - -| ARGS | Description | -| ------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `CONFIG_FILE` | The path to the config file. | -| `--output-dir OUTPUT_DIR` | If there is no display interface, you can save the visualization results to `OUTPUT_DIR`. Defaults to None | -| `--show-frames` | Display the frames of the video if you have the display interface. Defaults to False. | -| `--phase PHASE` | Phase of the dataset to visualize, accept `train`, `test` and `val`. Defaults to `train`. | -| `--show-number SHOW_NUMBER` | Number of images selected to visualize, must bigger than 0. Jf the number is bigger than length of dataset, show all the images in dataset. Defaults to "sys.maxsize", show all images in dataset | -| `--show-interval SHOW_INTERVAL` | The interval of show (s). Defaults to 2. | -| `--mode MODE` | Display mode: display original videos or transformed videos. `original` means show videos load from disk while `transformed` means to show videos after transformed. Defaults to `transformed`. | -| `--cfg-options CFG_OPTIONS` | Override some settings in the used config, the key-value pair in xxx=yyy format will be merged into the config file. If the value to be overwritten is a list, it should be of the form of either `key="[a,b]"` or `key=a,b`. The argument also allows nested list/tuple values, e.g. `key="[(a,b),(c,d)]"`. Note that the quotation marks are necessary and that no white space is allowed. | diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index 59e3e49b53..2b69d6d2af 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -16,7 +16,7 @@ You can switch between Chinese and English documents in the lower-left corner of user_guides/1_config.md user_guides/2_data_prepare.md user_guides/3_inference.md - user_guides/4_train_test.md + user_guides/train_test.md .. 
toctree:: :maxdepth: 1 diff --git a/docs/zh_cn/user_guides/3_inference.md b/docs/zh_cn/user_guides/3_inference.md index 99433263df..14374ef432 100644 --- a/docs/zh_cn/user_guides/3_inference.md +++ b/docs/zh_cn/user_guides/3_inference.md @@ -3,7 +3,7 @@ MMAction2 在 [Model Zoo](../modelzoo.md) 中提供预训练的视频理解模型。 本教程将展示**如何使用现有模型对给定视频进行推理**。 -至于如何在标准数据集上测试现有模型,请参阅这该[指南](./4_train_test.md#test) +至于如何在标准数据集上测试现有模型,请参阅这该[指南](./train_test.md#test) ## 给定视频的推理 diff --git a/src/pytorch-sphinx-theme b/src/pytorch-sphinx-theme new file mode 160000 index 0000000000..6f42dcf38c --- /dev/null +++ b/src/pytorch-sphinx-theme @@ -0,0 +1 @@ +Subproject commit 6f42dcf38c529653bdf3347f551cb037a1a0f1cf diff --git a/tools/visualizations/browse_dataset.py b/tools/visualizations/browse_dataset.py index e6cf9b82c4..6fb720521e 100644 --- a/tools/visualizations/browse_dataset.py +++ b/tools/visualizations/browse_dataset.py @@ -21,13 +21,9 @@ def parse_args(): parser = argparse.ArgumentParser(description='Browse a dataset') parser.add_argument('config', help='train config file path') - parser.add_argument('--label', default=None, type=str, help='label file') parser.add_argument( - '--output-dir', - '-o', - default=None, - type=str, - help='If there is no display interface, you can save it.') + 'output_dir', default=None, type=str, help='output directory') + parser.add_argument('--label', default=None, type=str, help='label file') parser.add_argument( '--phase', '-p', diff --git a/tools/visualizations/vis_scheduler.py b/tools/visualizations/vis_scheduler.py index 6e1b744862..17daa34e6b 100644 --- a/tools/visualizations/vis_scheduler.py +++ b/tools/visualizations/vis_scheduler.py @@ -16,58 +16,7 @@ from mmengine.runner import Runner from mmengine.visualization import Visualizer from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn - - -class SimpleModel(BaseModel): - """simple model that do nothing in train_step.""" - - def __init__(self): - super(SimpleModel, self).__init__() - self.data_preprocessor = nn.Identity() - self.conv = nn.Conv2d(1, 1, 1) - - def forward(self, inputs, data_samples, mode='tensor'): - pass - - def train_step(self, data, optim_wrapper): - pass - - -class ParamRecordHook(Hook): - - def __init__(self, by_epoch): - super().__init__() - self.by_epoch = by_epoch - self.lr_list = [] - self.momentum_list = [] - self.task_id = 0 - self.progress = Progress(BarColumn(), MofNCompleteColumn(), - TextColumn('{task.description}')) - - def before_train(self, runner): - if self.by_epoch: - total = runner.train_loop.max_epochs - self.task_id = self.progress.add_task( - 'epochs', start=True, total=total) - else: - total = runner.train_loop.max_iters - self.task_id = self.progress.add_task( - 'iters', start=True, total=total) - self.progress.start() - - def after_train_epoch(self, runner): - if self.by_epoch: - self.progress.update(self.task_id, advance=1) - - def after_train_iter(self, runner, batch_idx, data_batch, outputs): - if not self.by_epoch: - self.progress.update(self.task_id, advance=1) - self.lr_list.append(runner.optim_wrapper.get_lr()['lr'][0]) - self.momentum_list.append( - runner.optim_wrapper.get_momentum()['momentum'][0]) - - def after_train(self, runner): - self.progress.stop() +from torch.utils.data import DataLoader def parse_args(): @@ -130,6 +79,58 @@ def parse_args(): return args +class SimpleModel(BaseModel): + """simple model that do nothing in train_step.""" + + def __init__(self): + super(SimpleModel, self).__init__() + self.data_preprocessor = nn.Identity() + 
self.conv = nn.Conv2d(1, 1, 1) + + def forward(self, inputs, data_samples, mode='tensor'): + pass + + def train_step(self, data, optim_wrapper): + pass + + +class ParamRecordHook(Hook): + + def __init__(self, by_epoch): + super().__init__() + self.by_epoch = by_epoch + self.lr_list = [] + self.momentum_list = [] + self.task_id = 0 + self.progress = Progress(BarColumn(), MofNCompleteColumn(), + TextColumn('{task.description}')) + + def before_train(self, runner): + if self.by_epoch: + total = runner.train_loop.max_epochs + self.task_id = self.progress.add_task( + 'epochs', start=True, total=total) + else: + total = runner.train_loop.max_iters + self.task_id = self.progress.add_task( + 'iters', start=True, total=total) + self.progress.start() + + def after_train_epoch(self, runner): + if self.by_epoch: + self.progress.update(self.task_id, advance=1) + + def after_train_iter(self, runner, batch_idx, data_batch, outputs): + if not self.by_epoch: + self.progress.update(self.task_id, advance=1) + self.lr_list.append(runner.optim_wrapper.get_lr()['lr'][0]) + self.momentum_list.append( + runner.optim_wrapper.get_momentum()['momentum'][0]) + + def after_train(self, runner): + self.progress.stop() + + def plot_curve(lr_list, args, param_name, iters_per_epoch, by_epoch=True): """Plot learning rate vs iter graph.""" try: @@ -186,6 +187,7 @@ def simulate_train(data_loader, cfg, by_epoch): param_scheduler=cfg.param_scheduler, default_scope=cfg.default_scope, default_hooks=default_hooks, + auto_scale_lr=cfg.get('auto_scale_lr'), visualizer=MagicMock(spec=Visualizer), custom_hooks=cfg.get('custom_hooks', None)) @@ -231,14 +233,13 @@ def main(): from mmaction.registry import DATASETS dataset_size = len(DATASETS.build(cfg.train_dataloader.dataset)) print(f'dataset is {dataset_size}') - # dataset_size = len(build_dataset(cfg.train_dataloader.dataset)) else: dataset_size = args.dataset_size or batch_size - class FakeDataloader(list): - dataset = MagicMock(metainfo=None) - - data_loader = FakeDataloader(range(dataset_size // batch_size)) + data_loader = DataLoader(range(dataset_size), batch_size) + assert len(data_loader) > 0, \ + 'Please decrease batchsize to make sure that ' \ + 'a epoch at least have one iteration!' 
dataset_info = ( f'\nDataset infos:' f'\n - Dataset size: {dataset_size}' From 1d261c93b94bd49c00643b57bead5431853a0074 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 6 Apr 2023 18:41:29 +0800 Subject: [PATCH 33/36] [Feat] support training uniformer (#2221) --- configs/recognition/uniformerv2/README.md | 66 +- .../k710_channel_map/label_map_k710.txt | 710 ++++++++++++++++++ .../k710_channel_map/map_k400.json | 1 + .../k710_channel_map/map_k600.json | 1 + .../k710_channel_map/map_k700.json | 1 + configs/recognition/uniformerv2/metafile.yml | 97 ++- ...etics710-kinetics-k400-pre_u8_mitv1-rgb.py | 110 ++- ...clip-kinetics710-pre_u8_kinetics400-rgb.py | 122 ++- ...clip-kinetics710-pre_u8_kinetics600-rgb.py | 122 ++- ...clip-kinetics710-pre_u8_kinetics700-rgb.py | 122 ++- ...base-p16-res224_clip_u8_kinetics400-rgb.py | 163 ++++ ...base-p16-res224_clip_u8_kinetics700-rgb.py | 163 ++++ mmaction/models/backbones/uniformer.py | 8 +- mmaction/models/backbones/uniformerv2.py | 35 +- mmaction/models/heads/__init__.py | 3 +- mmaction/models/heads/uniformer_head.py | 98 +++ tests/models/backbones/test_uniformerv2.py | 2 + tests/models/utils/test_gradcam.py | 4 +- tests/utils/test_misc.py | 4 + tests/visualization/test_action_visualizer.py | 4 + tests/visualization/test_video_backend.py | 4 + 21 files changed, 1727 insertions(+), 113 deletions(-) create mode 100644 configs/recognition/uniformerv2/k710_channel_map/label_map_k710.txt create mode 100644 configs/recognition/uniformerv2/k710_channel_map/map_k400.json create mode 100644 configs/recognition/uniformerv2/k710_channel_map/map_k600.json create mode 100644 configs/recognition/uniformerv2/k710_channel_map/map_k700.json create mode 100644 configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics400-rgb.py create mode 100644 configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics700-rgb.py create mode 100644 mmaction/models/heads/uniformer_head.py diff --git a/configs/recognition/uniformerv2/README.md b/configs/recognition/uniformerv2/README.md index 73855f13f0..d6e57c7bf9 100644 --- a/configs/recognition/uniformerv2/README.md +++ b/configs/recognition/uniformerv2/README.md @@ -20,51 +20,53 @@ Learning discriminative spatiotemporal representation is the key problem of vide ### Kinetics-400 -| uniform sampling | resolution | backbone | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | -| :--------------: | :------------: | :------------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :-----------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| 8 | short-side 320 | UniFormerV2-B/16 | 85.8 | 97.1 | 85.6 | 97.0 | 85.8 | 97.1 | 4 clips x 3 crop | 0.1T | 115M | 
[config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-203d6aac.pth) | -| 8 | short-side 320 | UniFormerV2-L/14 | 88.7 | 98.1 | 88.8 | 98.1 | 88.7 | 98.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-972ea063.pth) | -| 16 | short-side 320 | UniFormerV2-L/14 | 89.0 | 98.2 | 89.1 | 98.2 | 89.0 | 98.2 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb_20221219-6dc86d05.pth) | -| 32 | short-side 320 | UniFormerV2-L/14 | 89.3 | 98.2 | 89.3 | 98.2 | 89.4 | 98.2 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-56a46f64.pth) | -| 32 | short-side 320 | UniFormerV2-L/14@336 | 89.5 | 98.4 | 89.7 | 98.3 | 89.5 | 98.4 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-1dd7650f.pth) | +| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :--------------: | :------------: | :--------------------: | :--------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | short-side 320 | UniFormerV2-B/16 | clip | 84.3 | 96.4 | 84.4 | 96.3 | - | - | 4 clips x 3 crop | 0.1T | 115M | 
[config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb_20230313-e29fc968.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.log) | +| 8 | short-side 320 | UniFormerV2-B/16 | clip-kinetics710 | 85.8 | 97.1 | 85.6 | 97.0 | - | - | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb_20230313-75be0806.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.log) | +| 8 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 88.7 | 98.1 | 88.8 | 98.1 | 88.7 | 98.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-972ea063.pth) | - | +| 16 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 89.0 | 98.2 | 89.1 | 98.2 | 89.0 | 98.2 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb_20221219-6dc86d05.pth) | - | +| 32 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 89.3 | 98.2 | 89.3 | 98.2 | 89.4 | 98.2 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-56a46f64.pth) | - | +| 32 | short-side 320 | UniFormerV2-L/14@336\* | clip-kinetics710 | 89.5 | 98.4 | 89.7 | 98.3 | 89.5 | 98.4 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-1dd7650f.pth) | - | ### Kinetics-600 -| uniform sampling | resolution | backbone | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | -| :--------------: | :--------: | :------------------: | :------: | :------: | 
:---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :-----------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| 8 | Raw | UniFormerV2-B/16 | 86.4 | 97.3 | 86.1 | 97.2 | 85.5 | 97.0 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-c62c4da4.pth) | -| 8 | Raw | UniFormerV2-L/14 | 89.0 | 98.3 | 89.0 | 98.2 | 87.5 | 98.0 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-cf88e4c2.pth) | -| 16 | Raw | UniFormerV2-L/14 | 89.4 | 98.3 | 89.4 | 98.3 | 87.8 | 98.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb_20221219-38ff0e3e.pth) | -| 32 | Raw | UniFormerV2-L/14 | 89.2 | 98.3 | 89.5 | 98.3 | 87.7 | 98.1 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-d450d071.pth) | -| 32 | Raw | UniFormerV2-L/14@336 | 89.8 | 98.5 | 89.9 | 98.5 | 88.8 | 98.3 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-f984f5d2.pth) | +| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :--------------: | :--------: | :--------------------: | :--------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | 
:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710 | 86.4 | 97.3 | 86.1 | 97.2 | - | - | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb_20230313-544f06f0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.0 | 98.3 | 89.0 | 98.2 | 87.5 | 98.0 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-cf88e4c2.pth) | - | +| 16 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.4 | 98.3 | 89.4 | 98.3 | 87.8 | 98.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb_20221219-38ff0e3e.pth) | - | +| 32 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.2 | 98.3 | 89.5 | 98.3 | 87.7 | 98.1 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-d450d071.pth) | - | +| 32 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710 | 89.8 | 98.5 | 89.9 | 98.5 | 88.8 | 98.3 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-f984f5d2.pth) | - | ### Kinetics-700 -| uniform sampling | resolution | backbone | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | -| :--------------: | :--------: | :------------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | 
:---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :-----------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| 8 | Raw | UniFormerV2-B/16 | 76.3 | 92.9 | 76.3 | 92.7 | 75.1 | 92.5 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-8a7c4ac4.pth) | -| 8 | Raw | UniFormerV2-L/14 | 80.8 | 95.2 | 80.8 | 95.4 | 79.4 | 94.8 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-bfb9f401.pth) | -| 16 | Raw | UniFormerV2-L/14 | 81.2 | 95.6 | 81.2 | 95.6 | 79.2 | 95.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb_20221219-745209d2.pth) | -| 32 | Raw | UniFormerV2-L/14 | 81.4 | 95.7 | 81.5 | 95.7 | 79.8 | 95.3 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-eebe7056.pth) | -| 32 | Raw | UniFormerV2-L/14@336 | 82.1 | 96.0 | 82.1 | 96.1 | 80.6 | 95.6 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-95cf9046.pth) | +| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :--------------: | :--------: | :--------------------: | :--------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | 
:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16 | clip | 75.9 | 92.9 | 75.8 | 92.8 | - | - | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb_20230313-f02e48ad.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.log) | +| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710 | 76.3 | 92.9 | 76.3 | 92.7 | - | - | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb_20230313-69070837.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 80.8 | 95.2 | 80.8 | 95.4 | 79.4 | 94.8 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-bfb9f401.pth) | - | +| 16 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 81.2 | 95.6 | 81.2 | 95.6 | 79.2 | 95.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb_20221219-745209d2.pth) | - | +| 32 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 81.4 | 95.7 | 81.5 | 95.7 | 79.8 | 95.3 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-eebe7056.pth) | - | +| 32 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710 | 82.1 | 96.0 | 82.1 | 96.1 | 80.6 | 95.6 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-95cf9046.pth) | - | ### MiTv1 -| uniform sampling | resolution | backbone | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | testing protocol | FLOPs | params | config | ckpt | -| :--------------: | :--------: | :------------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :--------------: | :---: | :----: | :------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| 8 | Raw | UniFormerV2-B/16 | 42.7 | 71.6 | 42.6 | 71.7 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-fddbc786.pth) | -| 8 | Raw | UniFormerV2-L/14 | 47.0 | 76.1 | 47.0 | 76.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-882c0598.pth) | -| 8 | Raw | UniFormerV2-L/14@336 | 47.7 | 76.8 | 47.8 | 76.0 | 4 clips x 3 crop | 1.6T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-9020986e.pth) | +| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :--------------: | :--------: | :--------------------: | :--------------------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :--------------: | :---: | :----: | :------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | 
:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710-kinetics400 | 42.3 | 71.5 | 42.6 | 71.7 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb_20230313-a6f4a567.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710-kinetics400 | 47.0 | 76.1 | 47.0 | 76.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-882c0598.pth) | - | +| 8 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710-kinetics400 | 47.7 | 76.8 | 47.8 | 76.0 | 4 clips x 3 crop | 1.6T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-9020986e.pth) | - | ### Kinetics-710 -| uniform sampling | resolution | backbone | config | ckpt | -| :--------------: | :--------: | :------------------: | :----------------------------------------------------------------------------: | :--------------------------------------------------------------------------: | -| 8 | Raw | UniFormerV2-B/16 | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth) | -| 8 | Raw | UniFormerV2-L/14 | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb_20221219-bfaae587.pth) | -| 8 | Raw | UniFormerV2-L/14@336 | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb_20221219-55878cdc.pth) | +| uniform sampling | resolution | backbone | pretrain | config | ckpt | +| :--------------: | :--------: | :--------------------: | :------: | :-----------------------------------------------------------------------: | :---------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16\* | clip | 
[config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth) | +| 8 | Raw | UniFormerV2-L/14\* | clip | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb_20221219-bfaae587.pth) | +| 8 | Raw | UniFormerV2-L/14@336\* | clip | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb_20221219-55878cdc.pth) | -The models are ported from the repo [UniFormerV2](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) and tested on our data. Currently, we only support the testing of UniFormerV2 models, training will be available soon. +The models with * are ported from the repo [UniFormerV2](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) and tested on our data. Due to computational limitations, we only provide a reliable training config for the base model (i.e., UniFormerV2-B/16). 1. The values in columns named after "reference" are the results of the original repo. 2. The values in `top1/5 acc` are tested on the same data list as the original repo, and the label map is provided by [UniFormerV2](https://drive.google.com/drive/folders/17VB-XdF3Kfr9ORmnGyXCxTMs86n0L4QL). diff --git a/configs/recognition/uniformerv2/k710_channel_map/label_map_k710.txt b/configs/recognition/uniformerv2/k710_channel_map/label_map_k710.txt new file mode 100644 index 0000000000..150f3447b4 --- /dev/null +++ b/configs/recognition/uniformerv2/k710_channel_map/label_map_k710.txt @@ -0,0 +1,710 @@ +riding a bike 0 +marching 1 +dodgeball 2 +playing cymbals 3 +checking tires 4 +roller skating 5 +tasting beer 6 +clapping 7 +drawing 8 +juggling fire 9 +bobsledding 10 +petting animal (not cat) 11 +spray painting 12 +training dog 13 +eating watermelon 14 +building cabinet 15 +applauding 16 +playing harp 17 +inflating balloons 18 +sled dog racing 19 +wrestling 20 +pole vault 21 +hurling (sport) 22 +riding scooter 23 +shearing sheep 24 +sweeping floor 25 +eating carrots 26 +skateboarding 27 +dunking basketball 28 +disc golfing 29 +eating spaghetti 30 +playing flute 31 +riding mechanical bull 32 +making sushi 33 +trapezing 34 +picking apples 35 +stretching leg 36 +playing ukulele 37 +tying necktie 38 +skydiving 39 +playing cello 40 +jumping into pool 41 +shooting goal (soccer) 42 +trimming trees 43 +bookbinding 44 +ski jumping 45 +walking the dog 46 +riding unicycle 47 +shaving head 48 +hopscotch 49 +playing piano 50 +parasailing 51 +bartending 52 +kicking field goal 53 +finger snapping 54 +dining 55 +yawning 56 +peeling potatoes 57 +canoeing or kayaking 58 +front raises 59 +laughing 60 +dancing macarena 61 +digging 62 +reading newspaper 63 +hitting baseball 64 +clay pottery making 65 +exercising with an exercise ball 66 +playing saxophone 67 +shooting basketball 68 +washing hair 69 +lunge 70 +brushing hair 71 +curling hair 72 +kitesurfing 73 +tapping guitar 74 +bending back 75 +skipping rope 76 +situp 77 +folding paper 78 +cracking neck 79 +assembling computer 80 +cleaning gutters 81 +blowing out candles 82 +shaking hands
83 +dancing gangnam style 84 +windsurfing 85 +tap dancing 86 +skiing mono 87 +bandaging 88 +push up 89 +doing nails 90 +punching person (boxing) 91 +bouncing on trampoline 92 +scrambling eggs 93 +singing 94 +cleaning floor 95 +krumping 96 +drumming fingers 97 +snowmobiling 98 +gymnastics tumbling 99 +headbanging 100 +catching or throwing frisbee 101 +riding elephant 102 +bee keeping 103 +feeding birds 104 +snatch weight lifting 105 +mowing lawn 106 +fixing hair 107 +playing trumpet 108 +flying kite 109 +crossing river 110 +swinging legs 111 +sanding floor 112 +belly dancing 113 +sneezing 114 +clean and jerk 115 +side kick 116 +filling eyebrows 117 +shuffling cards 118 +recording music 119 +cartwheeling 120 +feeding fish 121 +folding clothes 122 +water skiing 123 +tobogganing 124 +blowing leaves 125 +smoking 126 +unboxing 127 +tai chi 128 +waxing legs 129 +riding camel 130 +slapping 131 +tossing salad 132 +capoeira 133 +playing cards 134 +playing organ 135 +playing violin 136 +playing drums 137 +tapping pen 138 +vault 139 +shoveling snow 140 +playing tennis 141 +getting a tattoo 142 +making a sandwich 143 +making tea 144 +grinding meat 145 +squat 146 +eating doughnuts 147 +ice fishing 148 +snowkiting 149 +kicking soccer ball 150 +playing controller 151 +giving or receiving award 152 +welding 153 +throwing discus 154 +throwing axe 155 +ripping paper 156 +swimming butterfly stroke 157 +air drumming 158 +blowing nose 159 +hockey stop 160 +taking a shower 161 +bench pressing 162 +planting trees 163 +pumping fist 164 +climbing tree 165 +tickling 166 +high kick 167 +waiting in line 168 +slacklining 169 +tango dancing 170 +hurdling 171 +carrying baby 172 +celebrating 173 +sharpening knives 174 +passing American football (in game) 175 +headbutting 176 +playing recorder 177 +brush painting 178 +person collecting garbage 179 +robot dancing 180 +shredding paper 181 +pumping gas 182 +rock climbing 183 +hula hooping 184 +braiding hair 185 +opening present 186 +texting 187 +decorating the christmas tree 188 +answering questions 189 +playing keyboard 190 +writing 191 +bungee jumping 192 +smelling feet 193 +eating burger 194 +playing accordion 195 +making pizza 196 +playing volleyball 197 +tasting food 198 +pushing cart 199 +spinning poi 200 +cleaning windows 201 +arm wrestling 202 +changing oil 203 +swimming breast stroke 204 +tossing coin 205 +deadlifting 206 +hoverboarding 207 +cutting watermelon 208 +cheerleading 209 +snorkeling 210 +washing hands 211 +eating cake 212 +pull ups 213 +surfing water 214 +eating hotdog 215 +holding snake 216 +playing harmonica 217 +ironing 218 +cutting nails 219 +golf chipping 220 +shot put 221 +hugging (not baby) 222 +playing clarinet 223 +faceplanting 224 +trimming or shaving beard 225 +drinking shots 226 +riding mountain bike 227 +tying bow tie 228 +swinging on something 229 +skiing crosscountry 230 +unloading truck 231 +cleaning pool 232 +jogging 233 +ice climbing 234 +mopping floor 235 +making the bed 236 +diving cliff 237 +washing dishes 238 +grooming dog 239 +weaving basket 240 +frying vegetables 241 +stomping grapes 242 +moving furniture 243 +cooking sausages (not on barbeque) 244 +doing laundry 245 +dyeing hair 246 +knitting 247 +reading book 248 +baby waking up 249 +punching bag 250 +surfing crowd 251 +cooking chicken 252 +pushing car 253 +springboard diving 254 +swing dancing 255 +massaging legs 256 +beatboxing 257 +breading or breadcrumbing 258 +somersaulting 259 +brushing teeth 260 +stretching arm 261 +juggling balls 262 +massaging person's head 263 +eating 
ice cream 264 +extinguishing fire 265 +hammer throw 266 +whistling 267 +crawling baby 268 +using remote controller (not gaming) 269 +playing cricket 270 +opening bottle (not wine) 271 +playing xylophone 272 +motorcycling 273 +driving car 274 +exercising arm 275 +passing American football (not in game) 276 +playing kickball 277 +sticking tongue out 278 +flipping pancake 279 +catching fish 280 +eating chips 281 +shaking head 282 +sword fighting 283 +playing poker 284 +cooking on campfire 285 +doing aerobics 286 +paragliding 287 +using segway 288 +folding napkins 289 +playing bagpipes 290 +gargling 291 +skiing slalom 292 +strumming guitar 293 +javelin throw 294 +waxing back 295 +riding or walking with horse 296 +plastering 297 +long jump 298 +parkour 299 +wrapping present 300 +egg hunting 301 +archery 302 +cleaning toilet 303 +swimming backstroke 304 +snowboarding 305 +catching or throwing baseball 306 +massaging back 307 +blowing glass 308 +playing guitar 309 +playing chess 310 +golf driving 311 +presenting weather forecast 312 +rock scissors paper 313 +high jump 314 +baking cookies 315 +using computer 316 +washing feet 317 +arranging flowers 318 +playing bass guitar 319 +spraying 320 +cutting pineapple 321 +waxing chest 322 +auctioning 323 +jetskiing 324 +sipping cup 325 +busking 326 +playing monopoly 327 +salsa dancing 328 +waxing eyebrows 329 +watering plants 330 +zumba 331 +chopping wood 332 +pushing wheelchair 333 +carving pumpkin 334 +building shed 335 +making jewelry 336 +catching or throwing softball 337 +bending metal 338 +ice skating 339 +dancing charleston 340 +abseiling 341 +climbing a rope 342 +crying 343 +cleaning shoes 344 +dancing ballet 345 +driving tractor 346 +triple jump 347 +throwing ball 348 +getting a haircut 349 +running on treadmill 350 +climbing ladder 351 +blasting sand 352 +playing trombone 353 +drop kicking 354 +country line dancing 355 +changing wheel (not on bike) 356 +feeding goats 357 +tying knot (not on a tie) 358 +setting table 359 +shaving legs 360 +kissing 361 +riding mule 362 +counting money 363 +laying bricks 364 +barbequing 365 +news anchoring 366 +smoking hookah 367 +cooking egg 368 +peeling apples 369 +yoga 370 +sharpening pencil 371 +dribbling basketball 372 +petting cat 373 +playing ice hockey 374 +milking cow 375 +shining shoes 376 +juggling soccer ball 377 +scuba diving 378 +playing squash or racquetball 379 +drinking beer 380 +sign language interpreting 381 +playing basketball 382 +breakdancing 383 +testifying 384 +making snowman 385 +golf putting 386 +playing didgeridoo 387 +biking through snow 388 +sailing 389 +jumpstyle dancing 390 +water sliding 391 +grooming horse 392 +massaging feet 393 +playing paintball 394 +making a cake 395 +bowling 396 +contact juggling 397 +applying cream 398 +playing badminton 399 +poaching eggs 400 +playing nose flute 401 +entering church 402 +closing door 403 +helmet diving 404 +doing sudoku 405 +coughing 406 +seasoning food 407 +peeling banana 408 +eating nachos 409 +waxing armpits 410 +shouting 411 +silent disco 412 +polishing furniture 413 +taking photo 414 +dealing cards 415 +putting wallpaper on wall 416 +uncorking champagne 417 +curling eyelashes 418 +brushing floor 419 +pulling espresso shot 420 +playing american football 421 +grooming cat 422 +playing checkers 423 +moving child 424 +stacking cups 425 +squeezing orange 426 +opening coconuts 427 +rolling eyes 428 +picking blueberries 429 +playing road hockey 430 +carving wood with a knife 431 +slicing onion 432 +saluting 433 +letting go of balloon 434 
+breaking glass 435 +carrying weight 436 +mixing colours 437 +moving baby 438 +blending fruit 439 +pouring milk 440 +surveying 441 +making slime 442 +sieving 443 +walking with crutches 444 +flipping bottle 445 +playing billiards 446 +arresting 447 +listening with headphones 448 +spinning plates 449 +carving marble 450 +cutting cake 451 +shoot dance 452 +being excited 453 +petting horse 454 +splashing water 455 +filling cake 456 +stacking dice 457 +checking watch 458 +treating wood 459 +laying decking 460 +shooting off fireworks 461 +pouring wine 462 +pretending to be a statue 463 +steering car 464 +playing rounders 465 +looking in mirror 466 +jumping sofa 467 +lighting candle 468 +walking on stilts 469 +crocheting 470 +playing piccolo 471 +vacuuming car 472 +high fiving 473 +playing shuffleboard 474 +chasing 475 +pulling rope (game) 476 +being in zero gravity 477 +sanding wood 478 +decoupage 479 +using megaphone 480 +making latte art 481 +ski ballet 482 +playing oboe 483 +bouncing ball (not juggling) 484 +playing mahjong 485 +herding cattle 486 +swimming with sharks 487 +milking goat 488 +swimming with dolphins 489 +metal detecting 490 +playing slot machine 491 +polishing metal 492 +throwing tantrum 493 +lawn mower racing 494 +laying stone 495 +cutting orange 496 +skipping stone 497 +pouring beer 498 +making bubbles 499 +jaywalking 500 +leatherworking 501 +card stacking 502 +putting on eyeliner 503 +card throwing 504 +chewing gum 505 +falling off bike 506 +repairing puncture 507 +dumpster diving 508 +tiptoeing 509 +sleeping 510 +using circular saw 511 +cracking knuckles 512 +pinching 513 +chiseling wood 514 +playing rubiks cube 515 +weaving fabric 516 +fencing (sport) 517 +sword swallowing 518 +lighting fire 519 +vacuuming floor 520 +combing hair 521 +building lego 522 +playing pinball 523 +fly tying 524 +playing lute 525 +opening door 526 +waving hand 527 +rolling pastry 528 +chiseling stone 529 +threading needle 530 +playing dominoes 531 +opening wine bottle 532 +playing with trains 533 +steer roping 534 +playing field hockey 535 +separating eggs 536 +sewing 537 +talking on cell phone 538 +needle felting 539 +pushing wheelbarrow 540 +using a paint roller 541 +playing netball 542 +lifting hat 543 +massaging neck 544 +blowing bubble gum 545 +walking through snow 546 +docking boat 547 +clam digging 548 +marriage proposal 549 +packing 550 +sausage making 551 +licking 552 +scrapbooking 553 +flint knapping 554 +lock picking 555 +putting on lipstick 556 +sawing wood 557 +playing hand clapping games 558 +geocaching 559 +looking at phone 560 +making cheese 561 +poking bellybutton 562 +contorting 563 +fixing bicycle 564 +using a microscope 565 +using a wrench 566 +doing jigsaw puzzle 567 +making horseshoes 568 +cooking scallops 569 +square dancing 570 +getting a piercing 571 +playing ocarina 572 +making paper aeroplanes 573 +playing scrabble 574 +visiting the zoo 575 +crossing eyes 576 +jumping bicycle 577 +throwing water balloon 578 +bodysurfing 579 +pirouetting 580 +luge 581 +spelunking 582 +watching tv 583 +attending conference 584 +curling (sport) 585 +directing traffic 586 +swimming front crawl 587 +ice swimming 588 +battle rope training 589 +putting on mascara 590 +bouncing on bouncy castle 591 +smoking pipe 592 +pillow fight 593 +putting on sari 594 +calligraphy 595 +roasting pig 596 +cracking back 597 +shopping 598 +burping 599 +using bagging machine 600 +staring 601 +shucking oysters 602 +blowdrying hair 603 +smashing 604 +playing laser tag 605 +wading through mud 606 +rope pushdown 607 
+preparing salad 608 +making balloon shapes 609 +tagging graffiti 610 +adjusting glasses 611 +using a power drill 612 +trimming shrubs 613 +popping balloons 614 +playing pan pipes 615 +using puppets 616 +arguing 617 +backflip (human) 618 +riding snow blower 619 +hand washing clothes 620 +calculating 621 +gospel singing in church 622 +standing on hands 623 +tasting wine 624 +shaping bread dough 625 +wading through water 626 +falling off chair 627 +throwing snowballs 628 +building sandcastle 629 +land sailing 630 +tying shoe laces 631 +jumping jacks 632 +wood burning (art) 633 +putting on foundation 634 +putting on shoes 635 +cumbia 636 +archaeological excavation 637 +mountain climber (exercise) 638 +assembling bicycle 639 +head stand 640 +cutting apple 641 +shuffling feet 642 +bottling 643 +breathing fire 644 +using inhaler 645 +historical reenactment 646 +hugging baby 647 +mushroom foraging 648 +delivering mail 649 +laying tiles 650 +using atm 651 +chopping meat 652 +tightrope walking 653 +mosh pit dancing 654 +photobombing 655 +coloring in 656 +huddling 657 +playing gong 658 +laying concrete 659 +breaking boards 660 +acting in play 661 +base jumping 662 +tie dying 663 +using a sledge hammer 664 +playing ping pong 665 +photocopying 666 +winking 667 +waking up 668 +swinging baseball bat 669 +twiddling fingers 670 +playing polo 671 +longboarding 672 +ironing hair 673 +bathing dog 674 +moon walking 675 +playing marbles 676 +embroidering 677 +playing beer pong 678 +home roasting coffee 679 +gold panning 680 +karaoke 681 +changing gear in car 682 +raising eyebrows 683 +yarn spinning 684 +scrubbing face 685 +fidgeting 686 +planing wood 687 +cosplaying 688 +capsizing 689 +tackling 690 +shining flashlight 691 +dyeing eyebrows 692 +drooling 693 +alligator wrestling 694 +playing blackjack 695 +carving ice 696 +playing maracas 697 +opening refrigerator 698 +throwing knife 699 +putting in contact lenses 700 +passing soccer ball 701 +casting fishing line 702 +sucking lolly 703 +installing carpet 704 +bulldozing 705 +roasting marshmallows 706 +playing darts 707 +chopping vegetables 708 +bull fighting 709 diff --git a/configs/recognition/uniformerv2/k710_channel_map/map_k400.json b/configs/recognition/uniformerv2/k710_channel_map/map_k400.json new file mode 100644 index 0000000000..f97fa4d49f --- /dev/null +++ b/configs/recognition/uniformerv2/k710_channel_map/map_k400.json @@ -0,0 +1 @@ +[341, 158, 189, 16, 398, 302, 202, 318, 80, 323, 249, 315, 18, 88, 365, 52, 257, 103, 113, 162, 75, 338, 388, 352, 308, 125, 159, 82, 10, 44, 92, 396, 185, 258, 383, 178, 71, 260, 15, 335, 192, 326, 58, 133, 172, 120, 334, 280, 306, 101, 337, 173, 203, 356, 4, 209, 332, 7, 65, 115, 95, 81, 232, 344, 303, 201, 342, 351, 165, 397, 252, 368, 285, 244, 363, 355, 79, 268, 110, 343, 72, 219, 321, 208, 345, 340, 84, 61, 206, 188, 62, 55, 29, 237, 2, 286, 245, 90, 8, 372, 325, 380, 226, 274, 346, 354, 97, 28, 246, 194, 212, 26, 281, 147, 215, 264, 30, 14, 301, 275, 66, 265, 224, 104, 121, 357, 117, 54, 107, 279, 109, 122, 289, 78, 59, 241, 179, 291, 349, 142, 152, 220, 311, 386, 145, 239, 392, 99, 266, 100, 176, 314, 167, 64, 160, 216, 49, 207, 222, 184, 171, 22, 234, 148, 339, 218, 294, 324, 233, 262, 9, 377, 41, 390, 53, 150, 361, 73, 247, 96, 60, 364, 298, 70, 395, 143, 236, 336, 196, 385, 33, 144, 1, 307, 393, 256, 263, 375, 235, 273, 243, 106, 366, 271, 186, 287, 51, 299, 175, 276, 369, 57, 11, 373, 35, 163, 297, 195, 399, 290, 382, 319, 134, 40, 310, 223, 151, 270, 3, 387, 137, 31, 309, 217, 17, 374, 190, 277, 327, 135, 
394, 50, 284, 177, 67, 379, 141, 353, 108, 37, 136, 197, 272, 21, 312, 213, 164, 182, 250, 91, 89, 253, 199, 333, 248, 63, 119, 0, 130, 102, 32, 227, 362, 296, 23, 47, 156, 180, 183, 313, 5, 350, 389, 328, 112, 93, 378, 359, 83, 282, 174, 371, 48, 360, 24, 376, 68, 42, 221, 140, 181, 118, 116, 381, 94, 77, 27, 45, 87, 230, 292, 76, 39, 169, 131, 19, 126, 367, 105, 114, 193, 210, 305, 149, 98, 259, 200, 12, 320, 254, 146, 278, 242, 261, 36, 293, 251, 214, 25, 304, 204, 157, 255, 111, 229, 283, 128, 161, 170, 86, 74, 138, 6, 198, 384, 187, 155, 348, 154, 166, 124, 205, 132, 13, 34, 225, 43, 347, 228, 358, 38, 127, 231, 316, 269, 288, 139, 168, 46, 238, 317, 69, 211, 123, 391, 330, 295, 322, 329, 129, 240, 153, 267, 85, 300, 20, 191, 56, 370, 331] diff --git a/configs/recognition/uniformerv2/k710_channel_map/map_k600.json b/configs/recognition/uniformerv2/k710_channel_map/map_k600.json new file mode 100644 index 0000000000..f0d3b1b0e9 --- /dev/null +++ b/configs/recognition/uniformerv2/k710_channel_map/map_k600.json @@ -0,0 +1 @@ +[341, 661, 611, 158, 694, 189, 16, 398, 637, 302, 617, 202, 318, 639, 80, 584, 323, 618, 315, 88, 365, 52, 662, 674, 589, 257, 103, 113, 162, 75, 338, 388, 352, 603, 545, 308, 125, 159, 82, 10, 579, 44, 643, 591, 92, 396, 185, 258, 383, 660, 644, 178, 71, 260, 15, 522, 629, 335, 709, 705, 192, 599, 326, 621, 595, 58, 133, 689, 502, 504, 172, 120, 696, 334, 702, 280, 306, 101, 337, 173, 682, 203, 356, 4, 209, 505, 529, 514, 652, 708, 332, 548, 7, 65, 115, 81, 232, 344, 303, 201, 342, 351, 165, 656, 521, 397, 563, 368, 285, 244, 569, 688, 363, 355, 597, 512, 79, 268, 576, 110, 343, 636, 585, 72, 641, 219, 496, 321, 208, 345, 340, 84, 61, 206, 188, 649, 55, 586, 29, 237, 547, 2, 286, 567, 245, 90, 8, 372, 226, 274, 346, 693, 354, 97, 508, 28, 692, 246, 194, 212, 26, 281, 147, 215, 264, 30, 14, 301, 677, 66, 265, 224, 506, 627, 104, 121, 357, 517, 686, 54, 564, 107, 554, 279, 524, 109, 122, 289, 78, 59, 241, 559, 349, 571, 142, 152, 680, 220, 311, 386, 622, 145, 239, 392, 99, 266, 620, 640, 100, 176, 314, 167, 646, 64, 160, 216, 679, 49, 207, 657, 222, 647, 184, 171, 22, 234, 148, 339, 588, 18, 704, 218, 673, 294, 500, 324, 233, 262, 9, 377, 577, 41, 632, 390, 681, 53, 150, 361, 73, 247, 96, 630, 60, 494, 364, 659, 495, 650, 501, 552, 543, 519, 555, 298, 672, 560, 581, 70, 395, 143, 609, 499, 561, 568, 336, 573, 196, 385, 33, 144, 236, 1, 549, 307, 393, 256, 544, 263, 375, 675, 235, 654, 273, 638, 243, 106, 648, 539, 366, 271, 526, 186, 698, 532, 550, 287, 51, 299, 175, 276, 701, 369, 57, 179, 11, 373, 655, 666, 35, 593, 513, 580, 687, 163, 297, 195, 399, 290, 382, 319, 678, 695, 40, 310, 223, 151, 270, 3, 707, 387, 531, 137, 535, 31, 658, 309, 558, 217, 17, 374, 190, 277, 605, 525, 697, 676, 327, 542, 572, 135, 394, 615, 50, 523, 665, 284, 671, 177, 515, 67, 574, 379, 141, 353, 108, 37, 136, 197, 533, 272, 562, 21, 492, 614, 498, 608, 312, 213, 164, 182, 250, 91, 89, 253, 199, 540, 333, 700, 503, 634, 556, 590, 594, 635, 683, 248, 63, 119, 507, 0, 130, 102, 32, 362, 296, 23, 619, 47, 156, 706, 596, 180, 183, 313, 5, 528, 607, 350, 389, 328, 112, 551, 557, 93, 553, 685, 378, 536, 359, 537, 83, 282, 625, 174, 371, 48, 360, 24, 691, 376, 68, 42, 598, 221, 140, 602, 118, 642, 116, 381, 94, 325, 77, 27, 45, 230, 87, 292, 76, 497, 39, 169, 131, 19, 510, 604, 193, 126, 367, 592, 105, 114, 210, 305, 149, 98, 259, 582, 200, 12, 254, 570, 146, 623, 601, 534, 278, 242, 261, 36, 703, 251, 214, 25, 304, 204, 157, 587, 255, 669, 229, 283, 518, 690, 610, 128, 538, 170, 86, 74, 138, 
6, 198, 624, 384, 187, 530, 155, 348, 154, 699, 628, 493, 578, 166, 663, 653, 509, 124, 205, 13, 34, 225, 613, 43, 347, 670, 228, 358, 38, 631, 127, 231, 565, 541, 612, 664, 566, 651, 600, 511, 645, 616, 269, 288, 520, 575, 606, 626, 168, 668, 46, 546, 238, 317, 69, 211, 583, 123, 391, 330, 527, 295, 322, 329, 129, 240, 516, 153, 267, 85, 667, 633, 300, 20, 191, 684, 56, 370, 331] diff --git a/configs/recognition/uniformerv2/k710_channel_map/map_k700.json b/configs/recognition/uniformerv2/k710_channel_map/map_k700.json new file mode 100644 index 0000000000..784fa00f71 --- /dev/null +++ b/configs/recognition/uniformerv2/k710_channel_map/map_k700.json @@ -0,0 +1 @@ +[341, 661, 611, 158, 694, 189, 16, 398, 637, 302, 617, 202, 318, 447, 639, 80, 584, 323, 249, 618, 315, 88, 365, 52, 662, 674, 589, 257, 103, 453, 477, 113, 162, 75, 338, 388, 352, 439, 603, 545, 308, 125, 159, 82, 10, 579, 44, 643, 484, 591, 92, 396, 185, 258, 383, 660, 435, 644, 178, 419, 71, 260, 15, 522, 629, 335, 705, 192, 599, 326, 621, 595, 58, 133, 689, 502, 504, 172, 436, 120, 696, 450, 334, 431, 702, 280, 306, 101, 337, 173, 682, 203, 356, 475, 4, 458, 209, 505, 529, 514, 652, 332, 548, 7, 65, 115, 81, 232, 344, 303, 201, 342, 351, 165, 403, 656, 521, 397, 563, 252, 368, 285, 244, 569, 688, 406, 363, 355, 597, 512, 79, 268, 470, 576, 110, 343, 636, 585, 418, 72, 641, 451, 219, 496, 321, 208, 345, 340, 84, 61, 206, 415, 188, 479, 649, 62, 55, 586, 29, 237, 547, 2, 286, 567, 245, 90, 405, 8, 372, 226, 274, 346, 693, 354, 97, 508, 28, 692, 246, 194, 212, 26, 281, 147, 215, 264, 409, 30, 14, 301, 677, 402, 275, 66, 265, 224, 506, 627, 104, 121, 357, 517, 686, 456, 117, 54, 564, 107, 554, 445, 279, 524, 109, 122, 289, 78, 59, 241, 291, 559, 349, 571, 142, 152, 680, 220, 311, 386, 622, 145, 422, 239, 392, 99, 266, 620, 640, 100, 176, 404, 486, 473, 314, 167, 646, 64, 160, 216, 679, 49, 207, 657, 222, 647, 184, 171, 22, 234, 148, 339, 588, 18, 704, 218, 673, 294, 500, 324, 233, 262, 9, 377, 577, 41, 632, 467, 390, 681, 53, 150, 361, 73, 247, 96, 630, 60, 494, 364, 659, 460, 495, 650, 501, 434, 552, 543, 468, 519, 448, 555, 298, 672, 560, 466, 581, 70, 395, 143, 609, 499, 561, 568, 336, 481, 573, 196, 442, 385, 33, 144, 236, 1, 549, 307, 393, 256, 544, 263, 490, 375, 488, 437, 675, 235, 654, 273, 638, 438, 424, 243, 106, 648, 539, 366, 271, 427, 526, 186, 698, 532, 550, 287, 51, 299, 175, 276, 701, 369, 408, 57, 179, 11, 373, 454, 655, 666, 35, 429, 593, 513, 580, 687, 163, 297, 195, 421, 399, 290, 382, 319, 678, 446, 695, 134, 40, 423, 310, 223, 151, 270, 3, 707, 387, 531, 137, 535, 31, 658, 309, 558, 217, 17, 374, 190, 277, 605, 525, 485, 697, 676, 327, 542, 401, 483, 572, 135, 394, 615, 50, 471, 523, 665, 284, 671, 177, 430, 465, 515, 67, 574, 474, 491, 379, 141, 353, 108, 37, 136, 197, 533, 272, 400, 562, 21, 413, 492, 614, 498, 440, 462, 608, 312, 463, 213, 420, 476, 164, 182, 250, 91, 89, 253, 199, 540, 333, 700, 503, 634, 556, 590, 594, 635, 416, 683, 248, 63, 119, 507, 0, 130, 102, 32, 362, 296, 23, 619, 47, 156, 706, 596, 180, 183, 313, 5, 428, 528, 607, 350, 389, 328, 433, 112, 478, 551, 557, 93, 553, 685, 378, 407, 536, 359, 537, 83, 282, 625, 174, 371, 48, 360, 24, 691, 376, 452, 68, 42, 461, 598, 221, 411, 140, 181, 602, 118, 642, 116, 443, 381, 412, 94, 325, 77, 27, 482, 45, 230, 87, 292, 76, 497, 39, 169, 131, 19, 510, 432, 604, 193, 126, 367, 592, 105, 114, 210, 305, 149, 98, 259, 582, 449, 200, 455, 12, 320, 254, 570, 146, 426, 425, 457, 623, 601, 534, 464, 278, 242, 261, 36, 703, 251, 214, 441, 25, 304, 204, 
157, 587, 489, 487, 255, 669, 229, 283, 518, 690, 610, 128, 414, 538, 170, 86, 74, 138, 6, 198, 624, 384, 187, 530, 155, 348, 154, 699, 628, 493, 578, 166, 663, 653, 509, 124, 205, 132, 13, 34, 459, 225, 613, 43, 347, 670, 228, 358, 38, 631, 127, 417, 231, 565, 541, 612, 664, 566, 651, 600, 511, 645, 480, 616, 269, 288, 472, 520, 575, 606, 626, 168, 668, 469, 46, 546, 444, 238, 317, 69, 211, 583, 123, 391, 330, 527, 410, 295, 322, 329, 129, 240, 516, 153, 267, 85, 667, 633, 300, 20, 191, 684, 56, 370, 331] diff --git a/configs/recognition/uniformerv2/metafile.yml b/configs/recognition/uniformerv2/metafile.yml index acd35d3443..bf99abe094 100644 --- a/configs/recognition/uniformerv2/metafile.yml +++ b/configs/recognition/uniformerv2/metafile.yml @@ -6,26 +6,49 @@ Collections: Title: "UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer" Models: - - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb - Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py + - Name: uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py In Collection: UniFormer Metadata: Architecture: UniFormerV2-B/16 + Batch Size: 32 + Pretrained: CLIP-400M + Frame: 8 + Sampling method: Uniform + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 84.3 + Top 5 Accuracy: 96.4 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb_20230313-e29fc968.pth + + - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py + In Collection: UniFormer + Metadata: + Architecture: UniFormerV2-B/16 + Batch Size: 32 Pretrained: Kinetics-710 - Resolution: short-side 320 Frame: 8 Sampling method: Uniform + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs Modality: RGB - Converted From: - Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md - Code: https://github.com/OpenGVLab/UniFormerV2 Results: - Dataset: Kinetics-400 Task: Action Recognition Metrics: Top 1 Accuracy: 85.8 Top 5 Accuracy: 97.1 - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-203d6aac.pth + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb_20230313-75be0806.pth - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb Config: 
configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py @@ -33,7 +56,7 @@ Models: Metadata: Architecture: UniFormerV2-L/14 Pretrained: Kinetics-710 - Resolution: short-side 320 + Resolution: 224x224 Frame: 8 Sampling method: Uniform Modality: RGB @@ -54,7 +77,7 @@ Models: Metadata: Architecture: UniFormerV2-L/14 Pretrained: Kinetics-710 - Resolution: short-side 320 + Resolution: 224x224 Frame: 16 Sampling method: Uniform Modality: RGB @@ -75,7 +98,7 @@ Models: Metadata: Architecture: UniFormerV2-L/14 Pretrained: Kinetics-710 - Resolution: short-side 320 + Resolution: 224x224 Frame: 32 Sampling method: Uniform Modality: RGB @@ -96,7 +119,7 @@ Models: Metadata: Architecture: UniFormerV2-L/14@336 Pretrained: Kinetics-710 - Resolution: short-side 320 + Resolution: 224x224 Frame: 32 Sampling method: Uniform Modality: RGB @@ -111,14 +134,15 @@ Models: Top 5 Accuracy: 98.4 Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-1dd7650f.pth - - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb - Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py + - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py In Collection: UniFormer Metadata: Architecture: UniFormerV2-B/16 Pretrained: Kinetics-710 Frame: 8 Sampling method: Uniform + Training Resources: 8 GPUs Modality: RGB Converted From: Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md @@ -129,7 +153,8 @@ Models: Metrics: Top 1 Accuracy: 86.4 Top 5 Accuracy: 97.3 - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-c62c4da4.pth + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb_20230313-544f06f0.pth - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py @@ -211,14 +236,15 @@ Models: Top 5 Accuracy: 98.5 Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-f984f5d2.pth - - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb - Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py + - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py In Collection: UniFormer Metadata: Architecture: UniFormerV2-B/16 - Pretrained: Kinetics-710 + Pretrained: CLIP-400M Frame: 8 Sampling method: Uniform + Training Resources: 8 GPUs Modality: RGB Converted From: Weights: 
https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md @@ -229,7 +255,30 @@ Models: Metrics: Top 1 Accuracy: 76.3 Top 5 Accuracy: 92.9 - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-8a7c4ac4.pth + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb_20230313-f02e48ad.pth + + - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py + In Collection: UniFormer + Metadata: + Architecture: UniFormerV2-B/16 + Pretrained: Kinetics-710 + Frame: 8 + Sampling method: Uniform + Training Resources: 8 GPUs + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-700 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 75.9 + Top 5 Accuracy: 92.9 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb_20230313-69070837.pth - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py @@ -353,14 +402,15 @@ Models: Code: https://github.com/OpenGVLab/UniFormerV2 Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb_20221219-55878cdc.pth - - Name: uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb - Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py + - Name: uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py In Collection: UniFormer Metadata: Architecture: UniFormerV2-B/16 Pretrained: Kinetics-710 + Kinetics-400 Frame: 8 Sampling method: Uniform + Training Resources: 16 GPUs Modality: RGB Converted From: Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md @@ -369,9 +419,10 @@ Models: - Dataset: Moments in Time V1 Task: Action Recognition Metrics: - Top 1 Accuracy: 42.7 - Top 5 Accuracy: 71.6 - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-fddbc786.pth + Top 1 Accuracy: 42.3 + Top 5 Accuracy: 71.5 + Training Log: 
https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb_20230313-a6f4a567.pth - Name: uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb Config: configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py index a4cae65831..a6e37c330a 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py @@ -23,7 +23,13 @@ n_head=12, mlp_factor=4., drop_path_rate=0., - mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-203d6aac.pth', # noqa: E501 + prefix='backbone.')), cls_head=dict( type='TimeSformerHead', dropout_ratio=0.5, @@ -38,11 +44,44 @@ # dataset settings dataset_type = 'VideoDataset' -data_root_val = 'data/mit_v1' -ann_file_test = 'data/mit_v1/val.csv' +data_root = 'data/mit/videos/training' +data_root_val = 'data/mit/videos/validation' +ann_file_train = 'data/mit/mit_train_list_videos.txt' +ann_file_val = 'data/mit/mit_val_list_videos.txt' +ann_file_test = 'data/mit/mit_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] test_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='UniformSample', clip_len=num_frames, num_clips=4, test_mode=True), @@ -53,8 +92,29 @@ dict(type='PackActionInputs') ] +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + 
persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) test_dataloader = dict( - batch_size=32, + batch_size=8, num_workers=8, persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=False), @@ -63,8 +123,44 @@ ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, - test_mode=True, - delimiter=' ')) + test_mode=True)) +val_evaluator = dict(type='AccMetric') test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') test_cfg = dict(type='TestLoop') + +base_lr = 2e-5 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1 / 20, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min_ratio=1 / 20, + by_epoch=True, + begin=5, + end=24, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=512) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py index a3eddb0d04..4e47cabb84 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py @@ -23,13 +23,26 @@ n_head=12, mlp_factor=4., drop_path_rate=0., - mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='backbone.')), cls_head=dict( - type='TimeSformerHead', + type='UniFormerHead', dropout_ratio=0.5, num_classes=400, in_channels=768, - average_clips='prob'), + average_clips='prob', + channel_map= # noqa: E251 + 'configs/recognition/uniformerv2/k710_channel_map/map_k400.json', + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='cls_head.')), data_preprocessor=dict( type='ActionDataPreprocessor', mean=[114.75, 114.75, 114.75], @@ -38,11 +51,44 @@ # dataset settings dataset_type = 'VideoDataset' -data_root_val = 'data/k400' -ann_file_test = 'data/k400/val.csv' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 
'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] test_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='UniformSample', clip_len=num_frames, num_clips=4, test_mode=True), @@ -53,8 +99,29 @@ dict(type='PackActionInputs') ] +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) test_dataloader = dict( - batch_size=32, + batch_size=8, num_workers=8, persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=False), @@ -63,8 +130,45 @@ ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, - test_mode=True, - delimiter=',')) + test_mode=True)) +val_evaluator = dict(type='AccMetric') test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') test_cfg = dict(type='TestLoop') + +base_lr = 2e-6 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.5, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=4, + eta_min_ratio=0.5, + by_epoch=True, + begin=1, + end=5, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
+auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py index 4c91589dbb..a9f6f61413 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py @@ -23,13 +23,26 @@ n_head=12, mlp_factor=4., drop_path_rate=0., - mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='backbone.')), cls_head=dict( - type='TimeSformerHead', + type='UniFormerHead', dropout_ratio=0.5, num_classes=600, in_channels=768, - average_clips='prob'), + average_clips='prob', + channel_map= # noqa: E251 + 'configs/recognition/uniformerv2/k710_channel_map/map_k600.json', + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='cls_head.')), data_preprocessor=dict( type='ActionDataPreprocessor', mean=[114.75, 114.75, 114.75], @@ -38,11 +51,44 @@ # dataset settings dataset_type = 'VideoDataset' -data_root_val = 'data/k600' -ann_file_test = 'data/k600/val.csv' +data_root = 'data/kinetics600/videos_train' +data_root_val = 'data/kinetics600/videos_val' +ann_file_train = 'data/kinetics600/kinetics600_train_list_videos.txt' +ann_file_val = 'data/kinetics600/kinetics600_val_list_videos.txt' +ann_file_test = 'data/kinetics600/kinetics600_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] test_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='UniformSample', clip_len=num_frames, num_clips=4, test_mode=True), @@ -53,8 +99,29 @@ dict(type='PackActionInputs') ] +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + 
type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) test_dataloader = dict( - batch_size=32, + batch_size=8, num_workers=8, persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=False), @@ -63,8 +130,45 @@ ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, - test_mode=True, - delimiter=',')) + test_mode=True)) +val_evaluator = dict(type='AccMetric') test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') test_cfg = dict(type='TestLoop') + +base_lr = 2e-6 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.5, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=4, + eta_min_ratio=0.5, + by_epoch=True, + begin=1, + end=5, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py index 92494df5d7..5c59ad46f4 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py @@ -23,13 +23,26 @@ n_head=12, mlp_factor=4., drop_path_rate=0., - mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='backbone.')), cls_head=dict( - type='TimeSformerHead', + type='UniFormerHead', dropout_ratio=0.5, num_classes=700, in_channels=768, - average_clips='prob'), + average_clips='prob', + channel_map= # noqa: E251 + 'configs/recognition/uniformerv2/k710_channel_map/map_k700.json', + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='cls_head.')), data_preprocessor=dict( type='ActionDataPreprocessor', mean=[114.75, 114.75, 114.75], @@ -38,11 +51,44 @@ # dataset settings dataset_type = 'VideoDataset' -data_root_val = 'data/k700' -ann_file_test = 'data/k700/val.csv' +data_root = 'data/kinetics700/videos_train' +data_root_val = 'data/kinetics700/videos_val' +ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt' +ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt' +ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + 
dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] test_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='UniformSample', clip_len=num_frames, num_clips=4, test_mode=True), @@ -53,8 +99,29 @@ dict(type='PackActionInputs') ] +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) test_dataloader = dict( - batch_size=32, + batch_size=8, num_workers=8, persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=False), @@ -63,8 +130,45 @@ ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, - test_mode=True, - delimiter=',')) + test_mode=True)) +val_evaluator = dict(type='AccMetric') test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') test_cfg = dict(type='TestLoop') + +base_lr = 2e-6 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.5, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=4, + eta_min_ratio=0.5, + by_epoch=True, + begin=1, + end=5, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
+auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics400-rgb.py new file mode 100644 index 0000000000..6e9c4f3908 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics400-rgb.py @@ -0,0 +1,163 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=True, + pretrained='ViT-B/16'), + cls_head=dict( + type='UniFormerHead', + dropout_ratio=0.5, + num_classes=400, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', 
shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1e-5 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=50, + eta_min_ratio=0.1, + by_epoch=True, + begin=5, + end=55, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics700-rgb.py new file mode 100644 index 0000000000..4a5b41d8c7 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics700-rgb.py @@ -0,0 +1,163 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=True, + pretrained='ViT-B/16'), + cls_head=dict( + type='UniFormerHead', + dropout_ratio=0.5, + num_classes=700, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics700/videos_train' +data_root_val = 'data/kinetics700/videos_val' +ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt' +ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt' +ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + 
dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1e-5 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=50, + eta_min_ratio=0.1, + by_epoch=True, + begin=5, + end=55, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/mmaction/models/backbones/uniformer.py b/mmaction/models/backbones/uniformer.py index 97ac6184c1..5773313778 100644 --- a/mmaction/models/backbones/uniformer.py +++ b/mmaction/models/backbones/uniformer.py @@ -495,7 +495,7 @@ class UniFormer(BaseModule): attn_drop_rate (float): Attention dropout rate. Defaults to 0.0. drop_path_rate (float): Stochastic depth rates. Defaults to 0.0. - clip_pretrained (bool): Whether to load pretrained CLIP visual encoder. + pretrained2d (bool): Whether to load pretrained from 2D model. Defaults to True. pretrained (str): Name of pretrained model. Defaults to None. 
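(Editor's aside, not part of the patch.) The Kinetics-710-pretrained configs above all follow the same pattern: `UniFormerHead` points `init_cfg=dict(type='Pretrained', ..., prefix='cls_head.')` at the K710 checkpoint, and a `channel_map` JSON (e.g. `map_k400.json`) tells the head which of the 710 output channels correspond to the target label set. The sketch below illustrates only that channel-selection step; it is a minimal, hypothetical example that assumes the map is a plain JSON list of K710 class indices, and the helper name `select_head_channels` is invented for illustration.

```python
# Illustrative sketch only (not part of the patch): how a Kinetics-710
# classifier head can be narrowed to a target label set (e.g. Kinetics-400)
# by indexing its weights with a JSON channel map. Assumes the map file is
# simply a list of K710 class indices for the target dataset.
import json

import torch


def select_head_channels(state_dict: dict, channel_map: str) -> dict:
    """Keep only the output channels listed in the channel-map JSON."""
    with open(channel_map) as f:
        selected = json.load(f)  # e.g. 400 indices into the 710 classes
    # Advanced indexing on dim 0 keeps only the selected class rows.
    return {k: v[selected] for k, v in state_dict.items()}


# Toy K710 head: fc weights [710, 768] and bias [710].
k710_head = {
    'fc_cls.weight': torch.randn(710, 768),
    'fc_cls.bias': torch.zeros(710),
}
# k400_head = select_head_channels(k710_head, 'map_k400.json')
# k400_head['fc_cls.weight'].shape  -> torch.Size([400, 768])
```

Under this reading, the same K710 checkpoint can seed the K400, K600 and K700 heads in the configs above because only the classifier rows matching the target classes are kept.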
@@ -519,7 +519,7 @@ def __init__( drop_rate: float = 0., attn_drop_rate: float = 0., drop_path_rate: float = 0., - clip_pretrained: bool = True, + pretrained2d: bool = True, pretrained: Optional[str] = None, init_cfg: Optional[Union[Dict, List[Dict]]] = [ dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), @@ -529,7 +529,7 @@ def __init__( super().__init__(init_cfg=init_cfg) self.pretrained = pretrained - self.clip_pretrained = clip_pretrained + self.pretrained2d = pretrained2d self.patch_embed1 = SpeicalPatchEmbed( img_size=img_size, patch_size=4, @@ -641,7 +641,7 @@ def _load_pretrained(self, pretrained: str = None) -> None: def init_weights(self): """Initialize the weights in backbone.""" - if self.clip_pretrained: + if self.pretrained2d: logger = MMLogger.get_current_instance() logger.info(f'load model from: {self.pretrained}') self._load_pretrained(self.pretrained) diff --git a/mmaction/models/backbones/uniformerv2.py b/mmaction/models/backbones/uniformerv2.py index 64b0ba8faf..14571af5bd 100644 --- a/mmaction/models/backbones/uniformerv2.py +++ b/mmaction/models/backbones/uniformerv2.py @@ -548,23 +548,24 @@ def _load_pretrained(self, pretrained: str = None) -> None: pretrained (str): Model name of pretrained CLIP visual encoder. Defaults to None. """ - if pretrained is not None: - model_path = _MODELS[pretrained] - logger.info(f'Load CLIP pretrained model from {model_path}') - state_dict = _load_checkpoint(model_path, map_location='cpu') - state_dict_3d = self.state_dict() - for k in state_dict.keys(): - if k in state_dict_3d.keys( - ) and state_dict[k].shape != state_dict_3d[k].shape: - if len(state_dict_3d[k].shape) <= 2: - logger.info(f'Ignore: {k}') - continue - logger.info(f'Inflate: {k}, {state_dict[k].shape}' + - f' => {state_dict_3d[k].shape}') - time_dim = state_dict_3d[k].shape[2] - state_dict[k] = self._inflate_weight( - state_dict[k], time_dim) - self.load_state_dict(state_dict, strict=False) + assert pretrained is not None, \ + 'please specify clip pretraied checkpoint' + + model_path = _MODELS[pretrained] + logger.info(f'Load CLIP pretrained model from {model_path}') + state_dict = _load_checkpoint(model_path, map_location='cpu') + state_dict_3d = self.state_dict() + for k in state_dict.keys(): + if k in state_dict_3d.keys( + ) and state_dict[k].shape != state_dict_3d[k].shape: + if len(state_dict_3d[k].shape) <= 2: + logger.info(f'Ignore: {k}') + continue + logger.info(f'Inflate: {k}, {state_dict[k].shape}' + + f' => {state_dict_3d[k].shape}') + time_dim = state_dict_3d[k].shape[2] + state_dict[k] = self._inflate_weight(state_dict[k], time_dim) + self.load_state_dict(state_dict, strict=False) def init_weights(self): """Initialize the weights in backbone.""" diff --git a/mmaction/models/heads/__init__.py b/mmaction/models/heads/__init__.py index 4cc8d20a4d..5a1b74a9f8 100644 --- a/mmaction/models/heads/__init__.py +++ b/mmaction/models/heads/__init__.py @@ -12,10 +12,11 @@ from .tsm_head import TSMHead from .tsn_audio_head import TSNAudioHead from .tsn_head import TSNHead +from .uniformer_head import UniFormerHead from .x3d_head import X3DHead __all__ = [ 'BaseHead', 'GCNHead', 'I3DHead', 'MViTHead', 'OmniHead', 'SlowFastHead', 'TPNHead', 'TRNHead', 'TSMHead', 'TSNAudioHead', 'TSNHead', - 'TimeSformerHead', 'X3DHead', 'RGBPoseHead' + 'TimeSformerHead', 'UniFormerHead', 'RGBPoseHead', 'X3DHead' ] diff --git a/mmaction/models/heads/uniformer_head.py b/mmaction/models/heads/uniformer_head.py new file mode 100644 index 0000000000..e83b552b93 --- /dev/null +++ 
b/mmaction/models/heads/uniformer_head.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +from mmengine.fileio import load +from mmengine.logging import MMLogger +from mmengine.runner.checkpoint import _load_checkpoint_with_prefix +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType +from .base import BaseHead + + +@MODELS.register_module() +class UniFormerHead(BaseHead): + """Classification head for UniFormer. supports loading pretrained + Kinetics-710 checkpoint to fine-tuning on other Kinetics dataset. + + A pytorch implement of: `UniFormerV2: Spatiotemporal + Learning by Arming Image ViTs with Video UniFormer + ` + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict or ConfigDict): Config for building loss. + Defaults to `dict(type='CrossEntropyLoss')`. + dropout_ratio (float): Probability of dropout layer. + Defaults to : 0.0. + channel_map (str, optional): Channel map file to selecting + channels from pretrained head with extra channels. + Defaults to None. + init_cfg (dict or ConfigDict, optional): Config to control the + initialization. Defaults to + ``[ + dict(type='TruncNormal', layer='Linear', std=0.01) + ]``. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + dropout_ratio: float = 0.0, + channel_map: Optional[str] = None, + init_cfg: Optional[dict] = dict( + type='TruncNormal', layer='Linear', std=0.02), + **kwargs) -> None: + super().__init__( + num_classes, in_channels, loss_cls, init_cfg=init_cfg, **kwargs) + self.channel_map = channel_map + self.dropout_ratio = dropout_ratio + + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + self.fc_cls = nn.Linear(self.in_channels, self.num_classes) + + def _select_channels(self, stact_dict): + selected_channels = load(self.channel_map) + for key in stact_dict: + stact_dict[key] = stact_dict[key][selected_channels] + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + if self.init_cfg['type'] == 'Pretrained': + assert self.channel_map is not None, \ + 'load cls_head weights needs to specify the channel map file' + logger = MMLogger.get_current_instance() + pretrained = self.init_cfg['checkpoint'] + logger.info(f'load pretrained model from {pretrained}') + state_dict = _load_checkpoint_with_prefix( + 'cls_head.', pretrained, map_location='cpu') + self._select_channels(state_dict) + msg = self.load_state_dict(state_dict, strict=False) + logger.info(msg) + else: + super().init_weights() + + def forward(self, x: Tensor, **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + + Returns: + Tensor: The classification scores for input samples. 
+ """ + # [N, in_channels] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels] + cls_score = self.fc_cls(x) + # [N, num_classes] + return cls_score diff --git a/tests/models/backbones/test_uniformerv2.py b/tests/models/backbones/test_uniformerv2.py index 3345892eb7..4858001c4d 100644 --- a/tests/models/backbones/test_uniformerv2.py +++ b/tests/models/backbones/test_uniformerv2.py @@ -28,6 +28,7 @@ def test_uniformerv2_backbone(): n_head=12, mlp_factor=4., drop_path_rate=0., + clip_pretrained=False, mlp_dropout=[0.5, 0.5, 0.5, 0.5]) model.init_weights() @@ -56,6 +57,7 @@ def test_uniformerv2_backbone(): n_head=12, mlp_factor=4., drop_path_rate=0., + clip_pretrained=False, mlp_dropout=[0.5, 0.5, 0.5, 0.5]) model.init_weights() diff --git a/tests/models/utils/test_gradcam.py b/tests/models/utils/test_gradcam.py index d1a39ef87c..4cc8b1b8b0 100644 --- a/tests/models/utils/test_gradcam.py +++ b/tests/models/utils/test_gradcam.py @@ -167,7 +167,7 @@ def test_csn(): recognizer = MODELS.build(config.model) recognizer.cfg = config - input_shape = (1, 1, 3, 32, 32, 32) + input_shape = (1, 1, 3, 32, 16, 16) target_layer_name = 'backbone/layer4/1/relu' _do_test_3D_models(recognizer, target_layer_name, input_shape) @@ -230,6 +230,6 @@ def test_x3d(): config.model['backbone']['pretrained'] = None recognizer = MODELS.build(config.model) recognizer.cfg = config - input_shape = (1, 1, 3, 13, 32, 32) + input_shape = (1, 1, 3, 13, 16, 16) target_layer_name = 'backbone/layer4/1/relu' _do_test_3D_models(recognizer, target_layer_name, input_shape) diff --git a/tests/utils/test_misc.py b/tests/utils/test_misc.py index e0886162a6..eeeba0d402 100644 --- a/tests/utils/test_misc.py +++ b/tests/utils/test_misc.py @@ -1,11 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. import os import os.path as osp +import platform from tempfile import TemporaryDirectory +import pytest + from mmaction.utils import frame_extract +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') def test_frame_extract(): data_prefix = osp.normpath(osp.join(osp.dirname(__file__), '../data')) video_path = osp.join(data_prefix, 'test.mp4') diff --git a/tests/visualization/test_action_visualizer.py b/tests/visualization/test_action_visualizer.py index 3c7a1db59d..c86b324af9 100644 --- a/tests/visualization/test_action_visualizer.py +++ b/tests/visualization/test_action_visualizer.py @@ -1,5 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. +import platform + import decord +import pytest import torch from mmengine.structures import LabelData @@ -7,6 +10,7 @@ from mmaction.visualization import ActionVisualizer +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') def test_visualizer(): video = decord.VideoReader('./demo/demo.mp4') video = video.get_batch(range(32)).asnumpy() diff --git a/tests/visualization/test_video_backend.py b/tests/visualization/test_video_backend.py index 0de82465ee..c5153d812d 100644 --- a/tests/visualization/test_video_backend.py +++ b/tests/visualization/test_video_backend.py @@ -1,11 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import os import os.path as osp +import platform import time from pathlib import Path from tempfile import TemporaryDirectory import decord +import pytest import torch from mmengine.structures import LabelData @@ -16,6 +18,7 @@ register_all_modules() +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') def test_local_visbackend(): video = decord.VideoReader('./demo/demo.mp4') video = video.get_batch(range(32)).asnumpy() @@ -37,6 +40,7 @@ def test_local_visbackend(): return +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') def test_tensorboard_visbackend(): video = decord.VideoReader('./demo/demo.mp4') video = video.get_batch(range(32)).asnumpy() From 5f78a1167461434cc3f68b9c37dc0fd4e9d5ca91 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 6 Apr 2023 20:46:44 +0800 Subject: [PATCH 34/36] [doc] fix docs conflicts (#2342) --- .circleci/test.yml | 4 +- .github/workflows/merge_stage_test.yml | 8 +- .github/workflows/pr_stage_test.yml | 6 +- README.md | 37 +++--- README_zh-CN.md | 16 +-- docker/Dockerfile | 2 +- docs/en/advanced_guides/dataflow.md | 2 + docs/en/advanced_guides/depoly.md | 3 + docs/en/api.rst | 140 ++++++++++++++++++++++ docs/en/conf.py | 1 + docs/en/get_started/faq.md | 2 +- docs/en/get_started/guide_to_framework.md | 2 +- docs/en/get_started/installation.md | 6 +- docs/en/get_started/overview.md | 38 +++--- docs/en/index.rst | 6 + docs/en/merge_docs.sh | 17 +-- docs/en/migration.md | 4 +- docs/en/switch_language.md | 2 +- docs/zh_cn/get_started.md | 6 +- docs/zh_cn/switch_language.md | 2 +- projects/ctrgcn/README.md | 2 +- projects/example_project/README.md | 2 +- projects/msg3d/README.md | 2 +- tests/models/utils/test_gradcam.py | 2 +- 24 files changed, 234 insertions(+), 78 deletions(-) create mode 100644 docs/en/api.rst diff --git a/.circleci/test.yml b/.circleci/test.yml index aafba494dd..8ead3de0a8 100644 --- a/.circleci/test.yml +++ b/.circleci/test.yml @@ -63,7 +63,7 @@ jobs: command: | pip install git+ssh://git@github.com/open-mmlab/mmengine.git@main pip install -U openmim - mim install 'mmcv >= 2.0.0rc1' + mim install 'mmcv >= 2.0.0' pip install git+ssh://git@github.com/open-mmlab/mmdetection.git@dev-3.x pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x pip install -r requirements.txt @@ -122,7 +122,7 @@ jobs: command: | docker exec mmaction pip install git+https://git@github.com/open-mmlab/mmengine.git@main docker exec mmaction pip install -U openmim - docker exec mmaction mim install 'mmcv >= 2.0.0rc1' + docker exec mmaction mim install 'mmcv >= 2.0.0' docker exec mmaction pip install git+https://git@github.com/open-mmlab/mmdetection.git@dev-3.x docker exec mmaction pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x docker exec mmaction pip install -r requirements.txt diff --git a/.github/workflows/merge_stage_test.yml b/.github/workflows/merge_stage_test.yml index cf1f2ed10c..60df0a1245 100644 --- a/.github/workflows/merge_stage_test.yml +++ b/.github/workflows/merge_stage_test.yml @@ -55,7 +55,7 @@ jobs: - name: Install MMCV run: | pip install -U openmim - mim install 'mmcv >= 2.0.0rc1' + mim install 'mmcv >= 2.0.0' - name: Install MMDet run: pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x - name: Install MMCls @@ -123,7 +123,7 @@ jobs: - name: Install MMCV run: | pip install -U openmim - mim install 'mmcv >= 2.0.0rc1' + mim install 'mmcv >= 2.0.0' - name: Install MMDet run: pip install 
git+https://github.com/open-mmlab/mmdetection.git@dev-3.x - name: Install MMCls @@ -187,7 +187,7 @@ jobs: run: | pip install git+https://github.com/open-mmlab/mmengine.git@main pip install -U openmim - mim install 'mmcv >= 2.0.0rc1' + mim install 'mmcv >= 2.0.0' pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x pip install -r requirements.txt @@ -225,7 +225,7 @@ jobs: run: | pip install git+https://github.com/open-mmlab/mmengine.git@main pip install -U openmim - mim install 'mmcv >= 2.0.0rc1' + mim install 'mmcv >= 2.0.0' pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x pip install -r requirements.txt diff --git a/.github/workflows/pr_stage_test.yml b/.github/workflows/pr_stage_test.yml index a0eb9d5d00..a8b5c4c7a2 100644 --- a/.github/workflows/pr_stage_test.yml +++ b/.github/workflows/pr_stage_test.yml @@ -46,7 +46,7 @@ jobs: - name: Install MMCV run: | pip install -U openmim - mim install 'mmcv >= 2.0.0rc1' + mim install 'mmcv >= 2.0.0' - name: Install MMDet run: pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x - name: Install MMCls @@ -114,7 +114,7 @@ jobs: run: | pip install git+https://github.com/open-mmlab/mmengine.git@main pip install -U openmim - mim install 'mmcv >= 2.0.0rc1' + mim install 'mmcv >= 2.0.0' pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x pip install -r requirements.txt @@ -161,7 +161,7 @@ jobs: run: | pip install git+https://github.com/open-mmlab/mmengine.git@main pip install -U openmim - mim install 'mmcv >= 2.0.0rc1' + mim install 'mmcv >= 2.0.0' pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x pip install -r requirements.txt diff --git a/README.md b/README.md index 25b703306c..f01bf8cd5f 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ -[![Documentation](https://readthedocs.org/projects/mmaction2/badge/?version=latest)](https://mmaction2.readthedocs.io/en/1.x/) +[![Documentation](https://readthedocs.org/projects/mmaction2/badge/?version=latest)](https://mmaction2.readthedocs.io/en/latest/) [![actions](https://github.com/open-mmlab/mmaction2/workflows/build/badge.svg)](https://github.com/open-mmlab/mmaction2/actions) [![codecov](https://codecov.io/gh/open-mmlab/mmaction2/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmaction2) [![PyPI](https://img.shields.io/pypi/v/mmaction2)](https://pypi.org/project/mmaction2/) @@ -25,10 +25,10 @@ [![Average time to resolve an issue](https://isitmaintained.com/badge/resolution/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues) [![Percentage of issues still open](https://isitmaintained.com/badge/open/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues) -[📘Documentation](https://mmaction2.readthedocs.io/en/1.x/) | -[🛠️Installation](https://mmaction2.readthedocs.io/en/1.x/get_started.html) | -[👀Model Zoo](https://mmaction2.readthedocs.io/en/1.x/modelzoo.html) | -[🆕Update News](https://mmaction2.readthedocs.io/en/1.x/notes/changelog.html) | +[📘Documentation](https://mmaction2.readthedocs.io/en/latest/) | +[🛠️Installation](https://mmaction2.readthedocs.io/en/latest/get_started.html) | +[👀Model Zoo](https://mmaction2.readthedocs.io/en/latest/modelzoo.html) | 
+[🆕Update News](https://mmaction2.readthedocs.io/en/latest/notes/changelog.html) | [🚀Ongoing Projects](https://github.com/open-mmlab/mmaction2/projects) | [🤔Reporting Issues](https://github.com/open-mmlab/mmaction2/issues/new/choose) @@ -58,21 +58,22 @@ English | [简体中文](/README_zh-CN.md) ## 📄 Table of Contents +- [📄 Table of Contents](#-table-of-contents) - [🥳 🚀 What's New](#--whats-new-) - [📖 Introduction](#-introduction-) - [🎁 Major Features](#-major-features-) -- [🛠️ Installation](#-installation-) +- [🛠️ Installation](#️-installation-) - [👀 Model Zoo](#-model-zoo-) - [👨‍🏫 Get Started](#-get-started-) - [🎫 License](#-license-) - [🖊️ Citation](#️-citation-) - [🙌 Contributing](#-contributing-) - [🤝 Acknowledgement](#-acknowledgement-) -- [🏗️ Projects in OpenMMLab](#-projects-in-openmmlab-) +- [🏗️ Projects in OpenMMLab](#️-projects-in-openmmlab-) ## 🥳 🚀 What's New [🔝](#-table-of-contents) -**The default branch has been switched to `1.x` from `master`, and we encourage users to migrate to the latest version with more supported models, stronger pre-training checkpoints and simpler coding. Please refer to [Migration Guide](https://mmaction2.readthedocs.io/en/1.x/migration.html) for more details.** +**The default branch has been switched to `1.x` from `master`, and we encourage users to migrate to the latest version with more supported models, stronger pre-training checkpoints and simpler coding. Please refer to [Migration Guide](https://mmaction2.readthedocs.io/en/latest/migration.html) for more details.** **Release (2023.02.10)**: v1.0.0rc3 with the following new features: @@ -112,7 +113,7 @@ It is a part of the [OpenMMLab](http://openmmlab.com/) project. MMAction2 depends on [PyTorch](https://pytorch.org/), [MMCV](https://github.com/open-mmlab/mmcv), [MMEngine](https://github.com/open-mmlab/mmengine), [MMDetection](https://github.com/open-mmlab/mmdetection) (optional, for spatial-temporal detection tasks) and [MMPose](https://github.com/open-mmlab/mmpose) (optional, for skeleton based tasks). -Please refer to [install.md](https://mmaction2.readthedocs.io/en/1.x/get_started.html) for detailed instructions. +Please refer to [install.md](https://mmaction2.readthedocs.io/en/latest/get_started.html) for detailed instructions.
Quick instructions @@ -122,9 +123,9 @@ conda create --name openmmlab python=3.8 -y conda activate open-mmlab conda install pytorch torchvision -c pytorch # This command will automatically install the latest version PyTorch and cudatoolkit, please check whether they match your environment. pip install -U openmim -mim install mmengine 'mmcv>=2.0.0rc1' -mim install "mmdet>=3.0.0rc5" # optional -mim install "mmpose>=1.0.0rc0" # optional +mim install mmengine 'mmcv>=2.0.0' +mim install "mmdet>=3.0.0" # optional +mim install "mmpose>=1.0.0" # optional git clone https://github.com/open-mmlab/mmaction2.git cd mmaction2 git checkout 1.x @@ -135,7 +136,7 @@ pip3 install -e . ## 👀 Model Zoo [🔝](#-table-of-contents) -Results and models are available in the [model zoo](https://mmaction2.readthedocs.io/en/1.x/modelzoo.html). +Results and models are available in the [model zoo](https://mmaction2.readthedocs.io/en/latest/modelzoo.html).
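(Editor's aside, not part of the patch.) After running the quick install commands above, a short import check is a convenient way to confirm that the editable install and the updated `mmcv`/`mmengine` pins were picked up. This is only a sketch and assumes the usual `__version__` attributes exposed by the OpenMMLab packages.

```python
# Quick post-install sanity check (illustrative only, not from the patch).
# Assumes the standard __version__ attributes of the OpenMMLab packages.
import mmaction
import mmcv
import mmengine

print('mmaction2:', mmaction.__version__)   # 1.x series on this branch
print('mmcv     :', mmcv.__version__)       # should satisfy >= 2.0.0
print('mmengine :', mmengine.__version__)
```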
@@ -284,11 +285,11 @@ Results and models are available in the [model zoo](https://mmaction2.readthedoc For tutorials, we provide the following user guides for basic usage: -- [Migration from MMAction2 0.X](https://mmaction2.readthedocs.io/en/1.x/migration.html) -- [Learn about Configs](https://mmaction2.readthedocs.io/en/1.x/user_guides/1_config.html#) -- [Prepare Datasets](https://mmaction2.readthedocs.io/en/1.x/user_guides/2_data_prepare.html) -- [Inference with Existing Models](https://mmaction2.readthedocs.io/en/1.x/user_guides/3_inference.html) -- [Training and Testing](https://mmaction2.readthedocs.io/en/1.x/user_guides/4_train_test.html) +- [Migration from MMAction2 0.X](https://mmaction2.readthedocs.io/en/latest/migration.html) +- [Learn about Configs](https://mmaction2.readthedocs.io/en/latest/user_guides/1_config.html#) +- [Prepare Datasets](https://mmaction2.readthedocs.io/en/latest/user_guides/2_data_prepare.html) +- [Inference with Existing Models](https://mmaction2.readthedocs.io/en/latest/user_guides/3_inference.html) +- [Training and Testing](https://mmaction2.readthedocs.io/en/latest/user_guides/4_train_test.html)
Research works built on MMAction2 by users from community diff --git a/README_zh-CN.md b/README_zh-CN.md index c2ffb09702..5b03609b9d 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -17,7 +17,7 @@ -[![Documentation](https://readthedocs.org/projects/mmaction2/badge/?version=latest)](https://mmaction2.readthedocs.io/en/1.x/) +[![Documentation](https://readthedocs.org/projects/mmaction2/badge/?version=latest)](https://mmaction2.readthedocs.io/en/latest/) [![actions](https://github.com/open-mmlab/mmaction2/workflows/build/badge.svg)](https://github.com/open-mmlab/mmaction2/actions) [![codecov](https://codecov.io/gh/open-mmlab/mmaction2/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmaction2) [![PyPI](https://img.shields.io/pypi/v/mmaction2)](https://pypi.org/project/mmaction2/) @@ -25,9 +25,9 @@ [![Average time to resolve an issue](https://isitmaintained.com/badge/resolution/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues) [![Percentage of issues still open](https://isitmaintained.com/badge/open/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues) -[📘文档](https://mmaction2.readthedocs.io/zh_CN//1.x/) | -[🛠️安装指南](https://mmaction2.readthedocs.io/zh_CN//1.x/get_started.html) | -[👀模型库](https://mmaction2.readthedocs.io/zh_CN//1.x/modelzoo.html) | +[📘文档](https://mmaction2.readthedocs.io/zh_CN/1.x/) | +[🛠️安装指南](https://mmaction2.readthedocs.io/zh_CN/1.x/get_started.html) | +[👀模型库](https://mmaction2.readthedocs.io/zh_CN/1.x/modelzoo.html) | [🆕更新](https://mmaction2.readthedocs.io/zh_CN/1.x/notes/changelog.html) | [🚀进行中项目](https://github.com/open-mmlab/mmaction2/projects) | [🤔问题反馈](https://github.com/open-mmlab/mmaction2/issues/new/choose) @@ -107,9 +107,9 @@ conda create --name openmmlab python=3.8 -y conda activate open-mmlab conda install pytorch torchvision -c pytorch # 以上命令将自动安装最新版本的 PyTorch 和 cudatoolkit,请检查它们是否和你的环境匹配 pip install -U openmim -mim install mmengine 'mmcv>=2.0.0rc1' -mim install "mmdet>=3.0.0rc5" # 可选 -mim install "mmpose>=1.0.0rc0" # 可选 +mim install mmengine 'mmcv>=2.0.0' +mim install "mmdet>=3.0.0" # 可选 +mim install "mmpose>=1.0.0" # 可选 git clone https://github.com/open-mmlab/mmaction2.git cd mmaction2 git checkout 1.x @@ -256,7 +256,7 @@ MMAction2 将跟进学界的最新进展,并支持更多算法和框架。如 ## 数据集准备 -请参考 [数据准备](https://mmaction2.readthedocs.io/en/1.x/user_guides/2_data_prepare.html) 了解数据集准备概况。所有支持的数据集都列于 [数据集清单](https://mmaction2.readthedocs.io/zh_CN/latest/supported_datasets.html) 中。 +请参考 [数据准备](https://mmaction2.readthedocs.io/en/latest/user_guides/2_data_prepare.html) 了解数据集准备概况。所有支持的数据集都列于 [数据集清单](https://mmaction2.readthedocs.io/zh_CN/latest/supported_datasets.html) 中。 ## FAQ diff --git a/docker/Dockerfile b/docker/Dockerfile index 45c82cfcb7..6622f147ea 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -17,7 +17,7 @@ RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 lib # Install MMCV RUN pip install openmim -RUN mim install mmengine "mmcv>=2.0rc1" +RUN mim install mmengine "mmcv>=2.0.0" # Install MMAction2 RUN conda clean --all diff --git a/docs/en/advanced_guides/dataflow.md b/docs/en/advanced_guides/dataflow.md index 0cc136162a..5723cc1557 100644 --- a/docs/en/advanced_guides/dataflow.md +++ b/docs/en/advanced_guides/dataflow.md @@ -1 +1,3 @@ # Dataflow in MMAction2 + +coming soon... 
diff --git a/docs/en/advanced_guides/depoly.md b/docs/en/advanced_guides/depoly.md index e69de29bb2..58e9f58ea4 100644 --- a/docs/en/advanced_guides/depoly.md +++ b/docs/en/advanced_guides/depoly.md @@ -0,0 +1,3 @@ +# How to deploy MMAction2 models + +coming soon... diff --git a/docs/en/api.rst b/docs/en/api.rst new file mode 100644 index 0000000000..4431c7734b --- /dev/null +++ b/docs/en/api.rst @@ -0,0 +1,140 @@ +mmaction.apis +-------------- +.. automodule:: mmaction.apis + :members: + +mmaction.datasets +-------------- + +datasets +^^^^^^^^^^ +.. automodule:: mmaction.datasets + :members: + +transforms +^^^^^^^^^^^^ +.. automodule:: mmaction.datasets.transforms + :members: + +mmaction.engine +-------------- + +hooks +^^^^^^^^^^ +.. automodule:: mmaction.engine.hooks + :members: + +optimizers +^^^^^^^^^^^^^^^ +.. automodule:: mmaction.engine.optimizers + :members: + +runner +^^^^^^^^^^ +.. automodule:: mmaction.engine.runner + :members: + + +mmaction.evaluation +-------------------- + +functional +^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.evaluation.functional + :members: + +metrics +^^^^^^^^^^ +.. automodule:: mmaction.evaluation.metrics + :members: + + +mmaction.models +-------------- + +backbones +^^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.backbones + :members: + +common +^^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.common + :members: + +data_preprocessors +^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.data_preprocessors + :members: + +heads +^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.heads + :members: + +localizers +^^^^^^^^^^ +.. automodule:: mmaction.models.localizers + :members: + + +losses +^^^^^^^^^^ +.. automodule:: mmaction.models.losses + :members: + +necks +^^^^^^^^^^^^ +.. automodule:: mmaction.models.necks + :members: + +roi_heads +^^^^^^^^^^^^^ +.. automodule:: mmaction.models.roi_heads + :members: + +recognizers +^^^^^^^^^^^^^ +.. automodule:: mmaction.models.seg_heads + :members: + +task_modules +^^^^^^^^^^^^^ +.. automodule:: mmaction.models.task_modules + :members: + + +utils +^^^^^^^^^^ +.. automodule:: mmaction.models.utils + :members: + + +mmaction.structures +-------------------- + +structures +^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.structures + :members: + +bbox +^^^^^^^^^^ +.. automodule:: mmaction.structures.bbox + :members: + + +mmaction.testing +---------------- +.. automodule:: mmaction.testing + :members: + +mmaction.visualization +-------------------- +.. automodule:: mmaction.visualization + :members: + +mmaction.utils +-------------- +.. automodule:: mmaction.utils + :members: diff --git a/docs/en/conf.py b/docs/en/conf.py index 6ff7f10029..ba54a05953 100644 --- a/docs/en/conf.py +++ b/docs/en/conf.py @@ -124,6 +124,7 @@ def get_version(): html_css_files = ['css/readthedocs.css'] myst_enable_extensions = ['colon_fence'] +myst_heading_anchors = 3 def builder_inited_handler(app): diff --git a/docs/en/get_started/faq.md b/docs/en/get_started/faq.md index 2cbe7787b3..7ef9cdd53e 100644 --- a/docs/en/get_started/faq.md +++ b/docs/en/get_started/faq.md @@ -30,7 +30,7 @@ If the contents here do not cover your issue, please create an issue using the [ - **"Why I got the error message 'Please install XXCODEBASE to use XXX' even if I have already installed XXCODEBASE?"** - You got that error message because our project failed to import a function or a class from XXCODEBASE. You can try to run the corresponding line to see what happens. 
One possible reason is, for some codebases in OpenMMLAB, you need to install mmcv and mmengine before you install them. You could follow this [tutorial](https://mmaction2.readthedocs.io/en/1.x/get_started.html#installation) to install them. + You got that error message because our project failed to import a function or a class from XXCODEBASE. You can try to run the corresponding line to see what happens. One possible reason is, for some codebases in OpenMMLAB, you need to install mmcv and mmengine before you install them. You could follow this [tutorial](https://mmaction2.readthedocs.io/en/latest/get_started.html#installation) to install them. ## Data diff --git a/docs/en/get_started/guide_to_framework.md b/docs/en/get_started/guide_to_framework.md index 68f8bdfd41..ab66ba196f 100644 --- a/docs/en/get_started/guide_to_framework.md +++ b/docs/en/get_started/guide_to_framework.md @@ -738,7 +738,7 @@ for epoch in range(max_epochs): ## Step6: Train and Test with MMEngine (Recommended) -For more details on training and testing, you can refer to [MMAction2 Tutorial](https://mmaction2.readthedocs.io/en/1.x/user_guides/4_train_test.html). For more information on `Runner`, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html). +For more details on training and testing, you can refer to [MMAction2 Tutorial](https://mmaction2.readthedocs.io/en/latest/user_guides/4_train_test.html). For more information on `Runner`, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html). ```python from mmengine.runner import Runner diff --git a/docs/en/get_started/installation.md b/docs/en/get_started/installation.md index 9d48be6030..294bf17443 100644 --- a/docs/en/get_started/installation.md +++ b/docs/en/get_started/installation.md @@ -45,7 +45,7 @@ We recommend that users follow our best practices to install MMAction2. However, ```shell pip install -U openmim -mim install mmengine 'mmcv>=2.0.0rc1' +mim install mmengine 'mmcv>=2.0.0' ``` **Step 2.** Install MMAction2. @@ -80,7 +80,7 @@ git checkout dev-1.x Just install with pip. ```shell -pip install "mmaction2>=1.0rc0" +pip install "mmaction2>=1.0.0" ``` ## Verify the installation @@ -167,7 +167,7 @@ This requires manually specifying a find-url based on PyTorch version and its CU For example, the following command install mmcv built for PyTorch 1.10.x and CUDA 11.3. ```shell -pip install 'mmcv>=2.0.0rc1' -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html +pip install 'mmcv>=2.0.0' -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html ``` ### Install on CPU-only platforms diff --git a/docs/en/get_started/overview.md b/docs/en/get_started/overview.md index 4857991711..0ddc07b275 100644 --- a/docs/en/get_started/overview.md +++ b/docs/en/get_started/overview.md @@ -32,66 +32,66 @@ We have prepared a wealth of documents to meet your various needs:
For the basic usage of MMAction2 -- [Installation](docs/en/get_started/installation.md) -- [Quick Run](docs/en/get_started/quick_run.md) -- [Inference](docs/en/user_guides/Inference.md) +- [Installation](installation.md) +- [Quick Run](quick_run.md) +- [Inference](../user_guides/Inference.md)
For training on supported dataset -- [learn about configs](docs/en/user_guides/config.md) -- [prepare dataset](docs/en/get_started/prepare_dataset.md) -- [Training and testing](docs/en/user_guides/train_test.md) +- [learn about configs](../user_guides/config.md) +- [prepare dataset](../user_guides/prepare_dataset.md) +- [Training and testing](../user_guides/train_test.md)
For looking for some common issues -- [FAQs](docs/en/get_started/faq.md) -- [Useful tools](docs/en/useful_tools.md) +- [FAQs](faq.md) +- [Useful tools](../useful_tools.md)
For a general understanding about MMAction2 -- [20-minute tour to MMAction2](docs/en/get_started/20-minute_tour.md) -- [Data flow in MMAction2](docs/en/advanced_guides/dataflow.md) +- [20-minute tour to MMAction2](guide_to_framework.md) +- [Data flow in MMAction2](../advanced_guides/dataflow.md)
For advanced usage about custom training -- [Customize models](docs/en/advanced_guides/customize_models.md) -- [Customize datasets](docs/en/advanced_guides/customize_dataset.md) -- [Customize data transformation and augmentation](docs/en/advanced_guides/customize_pipeline.md) -- [Customize optimizer and scheduler](docs/en/advanced_guides/customize_optimizer.md) -- [Customize logging](docs/en/advanced_guides/customize_logging.md) +- [Customize models](../advanced_guides/customize_models.md) +- [Customize datasets](../advanced_guides/customize_dataset.md) +- [Customize data transformation and augmentation](../advanced_guides/customize_pipeline.md) +- [Customize optimizer and scheduler](../advanced_guides/customize_optimizer.md) +- [Customize logging](../advanced_guides/customize_logging.md)
For supported model zoo and dataset zoo -- [Model Zoo](model_zoo/modelzoo.md) -- [Dataset Zoo](datasetzoo.md) +- [Model Zoo](../model_zoo/modelzoo.md) +- [Dataset Zoo](../datasetzoo.md)
For migration from MMAction2 0.x -- [Migration](migration.md) +- [Migration](../migration.md)
For researchers and developers who are willing to contribute to MMAction2 -- [Contribution Guide](get_started/contribution_guide.md) +- [Contribution Guide](contribution_guide.md)
diff --git a/docs/en/index.rst b/docs/en/index.rst index 73a4590f00..ed4062534e 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -46,6 +46,12 @@ You can switch between Chinese and English documents in the lower-left corner of migration.md +.. toctree:: + :maxdepth: 1 + :caption: API Reference + + api.rst + .. toctree:: :maxdepth: 1 :caption: Model Zoo diff --git a/docs/en/merge_docs.sh b/docs/en/merge_docs.sh index 5a3c86b7ac..0d3c90ef0e 100644 --- a/docs/en/merge_docs.sh +++ b/docs/en/merge_docs.sh @@ -2,16 +2,18 @@ # gather models mkdir -p model_zoo -cat ../../configs/localization/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Localization Models' | sed 's/](\/docs\/en/](../g' |sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/localization_models.md -cat ../../configs/recognition/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Recognition Models' | sed 's/](\/docs\/en/](../g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/recognition_models.md -cat ../../configs/recognition_audio/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed 's/](\/docs\/en/](../g' | sed "s/getting_started.html##t/getting_started.html#t/g" >> model_zoo/recognition_models.md -cat ../../configs/detection/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Spatio Temporal Action Detection Models' | sed 's/](\/docs\/en/](../g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/detection_models.md -cat ../../configs/skeleton/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Skeleton-based Action Recognition Models' | sed 's/](\/docs\/en/](../g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/skeleton_models.md +cat ../../configs/localization/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Localization Models' | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/latest/=g' |sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/localization_models.md +cat ../../configs/recognition/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Recognition Models' | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/latest/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/recognition_models.md +cat ../../configs/recognition_audio/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/latest/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" >> model_zoo/recognition_models.md +cat ../../configs/detection/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Spatio Temporal Action Detection Models' | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/latest/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/detection_models.md +cat ../../configs/skeleton/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Skeleton-based Action Recognition Models' | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/latest/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/skeleton_models.md # gather projects # TODO: generate table of contents for project zoo cat ../../projects/README.md > projectzoo.md -cat ../../projects/*/README.md >> projectzoo.md +cat 
../../projects/example_project/README.md >> projectzoo.md +cat ../../projects/ctrgcn/README.md >> projectzoo.md +cat ../../projects/msg3d/README.md >> projectzoo.md # gather datasets cat supported_datasets.md > datasetzoo.md @@ -38,8 +40,9 @@ sed -i 's/(\/tools\/data\/skeleton\/README.md/(#skeleton-dataset/g' datasetzoo.m cat prepare_data.md >> datasetzoo.md + sed -i 's=](/=](https://github.com/open-mmlab/mmaction2/tree/latest/=g' *.md + sed -i 's/](\/docs\/en\//](g' datasetzoo.md sed -i 's/](\/docs\/en\//](g' changelog.md - sed -i 's/](\/docs\/en\//](..g' ./get_stated/*.md sed -i 's/](\/docs\/en\//](..g' ./tutorials/*.md diff --git a/docs/en/migration.md b/docs/en/migration.md index 2917455f80..ea2ecac06c 100644 --- a/docs/en/migration.md +++ b/docs/en/migration.md @@ -4,10 +4,10 @@ MMAction2 1.x introduced major refactorings and modifications including some BC- ## New dependencies -MMAction2 1.x depends on the following packages. You are recommended to prepare a new clean environment and install them according to [install tutorial](./get_started.md) +MMAction2 1.x depends on the following packages. You are recommended to prepare a new clean environment and install them according to [install tutorial](./get_started/installation.md) 1. [MMEngine](https://github.com/open-mmlab/mmengine): MMEngine is a foundational library for training deep learning model introduced in OpenMMLab 2.0 architecture. -2. [MMCV](https://github.com/open-mmlab/mmcv): MMCV is a foundational library for computer vision. MMAction2 1.x requires `mmcv>=2.0.0rc0` which is more compact and efficient than `mmcv-full==1.x`. +2. [MMCV](https://github.com/open-mmlab/mmcv): MMCV is a foundational library for computer vision. MMAction2 1.x requires `mmcv>=2.0.0` which is more compact and efficient than `mmcv-full==2.0.0`. ## Configuration files diff --git a/docs/en/switch_language.md b/docs/en/switch_language.md index 0009eafa9e..80cf0dc571 100644 --- a/docs/en/switch_language.md +++ b/docs/en/switch_language.md @@ -1,3 +1,3 @@ -## English +## English ## 简体中文 diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started.md index b98358d166..51742edc72 100644 --- a/docs/zh_cn/get_started.md +++ b/docs/zh_cn/get_started.md @@ -46,7 +46,7 @@ conda install pytorch torchvision cpuonly -c pytorch ```shell pip install -U openmim -mim install mmengine 'mmcv>=2.0.0rc1' +mim install mmengine 'mmcv>=2.0.0' ``` **第二步** 安装 MMAction2。 @@ -80,7 +80,7 @@ git checkout dev-1.x 直接使用 pip 安装即可。 ```shell -pip install "mmaction2>=1.0rc0" +pip install "mmaction2>=1.0.0" ``` ## 验证安装 @@ -158,7 +158,7 @@ MMCV 包含 C++ 和 CUDA 扩展,因此其对 PyTorch 的依赖比较复杂。 例如,以下命令安装为 PyTorch 1.10.x 和 CUDA 11.3 构建的 mmcv。 ```shell -pip install 'mmcv>=2.0.0rc1' -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html +pip install 'mmcv>=2.0.0' -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html ``` ### 在 CPU 环境中安装 diff --git a/docs/zh_cn/switch_language.md b/docs/zh_cn/switch_language.md index 0009eafa9e..80cf0dc571 100644 --- a/docs/zh_cn/switch_language.md +++ b/docs/zh_cn/switch_language.md @@ -1,3 +1,3 @@ -## English +## English ## 简体中文 diff --git a/projects/ctrgcn/README.md b/projects/ctrgcn/README.md index 809af449f5..b62bee8d86 100644 --- a/projects/ctrgcn/README.md +++ b/projects/ctrgcn/README.md @@ -20,7 +20,7 @@ Graph convolutional networks (GCNs) have been widely used and achieved remarkabl ### Setup Environment -Please refer to [Get Started](https://mmaction2.readthedocs.io/en/1.x/get_started.html) to install MMAction2. 
+Please refer to [Get Started](https://mmaction2.readthedocs.io/en/latest/get_started.html) to install MMAction2. At first, add the current folder to `PYTHONPATH`, so that Python can find your code. Run command in the current directory to add it. diff --git a/projects/example_project/README.md b/projects/example_project/README.md index ef74fe9cbe..30cadfb8ed 100644 --- a/projects/example_project/README.md +++ b/projects/example_project/README.md @@ -8,7 +8,7 @@ according to your project. ### Setup Environment -Please refer to [Get Started](https://mmaction2.readthedocs.io/en/1.x/get_started.html) to install MMAction2. +Please refer to [Get Started](https://mmaction2.readthedocs.io/en/latest/get_started.html) to install MMAction2. At first, add the current folder to `PYTHONPATH`, so that Python can find your code. Run command in the current directory to add it. diff --git a/projects/msg3d/README.md b/projects/msg3d/README.md index 7c784f90aa..a46c800acc 100644 --- a/projects/msg3d/README.md +++ b/projects/msg3d/README.md @@ -20,7 +20,7 @@ Spatial-temporal graphs have been widely used by skeleton-based action recogniti ### Setup Environment -Please refer to [Get Started](https://mmaction2.readthedocs.io/en/1.x/get_started.html) to install MMAction2. +Please refer to [Get Started](https://mmaction2.readthedocs.io/en/latest/get_started.html) to install MMAction2. At first, add the current folder to `PYTHONPATH`, so that Python can find your code. Run command in the current directory to add it. diff --git a/tests/models/utils/test_gradcam.py b/tests/models/utils/test_gradcam.py index 4cc8b1b8b0..5fc4173be0 100644 --- a/tests/models/utils/test_gradcam.py +++ b/tests/models/utils/test_gradcam.py @@ -119,7 +119,7 @@ def test_r2plus1d(): recognizer = MODELS.build(config.model) recognizer.cfg = config - input_shape = (1, 3, 3, 8, 32, 32) + input_shape = (1, 3, 3, 8, 16, 16) target_layer_name = 'backbone/layer4/1/relu' _do_test_3D_models(recognizer, target_layer_name, input_shape) From 85e3492cdea209b673911948ebe932edf841f369 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 6 Apr 2023 21:56:38 +0800 Subject: [PATCH 35/36] Bump version to 1.0.0 (#2343) --- .github/ISSUE_TEMPLATE/1-bug-report.yml | 100 ++++++++++++++++++ .github/ISSUE_TEMPLATE/2-feature-request.yml | 33 ++++++ .github/ISSUE_TEMPLATE/3-documentation.yml | 23 ++++ .github/ISSUE_TEMPLATE/config.yml | 13 ++- .github/ISSUE_TEMPLATE/error-report.md | 56 ---------- .github/ISSUE_TEMPLATE/feature_request.md | 33 ------ .github/ISSUE_TEMPLATE/general_questions.md | 14 --- .../reimplementation_questions.md | 70 ------------ README.md | 9 +- README_zh-CN.md | 9 +- docs/en/conf.py | 2 +- docs/en/notes/changelog.md | 44 ++++++++ mmaction/version.py | 2 +- 13 files changed, 220 insertions(+), 188 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/1-bug-report.yml create mode 100644 .github/ISSUE_TEMPLATE/2-feature-request.yml create mode 100644 .github/ISSUE_TEMPLATE/3-documentation.yml delete mode 100644 .github/ISSUE_TEMPLATE/error-report.md delete mode 100644 .github/ISSUE_TEMPLATE/feature_request.md delete mode 100644 .github/ISSUE_TEMPLATE/general_questions.md delete mode 100644 .github/ISSUE_TEMPLATE/reimplementation_questions.md diff --git a/.github/ISSUE_TEMPLATE/1-bug-report.yml b/.github/ISSUE_TEMPLATE/1-bug-report.yml new file mode 100644 index 0000000000..809a23e3c9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/1-bug-report.yml @@ -0,0 +1,100 @@ +name: "🐞 Bug Report" +description: "Create a report 
to help us reproduce and fix the bug" +labels: Bug +title: "[Bug] " + +body: + - type: markdown + attributes: + value: | + If you have already identified the reason, we strongly appreciate you creating a new PR to fix it [here](https://github.com/open-mmlab/mmaction2/pulls)! + If this issue is about installing MMCV, please file an issue at [MMCV](https://github.com/open-mmlab/mmcv/issues/new/choose). + If you need our help, please fill in as much of the following form as you're able. + + **The less clear the description, the longer it will take to solve it.** + + - type: dropdown + id: version + attributes: + label: Branch + description: Which branch/version are you using? + options: + - master branch (0.x version, such as `v0.10.0`, or `dev` branch) + - 1.x branch (1.x version, such as `v1.0.0rc2`, or `dev-1.x` branch) + validations: + required: true + + - type: checkboxes + attributes: + label: Prerequisite + description: Please check the following items before creating a new issue. + options: + - label: I have searched [Issues](https://github.com/open-mmlab/mmaction2/issues) and [Discussions](https://github.com/open-mmlab/mmaction2/discussions) but cannot get the expected help. + required: true + - label: I have read the [documentation](https://mmaction2.readthedocs.io/en/latest/) but cannot get the expected help. + required: true + - label: The bug has not been fixed in the [latest version](https://github.com/open-mmlab/mmaction2). + required: true + + - type: textarea + attributes: + label: Environment + description: | + Please run `python mmaction2/utils/collect_env.py` to collect necessary environment information and copy-paste it here. + You may add additional information that may be helpful for locating the problem, such as + - How you installed PyTorch \[e.g., pip, conda, source\] + - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.) + validations: + required: true + + - type: textarea + id: description + validations: + required: true + attributes: + label: Describe the bug + description: | + Please provide a clear and concise description of what the bug is. + Preferably a simple and minimal code snippet is provided below, so that we can reproduce the error by running the code. + placeholder: | + A clear and concise description of what the bug is. + + - type: textarea + attributes: + label: Reproduces the problem - code sample + description: | + Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet. + Did you make any modifications on the code or config? Are you clear about what you have modified? + placeholder: | + ```python + # Sample code to reproduce the problem + ``` + + - type: textarea + attributes: + label: Reproduces the problem - command or script + description: | + What command or script did you run? + placeholder: | + ```shell + The command or script you run. + ``` + + - type: textarea + attributes: + label: Reproduces the problem - error message + description: | + Please provide the error message or logs you got, with the full traceback. + placeholder: | + ``` + The error message or logs you got, with the full traceback. + ``` + + - type: textarea + attributes: + label: Additional information + description: Tell us anything else you think we should know. + placeholder: | + 1. What's your expected result? + 2. What dataset did you use? + 3. What do you think might be the reason? 
diff --git a/.github/ISSUE_TEMPLATE/2-feature-request.yml b/.github/ISSUE_TEMPLATE/2-feature-request.yml new file mode 100644 index 0000000000..c32c477133 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/2-feature-request.yml @@ -0,0 +1,33 @@ +name: 🚀 Feature Request +description: Suggest an idea for this project +labels: [Feature] +title: "[Feature] " + +body: + - type: markdown + attributes: + value: | + We strongly appreciate you creating a PR to implement this feature [here](https://github.com/open-mmlab/mmaction2/pulls)! + If you need our help, please fill in as much of the following form as you're able. + + **The less clear the description, the longer it will take to solve it.** + + - type: textarea + attributes: + label: What is the problem this feature will solve? + placeholder: | + E.g., It is inconvenient when \[....\]. + validations: + required: true + + - type: textarea + attributes: + label: What is the feature? + validations: + required: true + + - type: textarea + attributes: + label: What alternatives have you considered? + description: | + Add any other context or screenshots about the feature request here. diff --git a/.github/ISSUE_TEMPLATE/3-documentation.yml b/.github/ISSUE_TEMPLATE/3-documentation.yml new file mode 100644 index 0000000000..f47353edd4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/3-documentation.yml @@ -0,0 +1,23 @@ +name: 📚 Documentation Issue +description: Report an issue related to https://mmaction2.readthedocs.io/en/latest/. +labels: "Documentation" +title: "[Docs] " + +body: +- type: textarea + attributes: + label: The doc issue + description: > + A clear and concise description of what content in https://mmaction2.readthedocs.io/en/latest/ is an issue. + validations: + required: true + +- type: textarea + attributes: + label: Suggest a potential alternative/fix + description: > + Tell us how we could improve the documentation in this regard. +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index a772220430..d41e7bd45f 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,9 +1,12 @@ blank_issues_enabled: false contact_links: - - name: Common Issues - url: https://mmaction2.readthedocs.io/en/latest/faq.html - about: Check if your issue already has solutions - - name: MMAction2 Documentation - url: https://mmaction2.readthedocs.io/en/latest/ + - name: 📚 MMAction2 Documentation (官方文档) + url: https://mmaction2.readthedocs.io/en/latest about: Check if your question is answered in docs + - name: 💬 Forum (寻求帮助) + url: https://github.com/open-mmlab/mmaction2/discussions + about: Ask general usage questions and discuss with other MMAction2 community members + - name: 🌐 Explore OpenMMLab (官网) + url: https://openmmlab.com/ + about: Get to know more about OpenMMLab diff --git a/.github/ISSUE_TEMPLATE/error-report.md b/.github/ISSUE_TEMPLATE/error-report.md deleted file mode 100644 index 60206eaba2..0000000000 --- a/.github/ISSUE_TEMPLATE/error-report.md +++ /dev/null @@ -1,56 +0,0 @@ ---- -name: Error report -about: Create a report to help us improve -title: '' -labels: '' -assignees: '' ---- - -Thanks for your error report and we appreciate it a lot. -If you feel we have helped you, give us a STAR! :satisfied: - -**Checklist** - -1. I have searched related issues but cannot get the expected help. -2. The bug has not been fixed in the latest version. - -**Describe the bug** - -A clear and concise description of what the bug is.
- -**Reproduction** - -- What command or script did you run? - -``` -A placeholder for the command. -``` - -- What config did you run? - -``` -A placeholder for the config. -``` - -- Did you make any modifications on the code or config? Did you understand what you have modified? -- What dataset did you use? - -**Environment** - -1. Please run `PYTHONPATH=${PWD}:$PYTHONPATH python mmaction/utils/collect_env.py` to collect necessary environment information and paste it here. -2. You may add addition that may be helpful for locating the problem, such as - -- How you installed PyTorch \[e.g., pip, conda, source\] -- Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.) - -**Error traceback** - -If applicable, paste the error traceback here. - -``` -A placeholder for traceback. -``` - -**Bug fix** - -If you have already identified the reason, you can provide the information here. If you are willing to create a PR to fix it, please also leave a comment here and that would be much appreciated! diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md deleted file mode 100644 index 81ce7f60be..0000000000 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ /dev/null @@ -1,33 +0,0 @@ ---- -name: Feature request -about: Suggest an idea for this project -title: '' -labels: '' -assignees: '' ---- - -Thanks for your feature request and we will review and plan for it when necessary. -If you feel we have helped you, give us a STAR! :satisfied: - -**Steps** - -1. Check if the feature has been requested in the [meta issue](https://github.com/open-mmlab/mmaction2/issues/19), and if so, click thumb up button. -2. Post the feature request in the [meta issue](https://github.com/open-mmlab/mmaction2/issues/19), if it is new. - -**Describe the feature** - -**Motivation** - -A clear and concise description of the motivation of the feature. - -1. Ex1. It is inconvenient when \[....\]. -2. Ex2. There is a recent paper \[....\], which is very helpful for \[....\]. - -**Related resources** - -If there is an official code released or third-party implementations, please also provide the information here, which would be very helpful. - -**Additional context** - -Add any other context or screenshots about the feature request here. -If you would like to implement the feature and create a PR, please leave a comment here and that would be much appreciated. diff --git a/.github/ISSUE_TEMPLATE/general_questions.md b/.github/ISSUE_TEMPLATE/general_questions.md deleted file mode 100644 index 5aa583cb1c..0000000000 --- a/.github/ISSUE_TEMPLATE/general_questions.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -name: General questions -about: Ask general questions to get help -title: '' -labels: '' -assignees: '' ---- - -Before raising a question, you may need to check the following listed items. - -**Checklist** - -1. I have searched related issues but cannot get the expected help. -2. I have read the [FAQ documentation](https://mmaction2.readthedocs.io/en/latest/faq.html) but cannot get the expected help. 
diff --git a/.github/ISSUE_TEMPLATE/reimplementation_questions.md b/.github/ISSUE_TEMPLATE/reimplementation_questions.md deleted file mode 100644 index 683e5d6fa4..0000000000 --- a/.github/ISSUE_TEMPLATE/reimplementation_questions.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -name: Reimplementation Questions -about: Ask about questions during model reimplementation -title: '' -labels: reimplementation -assignees: '' ---- - -If you feel we have helped you, give us a STAR! :satisfied: - -**Notice** - -There are several common situations in the reimplementation issues as below - -1. Reimplement a model in the model zoo using the provided configs. -2. Reimplement a model in the model zoo on other dataset (e.g., custom datasets). -3. Reimplement a custom model but all the components are implemented in MMAction2. -4. Reimplement a custom model with new modules implemented by yourself. - -There are several things to do for different cases as below. - -- For case 1 & 3, please follow the steps in the following sections thus we could help to quick identify the issue. -- For case 2 & 4, please understand that we are not able to do much help here because we usually do not know the full code and the users should be responsible to the code they write. -- One suggestion for case 2 & 4 is that the users should first check whether the bug lies in the self-implemented code or the original code. For example, users can first make sure that the same model runs well on supported datasets. If you still need help, please describe what you have done and what you obtain in the issue, and follow the steps in the following sections and try as clear as possible so that we can better help you. - -**Checklist** - -1. I have searched related issues but cannot get the expected help. -2. The issue has not been fixed in the latest version. - -**Describe the issue** - -A clear and concise description of what the problem you meet and what have you done. - -**Reproduction** - -- What command or script did you run? - -``` -A placeholder for the command. -``` - -- What config dir you run? - -``` -A placeholder for the config. -``` - -- Did you make any modifications on the code or config? Did you understand what you have modified? -- What dataset did you use? - -**Environment** - -1. Please run `PYTHONPATH=${PWD}:$PYTHONPATH python mmaction/utils/collect_env.py` to collect necessary environment information and paste it here. -2. You may add addition that may be helpful for locating the problem, such as - -- How you installed PyTorch \[e.g., pip, conda, source\] -- Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.) - -**Results** - -If applicable, paste the related results here, e.g., what you expect and what you get. - -``` -A placeholder for results comparison -``` - -**Issue fix** - -If you have already identified the reason, you can provide the information here. If you are willing to create a PR to fix it, please also leave a comment here and that would be much appreciated! diff --git a/README.md b/README.md index f01bf8cd5f..064d4526f5 100644 --- a/README.md +++ b/README.md @@ -75,11 +75,12 @@ English | [简体中文](/README_zh-CN.md) **The default branch has been switched to `1.x` from `master`, and we encourage users to migrate to the latest version with more supported models, stronger pre-training checkpoints and simpler coding. 
Please refer to [Migration Guide](https://mmaction2.readthedocs.io/en/latest/migration.html) for more details.** -**Release (2023.02.10)**: v1.0.0rc3 with the following new features: +**Release (2023.04.06)**: v1.0.0 with the following new features: -- Support Action Recognition model UniFormer V1(ICLR'2022), UniFormer V2(Arxiv'2022). -- Support training MViT V2(CVPR'2022), and MaskFeat(CVPR'2022) fine-tuning. -- Add a new handy interface for inference MMAction2 models ([demo](https://github.com/open-mmlab/mmaction2/blob/dev-1.x/demo/README.md#inferencer)) +- Support RGB-PoseC3D(CVPR'2022). +- Support training UniFormer V2(Arxiv'2022). +- Support MSG3D(CVPR'2020) and CTRGCN(CVPR'2021) in projects. +- Refactor and provide more user-friendly documentation. ## 📖 Introduction [🔝](#-table-of-contents) diff --git a/README_zh-CN.md b/README_zh-CN.md index 5b03609b9d..493d2da15e 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -91,11 +91,12 @@ MMAction2 是一款基于 PyTorch 的视频理解开源工具箱,是 [OpenMMLa ## 更新记录 -**v1.0.0rc3 版本 (2023.02.10)**: +**v1.0.0 版本 (2023.04.06)**: -- 支持动作识别模型 UniFormer V1(ICLR'2022),UniFormer V2(Arxiv'2022) -- 支持训练 MViT V2(CVPR'2022)和 MaskFeat(CVPR'2022)微调 -- 为 MMAction2 模型提供统一的推理接口实现视频分析任务的快速预测 ([demo](https://github.com/open-mmlab/mmaction2/blob/dev-1.x/demo/README.md#inferencer)) +- 支持骨骼动作识别模型 RGB-PoseC3D (CVPR'2022) . +- 在 Projects 中支持 MSG3D(CVPR'2020) 和 CTRGCN(CVPR'2021). +- 支持训练 UniFormer V2(Arxiv'2022). +- 重构升级用户文档 ## 安装 diff --git a/docs/en/conf.py b/docs/en/conf.py index ba54a05953..6623d99b45 100644 --- a/docs/en/conf.py +++ b/docs/en/conf.py @@ -17,7 +17,7 @@ import pytorch_sphinx_theme -sys.path.insert(0, os.path.abspath('..')) +sys.path.insert(0, os.path.abspath('../..')) # -- Project information ----------------------------------------------------- diff --git a/docs/en/notes/changelog.md b/docs/en/notes/changelog.md index bbf5b9ffbc..b4d785bc8a 100644 --- a/docs/en/notes/changelog.md +++ b/docs/en/notes/changelog.md @@ -1,5 +1,49 @@ # Changelog +## 1.0.0 (4/6/2023) + +**Highlights** + +- Support RGB-PoseC3D(CVPR'2022). +- Support training UniFormer V2(Arxiv'2022). +- Support MSG3D(CVPR'2020) and CTRGCN(CVPR'2021) in projects. +- Refactor and provide more user-friendly documentation. + +**New Features** + +- Support RGB-PoseC3D ([2182](https://github.com/open-mmlab/mmaction2/pull/2182)) +- Support training UniFormer V2 ([2221](https://github.com/open-mmlab/mmaction2/pull/2221)) +- Support MSG3D and CTRGCN in projects. 
([2269](https://github.com/open-mmlab/mmaction2/pull/2269), [2291](https://github.com/open-mmlab/mmaction2/pull/2291)) + +**Improvements** + +- Use MMEngine to calculate FLOPs ([2300](https://github.com/open-mmlab/mmaction2/pull/2300)) +- Speed up LFB training ([2294](https://github.com/open-mmlab/mmaction2/pull/2294)) +- Support multiprocessing on AVA evaluation ([2146](https://github.com/open-mmlab/mmaction2/pull/2146)) +- Add a demo for exporting spatial-temporal detection model to ONNX ([2225](https://github.com/open-mmlab/mmaction2/pull/2225)) +- Update spatial-temporal detection related folders ([2262](https://github.com/open-mmlab/mmaction2/pull/2262)) + +**Bug Fixes** + +- Fix flip config of TSM for sth v1/v2 dataset ([#2247](https://github.com/open-mmlab/mmaction2/pull/2247)) +- Fix circle ci ([2336](https://github.com/open-mmlab/mmaction2/pull/2336), [2334](https://github.com/open-mmlab/mmaction2/pull/2334)) +- Fix accepting an unexpected argument local-rank in PyTorch 2.0 ([2320](https://github.com/open-mmlab/mmaction2/pull/2320)) +- Fix TSM config link ([2315](https://github.com/open-mmlab/mmaction2/pull/2315)) +- Fix numpy version requirement in CI ([2284](https://github.com/open-mmlab/mmaction2/pull/2284)) +- Fix NTU pose extraction script ([2246](https://github.com/open-mmlab/mmaction2/pull/2246)) +- Fix TSM-MobileNet V2 ([2332](https://github.com/open-mmlab/mmaction2/pull/2332)) +- Fix command bugs in localization tasks' README ([2244](https://github.com/open-mmlab/mmaction2/pull/2244)) +- Fix duplicate name in DecordInit and SampleAVAFrame ([2251](https://github.com/open-mmlab/mmaction2/pull/2251)) +- Fix channel order when showing video ([2308](https://github.com/open-mmlab/mmaction2/pull/2308)) +- Specify map_location to cpu when using \_load_checkpoint ([2252](https://github.com/open-mmlab/mmaction2/pull/2254)) + +**Documentation** + +- Refactor and provide more user-friendly documentation ([2341](https://github.com/open-mmlab/mmaction2/pull/2341), [2312](https://github.com/open-mmlab/mmaction2/pull/2312), [2325](https://github.com/open-mmlab/mmaction2/pull/2325)) +- Add README_zh-CN ([2252](https://github.com/open-mmlab/mmaction2/pull/2252)) +- Add social networking links ([2294](https://github.com/open-mmlab/mmaction2/pull/2294)) +- Fix sthv2 dataset annotations preparation document ([2248](https://github.com/open-mmlab/mmaction2/pull/2248)) + ## 1.0.0rc3 (2/10/2023) **Highlights** diff --git a/mmaction/version.py b/mmaction/version.py index 5a0a756926..76d189b4d2 100644 --- a/mmaction/version.py +++ b/mmaction/version.py @@ -1,6 +1,6 @@ # Copyright (c) Open-MMLab. All rights reserved. 
-__version__ = '1.0.0rc3' +__version__ = '1.0.0' def parse_version_info(version_str: str): From 7b77f29aab133f2e7c5dbc52129f1875a5263247 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Fri, 7 Apr 2023 10:29:12 +0800 Subject: [PATCH 36/36] [CI] fix circle CI (#2351) --- .circleci/test.yml | 36 ++++++++----------- .../datasets/transforms/pose_transforms.py | 2 +- 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/.circleci/test.yml b/.circleci/test.yml index 8ead3de0a8..efa9342303 100644 --- a/.circleci/test.yml +++ b/.circleci/test.yml @@ -98,10 +98,10 @@ jobs: type: string cuda: type: enum - enum: ["10.1", "10.2", "11.1"] + enum: ["11.0"] cudnn: type: integer - default: 7 + default: 8 machine: image: ubuntu-2004-cuda-11.4:202110-01 # docker_layer_caching: true @@ -114,9 +114,15 @@ jobs: docker build .circleci/docker -t mmaction:gpu --build-arg PYTORCH=<< parameters.torch >> --build-arg CUDA=<< parameters.cuda >> --build-arg CUDNN=<< parameters.cudnn >> docker run --gpus all -t -d -v /home/circleci/project:/mmaction -w /mmaction --name mmaction mmaction:gpu docker exec mmaction apt-get update + docker exec mmaction pip install "numpy==1.23" docker exec mmaction apt-get upgrade -y docker exec mmaction apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libturbojpeg pkg-config docker exec mmaction apt-get install -y libavdevice-dev libavfilter-dev libopus-dev libvpx-dev libsrtp2-dev libsndfile1 + - run: + name: Install PytorchVideo and timm + command: | + docker exec mmaction pip install timm + docker exec mmaction python -m pip install pytorchvideo - run: name: Install mmaction dependencies command: | @@ -126,21 +132,6 @@ jobs: docker exec mmaction pip install git+https://git@github.com/open-mmlab/mmdetection.git@dev-3.x docker exec mmaction pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x docker exec mmaction pip install -r requirements.txt - - when: - condition: - equal: [ "1.8.1", << parameters.torch >> ] - steps: - - run: docker exec mmaction pip install timm - - when: - condition: - equal: [ "1.6.0", << parameters.torch >> ] - steps: - - run: docker exec mmaction pip install timm==0.6.7 - - when: - condition: - equal: [ "10.2", << parameters.cuda >> ] - steps: - - run: docker exec mmaction python -m pip install pytorchvideo - run: name: Build and install command: | @@ -159,7 +150,7 @@ workflows: branches: ignore: - dev-1.x - - 1.x + - main pr_stage_test: when: not: @@ -171,7 +162,7 @@ workflows: branches: ignore: - dev-1.x - - 1.x + - main - build_cpu: name: minimum_version_cpu torch: 1.6.0 @@ -195,7 +186,7 @@ workflows: torch: 1.8.1 # Use double quotation mark to explicitly specify its type # as string instead of number - cuda: "10.2" + cuda: "11.0" requires: - hold merge_stage_test: @@ -205,11 +196,12 @@ workflows: jobs: - build_cuda: name: minimum_version_gpu - torch: 1.6.0 + torch: 1.7.1 # Use double quotation mark to explicitly specify its type # as string instead of number - cuda: "10.1" + cuda: "11.0" filters: branches: only: - dev-1.x + - main diff --git a/mmaction/datasets/transforms/pose_transforms.py b/mmaction/datasets/transforms/pose_transforms.py index 76e09dacd8..0abb987551 100644 --- a/mmaction/datasets/transforms/pose_transforms.py +++ b/mmaction/datasets/transforms/pose_transforms.py @@ -1348,7 +1348,7 @@ def transform(self, results: Dict) -> Dict: else: inds = self._get_train_clips(num_frames, clip_len) inds = np.mod(inds, num_frames) - 
results[f'{modality}_inds'] = inds.astype(np.int) + results[f'{modality}_inds'] = inds.astype(np.int32) modalities.append(modality) results['clip_len'] = self.clip_len results['frame_interval'] = None
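The last hunk swaps `inds.astype(np.int)` for `inds.astype(np.int32)`. `np.int` was only an alias for Python's built-in `int`; NumPy deprecated it in 1.20 and removed it in 1.24, so the old cast raises `AttributeError` on current NumPy (presumably also the reason the CI job above pins `numpy==1.23`). Below is a self-contained sketch of the behaviour the fixed line relies on; the index values are made up for illustration and do not come from the patch.

```python
import numpy as np

# Illustrative frame indices, similar in shape to what the sampling transform builds.
num_frames, clip_len = 48, 16
inds = np.mod(np.arange(clip_len) * 3, num_frames)

# `np.int` was removed in NumPy 1.24, so `inds.astype(np.int)` now raises
# AttributeError.  An explicit fixed-width dtype behaves the same on every
# NumPy version, which is what the patched line switches to.
clip_inds = inds.astype(np.int32)

print(clip_inds.dtype)  # int32
print(clip_inds[:5])    # [ 0  3  6  9 12]
```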