From 991356badd8c217bb82b161275b907316881b35d Mon Sep 17 00:00:00 2001 From: Kai Hu Date: Tue, 14 Feb 2023 21:56:20 -0500 Subject: [PATCH 01/36] [Doc] Add a demo for spatial-temporal detection PyTorch to ONNX (#2225) --- demo/README.md | 69 +++++ demo/demo_spatiotemporal_det_onnx.py | 382 ++++++++++++++++++++++++++ tools/deployment/export_onnx_stdet.py | 4 +- 3 files changed, 453 insertions(+), 2 deletions(-) create mode 100644 demo/demo_spatiotemporal_det_onnx.py diff --git a/demo/README.md b/demo/README.md index f3f4ba1db9..3252da35be 100644 --- a/demo/README.md +++ b/demo/README.md @@ -309,6 +309,75 @@ python demo/demo_spatiotemporal_det.py demo/demo.mp4 demo/demo_spatiotemporal_de --output-fps 6 ``` +## SpatioTemporal Action Detection ONNX Video Demo + +MMAction2 provides a demo script to predict the SpatioTemporal Action Detection result using the onnx file instead of building the PyTorch models. + +```shell +python demo/demo_spatiotemporal_det_onnx.py --video ${VIDEO_FILE} \ + [--out-filename ${OUTPUT_FILENAME}] \ + [--config ${SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \ + [--onnx-file ${SPATIOTEMPORAL_ACTION_DETECTION_ONNX_FILE}] \ + [--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \ + [--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \ + [--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \ + [--det-cat-id ${HUMAN_DETECTION_CATEGORY_ID}] \ + [--action-score-thr ${ACTION_DETECTION_SCORE_THRESHOLD}] \ + [--label-map ${LABEL_MAP}] \ + [--device ${DEVICE}] \ + [--short-side] ${SHORT_SIDE} \ + [--predict-stepsize ${PREDICT_STEPSIZE}] \ + [--output-stepsize ${OUTPUT_STEPSIZE}] \ + [--output-fps ${OUTPUT_FPS}] +``` + +Optional arguments: + +- `OUTPUT_FILENAME`: Path to the output file which is a video format. Defaults to `demo/stdet_demo.mp4`. +- `SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE`: The spatiotemporal action detection config file path. +- `SPATIOTEMPORAL_ACTION_DETECTION_ONNX_FILE`: The spatiotemporal action detection onnx file. +- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path. +- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint URL. +- `HUMAN_DETECTION_SCORE_THRESHOLD`: The score threshold for human detection. Defaults to 0.9. +- `HUMAN_DETECTION_CATEGORY_ID`: The category id for human detection. Defaults to 0. +- `ACTION_DETECTION_SCORE_THRESHOLD`: The score threshold for action detection. Defaults to 0.5. +- `LABEL_MAP`: The label map used. Defaults to `tools/data/ava/label_map.txt`. +- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Defaults to `cuda:0`. +- `SHORT_SIDE`: The short side used for frame extraction. Defaults to 256. +- `PREDICT_STEPSIZE`: Make a prediction per N frames. Defaults to 8. +- `OUTPUT_STEPSIZE`: Output 1 frame per N frames in the input video. Note that `PREDICT_STEPSIZE % OUTPUT_STEPSIZE == 0`. Defaults to 4. +- `OUTPUT_FPS`: The FPS of demo video output. Defaults to 6. + +Examples: + +Assume that you are located at `$MMACTION2` . + +1. Export an onnx file given the config file and checkpoint. + +```shell +python3 tools/deployment/export_onnx_stdet.py \ + configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \ + https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \ + --output_file slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.onnx \ + --num_frames 8 +``` + +2. 
Use the Faster RCNN as the human detector, the generated `slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.onnx` file as the action detector. Making predictions per 8 frames, and output 1 frame per 4 frames to the output video. The FPS of the output video is 4. + +```shell +python demo/demo_spatiotemporal_det_onnx.py demo/demo.mp4 demo/demo_spatiotemporal_det.mp4 \ + --config configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \ + --onnx-file slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.onnx \ + --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \ + --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \ + --det-score-thr 0.9 \ + --action-score-thr 0.5 \ + --label-map tools/data/ava/label_map.txt \ + --predict-stepsize 8 \ + --output-stepsize 4 \ + --output-fps 6 +``` + ## Inferencer MMAction2 provides a demo script to implement fast prediction for video analysis tasks based on unified inferencer interface, currently only supports action recognition task. diff --git a/demo/demo_spatiotemporal_det_onnx.py b/demo/demo_spatiotemporal_det_onnx.py new file mode 100644 index 0000000000..d1ee9f0edc --- /dev/null +++ b/demo/demo_spatiotemporal_det_onnx.py @@ -0,0 +1,382 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import copy as cp +import os +import os.path as osp +import shutil + +import cv2 +import mmcv +import mmengine +import numpy as np +import onnxruntime +import torch +from mmdet.structures.bbox import bbox2roi +from mmengine import DictAction + +from mmaction.apis import detection_inference + +try: + import moviepy.editor as mpy +except ImportError: + raise ImportError('Please install moviepy to enable output file') + +FONTFACE = cv2.FONT_HERSHEY_DUPLEX +FONTSCALE = 0.5 +FONTCOLOR = (255, 255, 255) # BGR, white +MSGCOLOR = (128, 128, 128) # BGR, gray +THICKNESS = 1 +LINETYPE = 1 + + +def hex2color(h): + """Convert the 6-digit hex string to tuple of 3 int value (RGB)""" + return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16)) + + +plate_blue = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4' +plate_blue = plate_blue.split('-') +plate_blue = [hex2color(h) for h in plate_blue] +plate_green = '004b23-006400-007200-008000-38b000-70e000' +plate_green = plate_green.split('-') +plate_green = [hex2color(h) for h in plate_green] + + +def visualize(frames, annotations, plate=plate_blue, max_num=5): + """Visualize frames with predicted annotations. + + Args: + frames (list[np.ndarray]): Frames for visualization, note that + len(frames) % len(annotations) should be 0. + annotations (list[list[tuple]]): The predicted results. + plate (str): The plate used for visualization. Default: plate_blue. + max_num (int): Max number of labels to visualize for a person box. + Default: 5. + Returns: + list[np.ndarray]: Visualized frames. 
+ """ + + assert max_num + 1 <= len(plate) + plate = [x[::-1] for x in plate] + frames_out = cp.deepcopy(frames) + nf, na = len(frames), len(annotations) + assert nf % na == 0 + nfpa = len(frames) // len(annotations) + anno = None + h, w, _ = frames[0].shape + scale_ratio = np.array([w, h, w, h]) + for i in range(na): + anno = annotations[i] + if anno is None: + continue + for j in range(nfpa): + ind = i * nfpa + j + frame = frames_out[ind] + for ann in anno: + box = ann[0] + label = ann[1] + if not len(label): + continue + score = ann[2] + box = (box * scale_ratio).astype(np.int64) + st, ed = tuple(box[:2]), tuple(box[2:]) + cv2.rectangle(frame, st, ed, plate[0], 2) + for k, lb in enumerate(label): + if k >= max_num: + break + text = abbrev(lb) + text = ': '.join([text, str(score[k])]) + location = (0 + st[0], 18 + k * 18 + st[1]) + textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE, + THICKNESS)[0] + textwidth = textsize[0] + diag0 = (location[0] + textwidth, location[1] - 14) + diag1 = (location[0], location[1] + 2) + cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1) + cv2.putText(frame, text, location, FONTFACE, FONTSCALE, + FONTCOLOR, THICKNESS, LINETYPE) + + return frames_out + + +def frame_extraction(video_path): + """Extract frames given video_path. + + Args: + video_path (str): The video_path. + """ + # Load the video, extract frames into ./tmp/video_name + target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0])) + os.makedirs(target_dir, exist_ok=True) + # Should be able to handle videos up to several hours + frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg') + vid = cv2.VideoCapture(video_path) + frames = [] + frame_paths = [] + flag, frame = vid.read() + cnt = 0 + while flag: + frames.append(frame) + frame_path = frame_tmpl.format(cnt + 1) + frame_paths.append(frame_path) + cv2.imwrite(frame_path, frame) + cnt += 1 + flag, frame = vid.read() + return frame_paths, frames + + +def load_label_map(file_path): + """Load Label Map. + + Args: + file_path (str): The file path of label map. + Returns: + dict: The label map (int -> label name). + """ + lines = open(file_path).readlines() + lines = [x.strip().split(': ') for x in lines] + return {int(x[0]): x[1] for x in lines} + + +def abbrev(name): + """Get the abbreviation of label name: + + 'take (an object) from (a person)' -> 'take ... from ...' + """ + while name.find('(') != -1: + st, ed = name.find('('), name.find(')') + name = name[:st] + '...' + name[ed + 1:] + return name + + +def pack_result(human_detection, result, img_h, img_w): + """Short summary. + + Args: + human_detection (np.ndarray): Human detection result. + result (type): The predicted label of each human proposal. + img_h (int): The image height. + img_w (int): The image width. + Returns: + tuple: Tuple of human proposal, label name and label score. 
+ """ + human_detection[:, 0::2] /= img_w + human_detection[:, 1::2] /= img_h + results = [] + if result is None: + return None + for prop, res in zip(human_detection, result): + res.sort(key=lambda x: -x[1]) + results.append( + (prop.data.cpu().numpy(), [x[0] for x in res], [x[1] + for x in res])) + return results + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMAction2 demo') + parser.add_argument('video', help='video file/url') + parser.add_argument('out_filename', help='output filename') + parser.add_argument( + '--config', + default=('configs/detection/ava_kinetics/slowonly_k700-pre-' + 'r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py'), + help='spatialtemporal detection model config file path') + parser.add_argument( + '--onnx-file', help='spatialtemporal detection onnx file path') + + parser.add_argument( + '--det-config', + default='demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py', + help='human detection config file path (from mmdet)') + parser.add_argument( + '--det-checkpoint', + default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/' + 'faster_rcnn_r50_fpn_2x_coco/' + 'faster_rcnn_r50_fpn_2x_coco_' + 'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'), + help='human detection checkpoint file/url') + parser.add_argument( + '--det-score-thr', + type=float, + default=0.9, + help='the threshold of human detection score') + parser.add_argument( + '--det-cat-id', + type=int, + default=0, + help='the category id for human detection') + parser.add_argument( + '--action-score-thr', + type=float, + default=0.5, + help='the threshold of human action score') + parser.add_argument( + '--label-map', + default='tools/data/ava/label_map.txt', + help='label map file') + parser.add_argument( + '--device', type=str, default='cuda:0', help='CPU/CUDA device option') + parser.add_argument( + '--short-side', + type=int, + default=256, + help='specify the short-side length of the image') + parser.add_argument( + '--predict-stepsize', + default=8, + type=int, + help='give out a prediction per n frames') + parser.add_argument( + '--output-stepsize', + default=4, + type=int, + help=('show one frame per n frames in the demo, we should have: ' + 'predict_stepsize % output_stepsize == 0')) + parser.add_argument( + '--output-fps', + default=6, + type=int, + help='the fps of demo video output') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. 
For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + frame_paths, original_frames = frame_extraction(args.video) + num_frame = len(frame_paths) + h, w, _ = original_frames[0].shape + + # resize frames to shortside + new_w, new_h = mmcv.rescale_size((w, h), (args.short_side, np.Inf)) + frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames] + w_ratio, h_ratio = new_w / w, new_h / h + + # Get clip_len, frame_interval and calculate center index of each clip + config = mmengine.Config.fromfile(args.config) + config.merge_from_dict(args.cfg_options) + val_pipeline = config.val_pipeline + + sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0] + clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval'] + window_size = clip_len * frame_interval + assert clip_len % 2 == 0, 'We would like to have an even clip_len' + # Note that it's 1 based here + timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2, + args.predict_stepsize) + + # Load label_map + label_map = load_label_map(args.label_map) + try: + if config['data']['train']['custom_classes'] is not None: + label_map = { + id + 1: label_map[cls] + for id, cls in enumerate(config['data']['train'] + ['custom_classes']) + } + except KeyError: + pass + + # Get Human detection results + center_frames = [frame_paths[ind - 1] for ind in timestamps] + + human_detections, _ = detection_inference(args.det_config, + args.det_checkpoint, + center_frames, + args.det_score_thr, + args.det_cat_id, args.device) + torch.cuda.empty_cache() + for i in range(len(human_detections)): + det = human_detections[i] + det[:, 0:4:2] *= w_ratio + det[:, 1:4:2] *= h_ratio + human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device) + + # Build STDET model + session = onnxruntime.InferenceSession(args.onnx_file) + + predictions = [] + + img_norm_cfg = dict( + mean=np.array(config.model.data_preprocessor.mean), + std=np.array(config.model.data_preprocessor.std), + to_rgb=False) + + print('Performing SpatioTemporal Action Detection for each clip') + assert len(timestamps) == len(human_detections) + prog_bar = mmengine.ProgressBar(len(timestamps)) + for timestamp, proposal in zip(timestamps, human_detections): + if proposal.shape[0] == 0: + predictions.append(None) + continue + + start_frame = timestamp - (clip_len // 2 - 1) * frame_interval + frame_inds = start_frame + np.arange(0, window_size, frame_interval) + frame_inds = list(frame_inds - 1) + imgs = [frames[ind].astype(np.float32) for ind in frame_inds] + _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs] + # THWC -> CTHW -> 1CTHW + input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis] + rois = bbox2roi([proposal]) + + input_feed = { + 'input_tensor': input_array, + 'rois': rois.cpu().data.numpy() + } + outputs = session.run(['cls_score'], input_feed=input_feed) + logits = outputs[0] + scores = 1 / (1 + np.exp(-logits)) + + prediction = [] + # N proposals + for i in range(proposal.shape[0]): + prediction.append([]) + # Perform action score thr + for i in range(scores.shape[1]): + if i not in label_map: + continue + for j in range(proposal.shape[0]): + if scores[j, i] > args.action_score_thr: + prediction[j].append((label_map[i], scores[j, i].item())) + predictions.append(prediction) + prog_bar.update() + + results = [] + for human_detection, prediction in zip(human_detections, predictions): + 
results.append(pack_result(human_detection, prediction, new_h, new_w)) + + def dense_timestamps(timestamps, n): + """Make it nx frames.""" + old_frame_interval = (timestamps[1] - timestamps[0]) + start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 + new_frame_inds = np.arange( + len(timestamps) * n) * old_frame_interval / n + start + return new_frame_inds.astype(np.int) + + dense_n = int(args.predict_stepsize / args.output_stepsize) + frames = [ + cv2.imread(frame_paths[i - 1]) + for i in dense_timestamps(timestamps, dense_n) + ] + print('Performing visualization') + vis_frames = visualize(frames, results) + vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames], + fps=args.output_fps) + vid.write_videofile(args.out_filename) + + tmp_frame_dir = osp.dirname(frame_paths[0]) + shutil.rmtree(tmp_frame_dir) + + +if __name__ == '__main__': + main() diff --git a/tools/deployment/export_onnx_stdet.py b/tools/deployment/export_onnx_stdet.py index fc587dbff0..ba0cd2e388 100644 --- a/tools/deployment/export_onnx_stdet.py +++ b/tools/deployment/export_onnx_stdet.py @@ -155,9 +155,9 @@ def main(): args.output_file, input_names=['input_tensor', 'rois'], output_names=['cls_score'], - export_params=False, + export_params=True, do_constant_folding=True, - verbose=True, + verbose=False, opset_version=11, dynamic_axes={ 'input_tensor': { From e036445f482d22ee9e607be88525e9a67bed5b84 Mon Sep 17 00:00:00 2001 From: vansin Date: Thu, 16 Feb 2023 17:50:13 +0800 Subject: [PATCH 02/36] Add twitter discord medium youtube link (#2228) --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index 5e85da2fd5..ab41e0f96e 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,20 @@ +
+ + + + + + + + + + + +
+ ## Introduction MMAction2 is an open-source toolbox for video understanding based on PyTorch. From 933ba42b64cc904c110c7947b90f7a4576ab726d Mon Sep 17 00:00:00 2001 From: Kai Hu Date: Thu, 16 Feb 2023 04:57:28 -0500 Subject: [PATCH 03/36] fix bug (#2227) --- tools/data/anno_txt2json.py | 4 ++-- tools/data/build_audio_features.py | 4 ++-- tools/data/build_file_list.py | 2 +- tools/data/extract_audio.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/data/anno_txt2json.py b/tools/data/anno_txt2json.py index fcefc7778e..f5b1f9f736 100644 --- a/tools/data/anno_txt2json.py +++ b/tools/data/anno_txt2json.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse -import mmcv +import mmengine def parse_args(): @@ -100,4 +100,4 @@ def lines2dictlist(lines, format): result = lines2dictlist(lines, args.format) if args.output is None: args.output = args.annofile.replace('.txt', '.json') - mmcv.dump(result, args.output) + mmengine.dump(result, args.output) diff --git a/tools/data/build_audio_features.py b/tools/data/build_audio_features.py index 05f5978083..28356a0e64 100644 --- a/tools/data/build_audio_features.py +++ b/tools/data/build_audio_features.py @@ -6,7 +6,7 @@ import sys from multiprocessing import Pool -import mmcv +import mmengine import numpy as np from scipy.io import wavfile @@ -295,7 +295,7 @@ def extract_audio_feature(wav_path, audio_tools, mel_out_dir): parser.add_argument('--part', type=str, default='1/1') args = parser.parse_args() - mmcv.mkdir_or_exist(args.spectrogram_save_path) + mmengine.mkdir_or_exist(args.spectrogram_save_path) files = glob.glob( osp.join(args.audio_home_path, '*/' * args.level, '*' + args.ext)) diff --git a/tools/data/build_file_list.py b/tools/data/build_file_list.py index 0ba15e75d0..11a1322854 100644 --- a/tools/data/build_file_list.py +++ b/tools/data/build_file_list.py @@ -5,7 +5,7 @@ import os.path as osp import random -from mmcv.runner import set_random_seed +from mmengine.runner import set_random_seed from tools.data.anno_txt2json import lines2dictlist from tools.data.parse_file_list import (parse_directory, parse_diving48_splits, diff --git a/tools/data/extract_audio.py b/tools/data/extract_audio.py index 6f56de2691..78d95d8ea1 100644 --- a/tools/data/extract_audio.py +++ b/tools/data/extract_audio.py @@ -5,7 +5,7 @@ import os.path as osp from multiprocessing import Pool -import mmcv +import mmengine def extract_audio_wav(line): @@ -47,7 +47,7 @@ def parse_args(): if __name__ == '__main__': args = parse_args() - mmcv.mkdir_or_exist(args.dst_root) + mmengine.mkdir_or_exist(args.dst_root) print('Reading videos from folder: ', args.root) print('Extension of videos: ', args.ext) From 09cd0a72e6b89bae5590b1a4f1cfa50b9c9c38bd Mon Sep 17 00:00:00 2001 From: wxDai Date: Mon, 20 Feb 2023 15:16:40 +0800 Subject: [PATCH 04/36] update skeleton data readme (#2222) --- tools/data/skeleton/README.md | 60 +++++++++------------ tools/data/skeleton/download_annotations.sh | 22 -------- 2 files changed, 25 insertions(+), 57 deletions(-) delete mode 100644 tools/data/skeleton/download_annotations.sh diff --git a/tools/data/skeleton/README.md b/tools/data/skeleton/README.md index 25c7f62892..3ada42e8ef 100644 --- a/tools/data/skeleton/README.md +++ b/tools/data/skeleton/README.md @@ -15,48 +15,38 @@ ## Introduction -We release the skeleton annotations used in [Revisiting Skeleton-based Action Recognition](https://arxiv.org/abs/2104.13586). 
By default, we use [Faster-RCNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py) with ResNet50 backbone for human detection and [HRNet-w32](https://github.com/open-mmlab/mmpose/blob/master/configs/top_down/hrnet/coco/hrnet_w32_coco_256x192.py) for single person pose estimation. For FineGYM, we use Ground-Truth bounding boxes for the athlete instead of detection bounding boxes. Currently, we release the skeleton annotations for FineGYM and NTURGB-D Xsub split. Other annotations will be soo released. +We release the skeleton annotations used in [Revisiting Skeleton-based Action Recognition](https://arxiv.org/abs/2104.13586). By default, we use [Faster-RCNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py) with ResNet50 backbone for human detection and [HRNet-w32](https://github.com/open-mmlab/mmpose/blob/master/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrnet_w32_coco_256x192.py) for single person pose estimation. For FineGYM, we use Ground-Truth bounding boxes for the athlete instead of detection bounding boxes. ## Prepare Annotations -Currently, we support HMDB51, UCF101, FineGYM and NTURGB+D. For FineGYM, you can execute following scripts to prepare the annotations. +We provide links to the pre-processed skeleton annotations, you can directly download them and use them for training & testing. -```shell -bash download_annotations.sh ${DATASET} -``` - -Due to [Conditions of Use](http://rose1.ntu.edu.sg/Datasets/actionRecognition.asp) of the NTURGB+D dataset, we can not directly release the annotations used in our experiments. So that we provide a script to generate pose annotations for videos in NTURGB+D datasets, which generate a dictionary and save it as a single pickle file. You can create a list which contain all annotation dictionaries of corresponding videos and save them as a pickle file. Then you can get the `ntu60_xsub_train.pkl`, `ntu60_xsub_val.pkl`, `ntu120_xsub_train.pkl`, `ntu120_xsub_val.pkl` that we used in training. - -For those who have not enough computations for pose extraction, we provide the outputs of the above pipeline here, corresponding to 4 different splits of NTURGB+D datasets: - -- ntu60_xsub_train: https://download.openmmlab.com/mmaction/posec3d/ntu60_xsub_train.pkl -- ntu60_xsub_val: https://download.openmmlab.com/mmaction/posec3d/ntu60_xsub_val.pkl -- ntu120_xsub_train: https://download.openmmlab.com/mmaction/posec3d/ntu120_xsub_train.pkl -- ntu120_xsub_val: https://download.openmmlab.com/mmaction/posec3d/ntu120_xsub_val.pkl -- hmdb51: https://download.openmmlab.com/mmaction/posec3d/hmdb51.pkl -- ucf101: https://download.openmmlab.com/mmaction/posec3d/ucf101.pkl - -To generate 2D pose annotations for a single video, first, you need to install mmdetection and mmpose from src code. After that, you need to replace the placeholder `mmdet_root` and `mmpose_root` in `ntu_pose_extraction.py` with your installation path. Then you can use following scripts for NTURGB+D video pose extraction: - -```python -python ntu_pose_extraction.py S001C001P001R001A001_rgb.avi S001C001P001R001A001.pkl -``` - -After you get pose annotations for all videos in a dataset split, like `ntu60_xsub_val`. You can gather them into a single list and save the list as `ntu60_xsub_val.pkl`. You can use those larger pickle files for training and testing. 
+- NTURGB+D \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_2d.pkl +- NTURGB+D \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_3d.pkl +- NTURGB+D 120 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_2d.pkl +- NTURGB+D 120 \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_3d.pkl +- GYM \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/gym_2d.pkl + - GYM 2D skeletons are extracted with ground-truth human bounding boxes, which can be downloaded with link: https://download.openmmlab.com/mmaction/pyskl/data/gym/gym_gt_bboxes.pkl. Please cite [PoseConv3D](https://arxiv.org/abs/2104.13586) if you use it in your project. +- UCF101 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ucf101_2d.pkl +- HMDB51 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/hmdb51_2d.pkl +- Diving48 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/diving48_2d.pkl +- Kinetics400 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/k400_2d.pkl (Table of contents only, no skeleton annotations) -## The Format of PoseC3D Annotations +For Kinetics400, since the skeleton annotations are large, we do not provide the direct download links on aliyun. Please use the following link to download the `kpfiles` and extract it under `$MMACTION2/data/k400` for Kinetics-400 training & testing: https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EeyDCVskqLtClMVVwqD53acBF2FEwkctp3vtRbkLfnKSTw?e=B3SZlM -Here we briefly introduce the format of PoseC3D Annotations, we will take `gym_train.pkl` as an example: the content of `gym_train.pkl` is a list of length 20484, each item is a dictionary that is the skeleton annotation of one video. Each dictionary has following fields: +## The Format of Annotations -- keypoint: The keypoint coordinates, which is a numpy array of the shape N (#person) x T (temporal length) x K (#keypoints, 17 in our case) x 2 (x, y coordinate). -- keypoint_score: The keypoint confidence scores, which is a numpy array of the shape N (#person) x T (temporal length) x K (#keypoints, 17 in our case). -- frame_dir: The corresponding video name. -- label: The action category. -- img_shape: The image shape of each frame. -- original_shape: Same as above. -- total_frames: The temporal length of the video. +Each pickle file corresponds to an action recognition dataset. The content of a pickle file is a dictionary with two fields: `split` and `annotations` -For training with your custom dataset, you can refer to [Custom Dataset Training](https://github.com/open-mmlab/mmaction2/blob/master/configs/skeleton/posec3d/custom_dataset_training.md). +1. Split: The value of the `split` field is a dictionary: the keys are the split names, while the values are lists of video identifiers that belong to the specific clip. +2. Annotations: The value of the `annotations` field is a list of skeleton annotations, each skeleton annotation is a dictionary, containing the following fields: + 1. `frame_dir` (str): The identifier of the corresponding video. + 2. `total_frames` (int): The number of frames in this video. + 3. `img_shape` (tuple\[int\]): The shape of a video frame, a tuple with two elements, in the format of (height, width). Only required for 2D skeletons. + 4. `original_shape` (tuple\[int\]): Same as `img_shape`. + 5. `label` (int): The action label. + 6. 
`keypoint` (np.ndarray, with shape \[M x T x V x C\]): The keypoint annotation. M: number of persons; T: number of frames (same as `total_frames`); V: number of keypoints (25 for NTURGB+D 3D skeleton, 17 for CoCo, 18 for OpenPose, etc. ); C: number of dimensions for keypoint coordinates (C=2 for 2D keypoint, C=3 for 3D keypoint). + 7. `keypoint_score` (np.ndarray, with shape \[M x T x V\]): The confidence score of keypoints. Only required for 2D skeletons. ## Visualization @@ -128,4 +118,4 @@ We provide scripts to convert skeleton annotations from third-party projects to - [x] NTU120_XSet - [x] UCF101 - [x] HMDB51 -- [ ] Kinetics +- [x] Kinetics diff --git a/tools/data/skeleton/download_annotations.sh b/tools/data/skeleton/download_annotations.sh deleted file mode 100644 index d57efbceac..0000000000 --- a/tools/data/skeleton/download_annotations.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -DATASET=$1 -if [ "$DATASET" == "gym" ]; then - echo "We are processing $DATASET" -else - echo "Bad Argument, we only support gym now." - exit 0 -fi - -DATA_DIR="../../../data/posec3d/" - -if [[ ! -d "${DATA_DIR}" ]]; then - echo "${DATA_DIR} does not exist. Creating"; - mkdir -p ${DATA_DIR} -fi - -wget https://download.openmmlab.com/mmaction/posec3d/${DATASET}_train.pkl -wget https://download.openmmlab.com/mmaction/posec3d/${DATASET}_val.pkl - -mv ${DATASET}_train.pkl ${DATA_DIR} -mv ${DATASET}_val.pkl ${DATA_DIR} From 14561295381ac9ad4350724ec78dcbfefe7f7f7d Mon Sep 17 00:00:00 2001 From: wxDai Date: Mon, 20 Feb 2023 15:51:58 +0800 Subject: [PATCH 05/36] fix aliases (#2241) --- demo/demo_spatiotemporal_det.py | 2 +- demo/demo_spatiotemporal_det_onnx.py | 2 +- mmaction/datasets/transforms/pose_transforms.py | 2 +- mmaction/evaluation/functional/accuracy.py | 4 ++-- mmaction/evaluation/functional/ava_evaluation/metrics.py | 4 ++-- mmaction/evaluation/functional/eval_detection.py | 4 ++-- tests/evaluation/metrics/test_metric_utils.py | 2 +- tools/data/activitynet/process_annotations.py | 6 +++--- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/demo/demo_spatiotemporal_det.py b/demo/demo_spatiotemporal_det.py index 009a9475a6..5ec42e7856 100644 --- a/demo/demo_spatiotemporal_det.py +++ b/demo/demo_spatiotemporal_det.py @@ -378,7 +378,7 @@ def dense_timestamps(timestamps, n): start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 new_frame_inds = np.arange( len(timestamps) * n) * old_frame_interval / n + start - return new_frame_inds.astype(np.int) + return new_frame_inds.astype(np.int64) dense_n = int(args.predict_stepsize / args.output_stepsize) frames = [ diff --git a/demo/demo_spatiotemporal_det_onnx.py b/demo/demo_spatiotemporal_det_onnx.py index d1ee9f0edc..6e5394e173 100644 --- a/demo/demo_spatiotemporal_det_onnx.py +++ b/demo/demo_spatiotemporal_det_onnx.py @@ -361,7 +361,7 @@ def dense_timestamps(timestamps, n): start = timestamps[0] - old_frame_interval / n * (n - 1) / 2 new_frame_inds = np.arange( len(timestamps) * n) * old_frame_interval / n + start - return new_frame_inds.astype(np.int) + return new_frame_inds.astype(np.int64) dense_n = int(args.predict_stepsize / args.output_stepsize) frames = [ diff --git a/mmaction/datasets/transforms/pose_transforms.py b/mmaction/datasets/transforms/pose_transforms.py index 1740a18575..cff9f90112 100644 --- a/mmaction/datasets/transforms/pose_transforms.py +++ b/mmaction/datasets/transforms/pose_transforms.py @@ -1157,7 +1157,7 @@ def transform(self, results: Dict) -> Dict: transitional[i] = transitional[i - 1] = True if 
num_persons[i] != num_persons[i + 1]: transitional[i] = transitional[i + 1] = True - inds_int = inds.astype(np.int) + inds_int = inds.astype(np.int64) coeff = np.array([transitional[i] for i in inds_int]) inds = (coeff * inds_int + (1 - coeff) * inds).astype(np.float32) diff --git a/mmaction/evaluation/functional/accuracy.py b/mmaction/evaluation/functional/accuracy.py index 4b7f6dd52a..aa28bd486b 100644 --- a/mmaction/evaluation/functional/accuracy.py +++ b/mmaction/evaluation/functional/accuracy.py @@ -166,7 +166,7 @@ def mmit_mean_average_precision(scores, labels): sample. Returns: - np.float: The MMIT style mean average precision. + np.float64: The MMIT style mean average precision. """ results = [] for score, label in zip(scores, labels): @@ -186,7 +186,7 @@ def mean_average_precision(scores, labels): sample. Returns: - np.float: The mean average precision. + np.float64: The mean average precision. """ results = [] scores = np.stack(scores).T diff --git a/mmaction/evaluation/functional/ava_evaluation/metrics.py b/mmaction/evaluation/functional/ava_evaluation/metrics.py index 4d566accb5..ffbe589454 100644 --- a/mmaction/evaluation/functional/ava_evaluation/metrics.py +++ b/mmaction/evaluation/functional/ava_evaluation/metrics.py @@ -35,7 +35,7 @@ def compute_precision_recall(scores, labels, num_gt): instances. This value is None if no ground truth labels are present. """ - if (not isinstance(labels, np.ndarray) or labels.dtype != np.bool + if (not isinstance(labels, np.ndarray) or labels.dtype != bool or len(labels.shape) != 1): raise ValueError('labels must be single dimension bool numpy array') @@ -90,7 +90,7 @@ def compute_average_precision(precision, recall): if not isinstance(precision, np.ndarray) or not isinstance( recall, np.ndarray): raise ValueError('precision and recall must be numpy array') - if precision.dtype != np.float or recall.dtype != np.float: + if precision.dtype != np.float64 or recall.dtype != np.float64: raise ValueError('input must be float numpy array.') if len(precision) != len(recall): raise ValueError('precision and recall must be of the same size.') diff --git a/mmaction/evaluation/functional/eval_detection.py b/mmaction/evaluation/functional/eval_detection.py index 2af3ada0db..b081d52b9b 100644 --- a/mmaction/evaluation/functional/eval_detection.py +++ b/mmaction/evaluation/functional/eval_detection.py @@ -220,8 +220,8 @@ def compute_average_precision_detection(ground_truth, if fp[t_idx, idx] == 0 and tp[t_idx, idx] == 0: fp[t_idx, idx] = 1 - tp_cumsum = np.cumsum(tp, axis=1).astype(np.float) - fp_cumsum = np.cumsum(fp, axis=1).astype(np.float) + tp_cumsum = np.cumsum(tp, axis=1).astype(np.float64) + fp_cumsum = np.cumsum(fp, axis=1).astype(np.float64) recall_cumsum = tp_cumsum / num_positive precision_cumsum = tp_cumsum / (tp_cumsum + fp_cumsum) diff --git a/tests/evaluation/metrics/test_metric_utils.py b/tests/evaluation/metrics/test_metric_utils.py index 091a728bc4..5eeb12e199 100644 --- a/tests/evaluation/metrics/test_metric_utils.py +++ b/tests/evaluation/metrics/test_metric_utils.py @@ -151,7 +151,7 @@ def gt_confusion_matrix(gt_labels, pred_labels, normalize=None): confusion_mat = np.delete(confusion_mat, del_index, axis=1) if normalize is not None: - confusion_mat = np.array(confusion_mat, dtype=np.float) + confusion_mat = np.array(confusion_mat, dtype=np.float64) m, n = confusion_mat.shape if normalize == 'true': for i in range(m): diff --git a/tools/data/activitynet/process_annotations.py b/tools/data/activitynet/process_annotations.py index 
09ed5b5c8f..9374281a64 100644 --- a/tools/data/activitynet/process_annotations.py +++ b/tools/data/activitynet/process_annotations.py @@ -18,7 +18,7 @@ def load_json(file): anno_database = load_json(ann_file) -video_record = np.loadtxt(info_file, dtype=np.str, delimiter=',', skiprows=1) +video_record = np.loadtxt(info_file, dtype=str, delimiter=',', skiprows=1) video_dict_train = {} video_dict_val = {} @@ -29,8 +29,8 @@ def load_json(file): video_name = video_item[0] video_info = anno_database[video_name] video_subset = video_item[5] - video_info['fps'] = video_item[3].astype(np.float) - video_info['rfps'] = video_item[4].astype(np.float) + video_info['fps'] = video_item[3].astype(np.float64) + video_info['rfps'] = video_item[4].astype(np.float64) video_dict_full[video_name] = video_info if video_subset == 'training': video_dict_train[video_name] = video_info From c99ad659aeb8a860823d2536dfbe0ad18943dae5 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Mon, 20 Feb 2023 19:28:53 +0800 Subject: [PATCH 06/36] [Enhance] use tmpfile to avoid remaining tmp directory when demo break (#2236) --- demo/README.md | 1 + demo/demo_skeleton.py | 11 ++++---- demo/demo_spatiotemporal_det.py | 38 +++++----------------------- demo/demo_spatiotemporal_det_onnx.py | 38 +++++----------------------- mmaction/utils/misc.py | 24 +++++++++++------- tests/utils/test_misc.py | 22 ++++++++++++++++ 6 files changed, 56 insertions(+), 78 deletions(-) create mode 100644 tests/utils/test_misc.py diff --git a/demo/README.md b/demo/README.md index 3252da35be..447789d37d 100644 --- a/demo/README.md +++ b/demo/README.md @@ -8,6 +8,7 @@ - [Webcam demo](#webcam-demo): A demo script to implement real-time action recognition from a web camera. - [Skeleton-based Action Recognition Demo](#skeleton-based-action-recognition-demo): A demo script to predict the skeleton-based action recognition result using a single video. - [SpatioTemporal Action Detection Video Demo](#spatiotemporal-action-detection-video-demo): A demo script to predict the spatiotemporal action detection result using a single video. +- [SpatioTemporal Action Detection ONNX Video Demo](#spatiotemporal-action-detection-onnx-video-demo): A demo script to predict the SpatioTemporal Action Detection result using the onnx file instead of building the PyTorch models. - [Inferencer Demo](#inferencer): A demo script to implement fast predict for video analysis tasks based on unified inferencer interface. ## Modify configs through script arguments diff --git a/demo/demo_skeleton.py b/demo/demo_skeleton.py index 57c84c90a3..3dc1fb215a 100644 --- a/demo/demo_skeleton.py +++ b/demo/demo_skeleton.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import argparse -import os.path as osp -import shutil +import tempfile import cv2 import mmcv @@ -128,7 +127,10 @@ def visualize(args, frames, data_samples, action_label): def main(): args = parse_args() - frame_paths, frames = frame_extract(args.video, args.short_side) + + tmp_dir = tempfile.TemporaryDirectory() + frame_paths, frames = frame_extract(args.video, args.short_side, + tmp_dir.name) num_frame = len(frame_paths) h, w, _ = frames[0].shape @@ -180,8 +182,7 @@ def main(): visualize(args, frames, pose_data_samples, action_label) - tmp_frame_dir = osp.dirname(frame_paths[0]) - shutil.rmtree(tmp_frame_dir) + tmp_dir.cleanup() if __name__ == '__main__': diff --git a/demo/demo_spatiotemporal_det.py b/demo/demo_spatiotemporal_det.py index 5ec42e7856..0c5091dab2 100644 --- a/demo/demo_spatiotemporal_det.py +++ b/demo/demo_spatiotemporal_det.py @@ -1,9 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse import copy as cp -import os -import os.path as osp -import shutil +import tempfile import cv2 import mmcv @@ -17,6 +15,7 @@ from mmaction.apis import detection_inference from mmaction.registry import MODELS from mmaction.structures import ActionDataSample +from mmaction.utils import frame_extract try: import moviepy.editor as mpy @@ -101,32 +100,6 @@ def visualize(frames, annotations, plate=plate_blue, max_num=5): return frames_out -def frame_extraction(video_path): - """Extract frames given video_path. - - Args: - video_path (str): The video_path. - """ - # Load the video, extract frames into ./tmp/video_name - target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0])) - os.makedirs(target_dir, exist_ok=True) - # Should be able to handle videos up to several hours - frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg') - vid = cv2.VideoCapture(video_path) - frames = [] - frame_paths = [] - flag, frame = vid.read() - cnt = 0 - while flag: - frames.append(frame) - frame_path = frame_tmpl.format(cnt + 1) - frame_paths.append(frame_path) - cv2.imwrite(frame_path, frame) - cnt += 1 - flag, frame = vid.read() - return frame_paths, frames - - def load_label_map(file_path): """Load Label Map. @@ -259,7 +232,9 @@ def parse_args(): def main(): args = parse_args() - frame_paths, original_frames = frame_extraction(args.video) + tmp_dir = tempfile.TemporaryDirectory() + frame_paths, original_frames = frame_extract( + args.video, out_dir=tmp_dir.name) num_frame = len(frame_paths) h, w, _ = original_frames[0].shape @@ -391,8 +366,7 @@ def dense_timestamps(timestamps, n): fps=args.output_fps) vid.write_videofile(args.out_filename) - tmp_frame_dir = osp.dirname(frame_paths[0]) - shutil.rmtree(tmp_frame_dir) + tmp_dir.cleanup() if __name__ == '__main__': diff --git a/demo/demo_spatiotemporal_det_onnx.py b/demo/demo_spatiotemporal_det_onnx.py index 6e5394e173..7c40e9c64e 100644 --- a/demo/demo_spatiotemporal_det_onnx.py +++ b/demo/demo_spatiotemporal_det_onnx.py @@ -1,9 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse import copy as cp -import os -import os.path as osp -import shutil +import tempfile import cv2 import mmcv @@ -15,6 +13,7 @@ from mmengine import DictAction from mmaction.apis import detection_inference +from mmaction.utils import frame_extract try: import moviepy.editor as mpy @@ -99,32 +98,6 @@ def visualize(frames, annotations, plate=plate_blue, max_num=5): return frames_out -def frame_extraction(video_path): - """Extract frames given video_path. - - Args: - video_path (str): The video_path. 
- """ - # Load the video, extract frames into ./tmp/video_name - target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0])) - os.makedirs(target_dir, exist_ok=True) - # Should be able to handle videos up to several hours - frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg') - vid = cv2.VideoCapture(video_path) - frames = [] - frame_paths = [] - flag, frame = vid.read() - cnt = 0 - while flag: - frames.append(frame) - frame_path = frame_tmpl.format(cnt + 1) - frame_paths.append(frame_path) - cv2.imwrite(frame_path, frame) - cnt += 1 - flag, frame = vid.read() - return frame_paths, frames - - def load_label_map(file_path): """Load Label Map. @@ -253,7 +226,9 @@ def parse_args(): def main(): args = parse_args() - frame_paths, original_frames = frame_extraction(args.video) + tmp_dir = tempfile.TemporaryDirectory() + frame_paths, original_frames = frame_extract( + args.video, out_dir=tmp_dir.name) num_frame = len(frame_paths) h, w, _ = original_frames[0].shape @@ -374,8 +349,7 @@ def dense_timestamps(timestamps, n): fps=args.output_fps) vid.write_videofile(args.out_filename) - tmp_frame_dir = osp.dirname(frame_paths[0]) - shutil.rmtree(tmp_frame_dir) + tmp_dir.cleanup() if __name__ == '__main__': diff --git a/mmaction/utils/misc.py b/mmaction/utils/misc.py index 3c34df3f68..f14b8a51c2 100644 --- a/mmaction/utils/misc.py +++ b/mmaction/utils/misc.py @@ -4,6 +4,7 @@ import os.path as osp import random import string +from typing import Optional import cv2 import mmcv @@ -33,18 +34,23 @@ def get_shm_dir() -> str: return '/dev/shm' -def frame_extract(video_path: str, short_side: int): +def frame_extract(video_path: str, + short_side: Optional[int] = None, + out_dir: str = './tmp'): """Extract frames given video_path. Args: video_path (str): The video path. - short_side (int): The short-side of the image. + short_side (int): Target short-side of the output image. + Defaults to None, means keep original shape. + out_dir (str): The output directory. Defaults to ``'./tmp'``. """ - # Load the video, extract frames into ./tmp/video_name - target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0])) + # Load the video, extract frames into OUT_DIR/video_name + target_dir = osp.join(out_dir, osp.basename(osp.splitext(video_path)[0])) os.makedirs(target_dir, exist_ok=True) # Should be able to handle videos up to several hours frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg') + assert osp.exists(video_path), f'file not exit {video_path}' vid = cv2.VideoCapture(video_path) frames = [] frame_paths = [] @@ -52,11 +58,11 @@ def frame_extract(video_path: str, short_side: int): cnt = 0 new_h, new_w = None, None while flag: - if new_h is None: - h, w, _ = frame.shape - new_w, new_h = mmcv.rescale_size((w, h), (short_side, np.Inf)) - - frame = mmcv.imresize(frame, (new_w, new_h)) + if short_side is not None: + if new_h is None: + h, w, _ = frame.shape + new_w, new_h = mmcv.rescale_size((w, h), (short_side, np.Inf)) + frame = mmcv.imresize(frame, (new_w, new_h)) frames.append(frame) frame_path = frame_tmpl.format(cnt + 1) diff --git a/tests/utils/test_misc.py b/tests/utils/test_misc.py new file mode 100644 index 0000000000..e0886162a6 --- /dev/null +++ b/tests/utils/test_misc.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os +import os.path as osp +from tempfile import TemporaryDirectory + +from mmaction.utils import frame_extract + + +def test_frame_extract(): + data_prefix = osp.normpath(osp.join(osp.dirname(__file__), '../data')) + video_path = osp.join(data_prefix, 'test.mp4') + with TemporaryDirectory() as tmp_dir: + # assign short_side + frame_paths, frames = frame_extract( + video_path, short_side=100, out_dir=tmp_dir) + assert osp.exists(tmp_dir) and \ + len(os.listdir(f'{tmp_dir}/test')) == len(frame_paths) + assert min(frames[0].shape[:2]) == 100 + # default short_side + frame_paths, frames = frame_extract(video_path, out_dir=tmp_dir) + assert osp.exists(tmp_dir) and \ + len(os.listdir(f'{tmp_dir}/test')) == len(frame_paths) From 3807cb3db59eaa063bef76243913a3ae291720c0 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Fri, 24 Feb 2023 17:42:42 +0800 Subject: [PATCH 07/36] [fix] rename fps in DecordInit to avoid overwriting fps in SampleAVAFrame (#2251) --- mmaction/datasets/transforms/loading.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mmaction/datasets/transforms/loading.py b/mmaction/datasets/transforms/loading.py index 8305a490b8..9e66cd7f3f 100644 --- a/mmaction/datasets/transforms/loading.py +++ b/mmaction/datasets/transforms/loading.py @@ -356,7 +356,7 @@ def transform(self, results: dict) -> dict: total_frames = results['total_frames'] # if can't get fps, same value of `fps` and `target_fps` # will perform nothing - fps = results.get('fps') + fps = results.get('avg_fps') if self.target_fps is None or not fps: fps_scale_ratio = 1.0 else: @@ -1111,7 +1111,7 @@ def transform(self, results): file_obj = io.BytesIO(self.file_client.get(results['filename'])) container = decord.VideoReader(file_obj, num_threads=self.num_threads) - results['fps'] = container.get_avg_fps() + results['avg_fps'] = container.get_avg_fps() results['video_reader'] = container results['total_frames'] = len(container) return results From 951dfc0ff8f4e30fe35e4e093690bf272fbae1e1 Mon Sep 17 00:00:00 2001 From: wxDai Date: Wed, 1 Mar 2023 16:10:46 +0800 Subject: [PATCH 08/36] [Feature] Support RGBPoseConv3D (#2182) --- configs/skeleton/posec3d/README.md | 29 +- configs/skeleton/posec3d/metafile.yml | 26 +- .../skeleton/posec3d/rgbpose_conv3d/README.md | 107 ++++ .../rgbpose_conv3d/merge_pretrain.ipynb | 267 +++++++++ .../posec3d/rgbpose_conv3d/pose_only.py | 127 +++++ .../posec3d/rgbpose_conv3d/rgb_only.py | 126 ++++ .../posec3d/rgbpose_conv3d/rgbpose_conv3d.py | 190 +++++++ ...0_8xb16-u48-120e_hmdb51-split1-keypoint.py | 8 +- ...0_8xb16-u48-120e_ucf101-split1-keypoint.py | 8 +- ...lowonly_r50_8xb16-u48-240e_gym-keypoint.py | 33 +- .../slowonly_r50_8xb16-u48-240e_gym-limb.py | 39 +- ..._r50_8xb16-u48-240e_ntu60-xsub-keypoint.py | 36 +- ...only_r50_8xb16-u48-240e_ntu60-xsub-limb.py | 42 +- mmaction/datasets/pose_dataset.py | 52 +- mmaction/datasets/transforms/__init__.py | 6 +- mmaction/datasets/transforms/formatting.py | 121 +++- mmaction/datasets/transforms/loading.py | 107 ++-- .../datasets/transforms/pose_transforms.py | 538 +++++++++++++----- mmaction/evaluation/metrics/acc_metric.py | 71 ++- mmaction/models/backbones/__init__.py | 4 +- mmaction/models/backbones/resnet3d.py | 204 +++---- .../models/backbones/resnet3d_slowfast.py | 384 ++++++------- .../models/backbones/resnet3d_slowonly.py | 6 - mmaction/models/backbones/rgbposeconv3d.py | 205 +++++++ .../models/data_preprocessors/__init__.py | 3 +- .../data_preprocessors/data_preprocessor.py | 41 +- 
.../multimodal_data_preprocessor.py | 42 ++ mmaction/models/heads/__init__.py | 3 +- mmaction/models/heads/base.py | 87 +-- mmaction/models/heads/rgbpose_head.py | 240 ++++++++ mmaction/models/recognizers/__init__.py | 3 +- .../models/recognizers/recognizer3d_mm.py | 50 ++ mmaction/models/utils/blending_utils.py | 45 +- tests/datasets/transforms/test_formating.py | 24 +- tests/datasets/transforms/test_loading.py | 16 +- .../transforms/test_pose_transforms.py | 218 +++++-- .../backbones/test_resnet3d_slowfast.py | 29 +- .../backbones/test_resnet3d_slowonly.py | 4 +- tests/models/backbones/test_rgbposeconv3d.py | 27 + tests/models/data_preprocessors/__init__.py | 1 + .../test_data_preprocessor.py | 97 ++++ .../test_multimodal_data_preprocessor.py | 94 +++ tests/models/heads/test_rgbpose_head.py | 41 ++ tools/data/skeleton/compress_nturgbd.py | 42 ++ 44 files changed, 3039 insertions(+), 804 deletions(-) create mode 100644 configs/skeleton/posec3d/rgbpose_conv3d/README.md create mode 100644 configs/skeleton/posec3d/rgbpose_conv3d/merge_pretrain.ipynb create mode 100644 configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py create mode 100644 configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py create mode 100644 configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py create mode 100644 mmaction/models/backbones/rgbposeconv3d.py create mode 100644 mmaction/models/data_preprocessors/multimodal_data_preprocessor.py create mode 100644 mmaction/models/heads/rgbpose_head.py create mode 100644 mmaction/models/recognizers/recognizer3d_mm.py create mode 100644 tests/models/backbones/test_rgbposeconv3d.py create mode 100644 tests/models/data_preprocessors/__init__.py create mode 100644 tests/models/data_preprocessors/test_data_preprocessor.py create mode 100644 tests/models/data_preprocessors/test_multimodal_data_preprocessor.py create mode 100644 tests/models/heads/test_rgbpose_head.py create mode 100644 tools/data/skeleton/compress_nturgbd.py diff --git a/configs/skeleton/posec3d/README.md b/configs/skeleton/posec3d/README.md index 2fe5f579f0..0e45528345 100644 --- a/configs/skeleton/posec3d/README.md +++ b/configs/skeleton/posec3d/README.md @@ -54,29 +54,30 @@ Human skeleton, as a compact representation of human action, has received increa ### FineGYM -| frame sampling strategy | pseudo heatmap | gpus | backbone | Mean Top-1 | testing protocol | config | ckpt | log | -| :---------------------: | :------------: | :--: | :----------: | :--------: | :---------------: | :---------------------------------------: | :--------------------------------------: | :-------------------------------------: | -| uniform 48 | keypoint | 8 | SlowOnly-R50 | 93.4 | 10 clips x 1 crop | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint_20220815-da338c58.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint.log) | -| uniform 48 | limb | 8 | SlowOnly-R50 | 93.7 | 10 clips x 1 crop | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb_20220815-2e6e3c5c.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb.log) | +| frame sampling strategy | pseudo heatmap | gpus | backbone | Mean Top-1 | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :------------: | :--: | :----------: | :--------: | :--------------: | :---: | :----: | :------------------------------------: | :----------------------------------: | :----------------------------------: | +| uniform 48 | keypoint | 8 | SlowOnly-R50 | 93.5 | 10 clips | 20.6G | 2.0M | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint_20220815-da338c58.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint.log) | +| uniform 48 | limb | 8 | SlowOnly-R50 | 93.6 | 10 clips | 20.6G | 2.0M | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb_20220815-2e6e3c5c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb.log) | ### NTU60_XSub -| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | -| :---------------------: | :------------: | :--: | :----------: | :------: | :---------------: | :----------------------------------------: | :--------------------------------------: | :--------------------------------------: | -| uniform 48 | keypoint | 8 | SlowOnly-R50 | 93.6 | 10 clips x 1 crop | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint_20220815-38db104b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.log) | -| uniform 48 | limb | 8 | SlowOnly-R50 | 93.5 | 10 clips x 1 crop | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb_20220815-af2f119a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.log) | +| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :------------: | :--: | :----------: | :------: | :--------------: | :---: | :----: | :-------------------------------------: | :-----------------------------------: | :----------------------------------: | +| uniform 48 | keypoint | 8 | SlowOnly-R50 | 93.6 | 10 clips | 20.6G | 2.0M | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint_20220815-38db104b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.log) | +| uniform 48 | limb | 8 | SlowOnly-R50 | 93.5 | 10 clips | 20.6G | 2.0M | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb_20220815-af2f119a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.log) | +| | Fusion | | | 94.0 | | | | | | | ### UCF101 -| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | -| :---------------------: | :------------: | :--: | :----------: | :------: | :---------------: | :----------------------------------------: | :--------------------------------------: | :--------------------------------------: | -| uniform 48 | keypoint | 8 | SlowOnly-R50 | 86.9 | 10 clips x 1 crop | [config](/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint_20220815-9972260d.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.log) | +| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :------------: | :--: | :----------: | :------: | :--------------: | :---: | :----: | :-------------------------------------: | :-----------------------------------: | :----------------------------------: | +| uniform 48 | keypoint | 8 | SlowOnly-R50 | 86.8 | 10 clips | 14.6G | 3.1M | [config](/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint_20220815-9972260d.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.log) | ### HMDB51 -| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | -| :---------------------: | :------------: | :--: | :----------: | :------: | :---------------: | :----------------------------------------: | :--------------------------------------: | :--------------------------------------: | -| uniform 48 | keypoint | 8 | SlowOnly-R50 | 69.2 | 10 clips x 1 crop | [config](/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint_20220815-17eaa484.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.log) | +| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :------------: | :--: | :----------: | :------: | :--------------: | :---: | :----: | :-------------------------------------: | :-----------------------------------: | :----------------------------------: | +| uniform 48 | keypoint | 8 | SlowOnly-R50 | 69.6 | 10 clips | 14.6G | 3.0M | [config](/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint_20220815-17eaa484.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.log) | 1. The **gpus** indicates the number of gpu we used to get the checkpoint. It is noteworthy that the configs we provide are used for 8 gpus as default. According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU, diff --git a/configs/skeleton/posec3d/metafile.yml b/configs/skeleton/posec3d/metafile.yml index 7a3d3b9b20..b949a23d47 100644 --- a/configs/skeleton/posec3d/metafile.yml +++ b/configs/skeleton/posec3d/metafile.yml @@ -13,7 +13,8 @@ Models: Architecture: SlowOnly-R50 Batch Size: 16 Epochs: 240 - Parameters: 2044867 + FLOPs: 20.6G + Parameters: 2.0M Training Data: FineGYM Training Resources: 8 GPUs pseudo heatmap: keypoint @@ -21,7 +22,7 @@ Models: - Dataset: FineGYM Task: Skeleton-based Action Recognition Metrics: - mean Top 1 Accuracy: 93.4 + mean Top 1 Accuracy: 93.5 Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint.log Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint_20220815-da338c58.pth @@ -32,7 +33,8 @@ Models: Architecture: SlowOnly-R50 Batch Size: 16 Epochs: 240 - Parameters: 2044867 + FLOPs: 20.6G + Parameters: 2.0M Training Data: FineGYM Training Resources: 8 GPUs pseudo heatmap: limb @@ -40,7 +42,7 @@ Models: - Dataset: FineGYM Task: Skeleton-based Action Recognition Metrics: - mean Top 1 Accuracy: 93.7 + mean Top 1 Accuracy: 93.6 Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb.log Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb_20220815-2e6e3c5c.pth @@ -51,7 +53,8 @@ Models: Architecture: SlowOnly-R50 Batch Size: 16 Epochs: 240 - Parameters: 2024860 + FLOPs: 20.6G + Parameters: 2.0M 
Training Data: NTU60-XSub Training Resources: 8 GPUs pseudo heatmap: keypoint @@ -70,7 +73,8 @@ Models: Architecture: SlowOnly-R50 Batch Size: 16 Epochs: 240 - Parameters: 2024860 + FLOPs: 20.6G + Parameters: 2.0M Training Data: NTU60-XSub Training Resources: 8 GPUs pseudo heatmap: limb @@ -89,7 +93,8 @@ Models: Architecture: SlowOnly-R50 Batch Size: 16 Epochs: 120 - Parameters: 3029984 + FLOPs: 14.6G + Parameters: 3.0M Training Data: HMDB51 Training Resources: 8 GPUs pseudo heatmap: keypoint @@ -97,7 +102,7 @@ Models: - Dataset: HMDB51 Task: Skeleton-based Action Recognition Metrics: - Top 1 Accuracy: 69.2 + Top 1 Accuracy: 69.6 Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.log Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint_20220815-17eaa484.pth @@ -108,7 +113,8 @@ Models: Architecture: SlowOnly-R50 Batch Size: 16 Epochs: 120 - Parameters: 3055584 + FLOPs: 14.6G + Parameters: 3.1M Training Data: UCF101 Training Resources: 8 GPUs pseudo heatmap: keypoint @@ -116,6 +122,6 @@ Models: - Dataset: UCF101 Task: Skeleton-based Action Recognition Metrics: - Top 1 Accuracy: 86.9 + Top 1 Accuracy: 86.8 Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.log Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint_20220815-9972260d.pth diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/README.md b/configs/skeleton/posec3d/rgbpose_conv3d/README.md new file mode 100644 index 0000000000..37b4cd489d --- /dev/null +++ b/configs/skeleton/posec3d/rgbpose_conv3d/README.md @@ -0,0 +1,107 @@ +# RGBPoseConv3D + +## Introduction + +RGBPoseConv3D is a framework that jointly use 2D human skeletons and RGB appearance for human action recognition. It is a 3D CNN with two streams, with the architecture borrowed from SlowFast. In RGBPoseConv3D: + +- The RGB stream corresponds to the `slow` stream in SlowFast; The Skeleton stream corresponds to the `fast` stream in SlowFast. +- The input resolution of RGB frames is `4x` larger than the pseudo heatmaps. +- Bilateral connections are used for early feature fusion between the two modalities. + +
+ +
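To make the bullet points above concrete, here is a minimal, self-contained sketch of the bilateral (early) fusion idea. It is an illustration only, not the actual `RGBPoseConv3D` code added by this patch; the tensor shapes and layer choices are assumptions picked to match the 4x resolution and frame-rate ratio described above.

```python
# Conceptual sketch of bilateral lateral connections between an RGB stream and
# a pose-heatmap stream (illustrative shapes, not the MMAction2 implementation).
import torch
import torch.nn as nn

# RGB pathway feature: few frames, high spatial resolution, more channels.
rgb_feat = torch.randn(1, 256, 8, 56, 56)    # (N, C, T, H, W)
# Pose pathway feature: many frames, low spatial resolution, fewer channels.
pose_feat = torch.randn(1, 64, 32, 14, 14)

# Pose -> RGB lateral: compress time (stride 4), expand channels, upsample space.
pose_to_rgb = nn.Sequential(
    nn.Conv3d(64, 256, kernel_size=(7, 1, 1), stride=(4, 1, 1), padding=(3, 0, 0)),
    nn.Upsample(size=(8, 56, 56)))
# RGB -> Pose lateral: the reverse direction.
rgb_to_pose = nn.Sequential(
    nn.Conv3d(256, 64, kernel_size=(7, 1, 1), padding=(3, 0, 0)),
    nn.Upsample(size=(32, 14, 14)))

fused_rgb = rgb_feat + pose_to_rgb(pose_feat)    # early fusion into the RGB stream
fused_pose = pose_feat + rgb_to_pose(rgb_feat)   # early fusion into the pose stream
print(fused_rgb.shape, fused_pose.shape)
```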
+
+## Citation
+
+```BibTeX
+@inproceedings{duan2022revisiting,
+  title={Revisiting skeleton-based action recognition},
+  author={Duan, Haodong and Zhao, Yue and Chen, Kai and Lin, Dahua and Dai, Bo},
+  booktitle={CVPR},
+  pages={2969--2978},
+  year={2022}
+}
+```
+
+## How to train RGBPoseConv3D (on NTURGB+D, for example)?
+
+#### Step 0. Data Preparation
+
+Besides the skeleton annotations, you also need RGB videos to train RGBPoseConv3D. Download them from the official website of [NTURGB+D](https://rose1.ntu.edu.sg/dataset/actionRecognition/) and put them in `$MMACTION2/data/nturgbd_raw`. After that, use the provided script to compress the raw videos (from `1920x1080` to `960x540`) and change the suffix to `.mp4`:
+
+```bash
+# This step is mandatory, unless you know how to modify the code & config to make it work for raw videos!
+python tools/data/skeleton/compress_nturgbd.py
+```
+
+After that, you will find the processed videos in `$MMACTION2/data/nturgbd_videos`, named like `S001C001P001R001A001.mp4`.
+
+#### Step 1. Pretraining
+
+You first need to train the RGB-only and Pose-only models on the target dataset; the pretrained checkpoints will be used to initialize the RGBPoseConv3D model.
+
+You can either train these two models from scratch with the provided config files:
+
+```bash
+# We train each model for 180 epochs. By default, we use 8 GPUs.
+# Train the RGB-only model
+bash tools/dist_train.sh configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py 8
+# Train the Pose-only model
+bash tools/dist_train.sh configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py 8
+```
+
+or directly download and use the provided pretrained models:
+
+| Dataset | Config | Checkpoint | Top-1 (1 clip testing) | Top-1 (10 clip testing) |
+| :-----------: | :------------------------------------------------------------------: | :------------------------------------------------------------------------: | :--------------------: | :---------------------: |
+| NTURGB+D XSub | [rgb_config](/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py) | [rgb_ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230228-576b9f86.pth) | 94.9 | 95.4 |
+| NTURGB+D XSub | [pose_config](/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py) | [pose_ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230228-fa40054e.pth) | 93.1 | 93.5 |
+
+#### Step 2. Generate the initializing weight for RGBPoseConv3D
+
+You can use the provided [IPython notebook](/configs/skeleton/posec3d/rgbpose_conv3d/merge_pretrain.ipynb) to merge the two pretrained models into a single `rgbpose_conv3d_init.pth`.
+
+You can do this on your own or directly download and use the provided [rgbpose_conv3d_init.pth](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d_init_20230228-09b7684b.pth).
+
+#### Step 3. Finetune RGBPoseConv3D
+
+You can use the provided config file to finetune RGBPoseConv3D jointly with the two modalities (RGB & Pose):
+
+```bash
+# We finetune RGBPoseConv3D for 20 epochs on NTURGB+D XSub (8 GPUs)
+bash tools/dist_train.sh configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py 8
+# After finetuning, you can test the model with the following command (8 GPUs)
+bash tools/dist_test.sh configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py $CKPT 8 --dump result.pkl
+```
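The fused numbers reported in the Results section below combine the action scores of the two modalities with a 1:1 ratio. As a reference, here is a minimal sketch of that score-level fusion; the arrays are placeholders, and how you collect the per-modality scores (for example, from dumped test results) is up to you.

```python
# Illustrative 1:1 score fusion of two modalities (placeholder data, not tied
# to any particular dump format).
import numpy as np

num_samples, num_classes = 4, 60                         # NTU RGB+D 60 has 60 classes
rgb_scores = np.random.rand(num_samples, num_classes)    # stand-in RGB softmax scores
pose_scores = np.random.rand(num_samples, num_classes)   # stand-in Pose softmax scores

fused_scores = (rgb_scores + pose_scores) / 2            # 1:1 ratio
predictions = fused_scores.argmax(axis=1)
```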
+
+**Notes**
+
+1. We use a linear scaling learning rate (`Initial LR` ∝ `Batch Size`). If you change the training batch size, remember to change the initial LR proportionally.
+
+2. Though optimized, multi-clip testing may consume a large amount of time. For faster inference, you may change `test_pipeline` to disable multi-clip testing; this may lead to a small drop in recognition performance. Below is the guide (a programmatic alternative is sketched right after these notes):
+
+   ```python
+   test_pipeline = [
+       dict(type='MMUniformSampleFrames', clip_len=dict(RGB=8, Pose=32), num_clips=10, test_mode=True),  # change `num_clips=10` to `num_clips=1`
+       dict(type='MMDecode'),
+       dict(type='MMCompact', hw_ratio=1., allow_imgpad=True),
+       dict(type='Resize', scale=(256, 256), keep_ratio=False),
+       dict(type='GeneratePoseTarget', sigma=0.7, use_score=True, with_kp=True, with_limb=False, scaling=0.25),
+       dict(type='FormatShape', input_format='NCTHW'),
+       dict(type='PackActionInputs', collect_keys=('imgs', 'heatmap_imgs'))
+   ]
+   ```
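If you prefer not to edit the config file by hand, the same change can be made programmatically with `mmengine`. This is only a sketch and assumes you run it from the MMAction2 repository root with this patch applied:

```python
# Switch the RGBPoseConv3D test pipeline to 1-clip testing without editing the
# config file (sketch; the path assumes the MMAction2 repo root).
from mmengine.config import Config

cfg = Config.fromfile('configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py')
for transform in cfg.test_pipeline:
    if transform['type'] == 'MMUniformSampleFrames':
        transform['num_clips'] = 1   # 10-clip -> 1-clip testing
# Keep the test dataloader consistent with the modified pipeline.
cfg.test_dataloader.dataset.pipeline = cfg.test_pipeline
```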
+
+## Results
+
+On action recognition with multiple modalities (RGB & Pose), RGBPoseConv3D can achieve better recognition performance than the late fusion baseline.
+
+| Dataset | Fusion | Config | Checkpoint | RGB Stream Top-1 (1-clip / 10-clip) | Pose Stream Top-1 (1-clip / 10-clip) | 2 Stream Top-1 (1:1) (1-clip / 10-clip) |
+| :-----------: | :-------------------: | :-------------------: | :------------------------: | :------------------------------------: | :-------------------------------------: | :----------------------------------------: |
+| NTURGB+D XSub | Late Fusion | [rgb_config](/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py) [pose_config](/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py) | [rgb_ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230228-576b9f86.pth)
[pose_ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230228-fa40054e.pth) | 94.9 / 95.4 | 93.1 / 93.5 | 96.0 / 96.2 | +| NTURGB+D XSub | Early Fusion + Late Fusion | [config](/configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d_20230301-ac7b0e77.pth) | 96.2 / 96.4 | 96.0 / 96.2 | 96.6 / 96.8 | + +**Notes** + +For both `Late Fusion` and `Early Fusion + Late Fusion`, we combine the action scores based on two modalities with 1:1 ratio to get the final prediction. diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/merge_pretrain.ipynb b/configs/skeleton/posec3d/rgbpose_conv3d/merge_pretrain.ipynb new file mode 100644 index 0000000000..194ca28c31 --- /dev/null +++ b/configs/skeleton/posec3d/rgbpose_conv3d/merge_pretrain.ipynb @@ -0,0 +1,267 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import copy as cp\n", + "from collections import OrderedDict\n", + "\n", + "import torch\n", + "from mmengine.runner.checkpoint import _load_checkpoint\n", + "\n", + "from mmaction.utils import register_all_modules\n", + "from mmaction.registry import MODELS" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [], + "source": [ + "backbone_cfg = dict(\n", + " type='RGBPoseConv3D',\n", + " speed_ratio=4,\n", + " channel_ratio=4,\n", + " rgb_pathway=dict(\n", + " num_stages=4,\n", + " lateral=True,\n", + " lateral_infl=1,\n", + " lateral_activate=[0, 0, 1, 1],\n", + " fusion_kernel=7,\n", + " base_channels=64,\n", + " conv1_kernel=(1, 7, 7),\n", + " inflate=(0, 0, 1, 1),\n", + " with_pool2=False),\n", + " pose_pathway=dict(\n", + " num_stages=3,\n", + " stage_blocks=(4, 6, 3),\n", + " lateral=True,\n", + " lateral_inv=True,\n", + " lateral_infl=16,\n", + " lateral_activate=(0, 1, 1),\n", + " fusion_kernel=7,\n", + " in_channels=17,\n", + " base_channels=32,\n", + " out_indices=(2, ),\n", + " conv1_kernel=(1, 7, 7),\n", + " conv1_stride_s=1,\n", + " conv1_stride_t=1,\n", + " pool1_stride_s=1,\n", + " pool1_stride_t=1,\n", + " inflate=(0, 1, 1),\n", + " spatial_strides=(2, 2, 2),\n", + " temporal_strides=(1, 1, 1),\n", + " dilations=(1, 1, 1),\n", + " with_pool2=False))\n", + "head_cfg = dict(\n", + " type='RGBPoseHead',\n", + " num_classes=60,\n", + " in_channels=[2048, 512],\n", + " average_clips='prob')\n", + "model_cfg = dict(\n", + " type='Recognizer3D',\n", + " backbone=backbone_cfg,\n", + " cls_head=head_cfg)\n", + "\n", + "register_all_modules()\n", + "model = MODELS.build(model_cfg)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 3, + "outputs": [], + "source": [ + "# set your paths of the pretrained weights here\n", + "rgb_filepath = 'https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230228-576b9f86.pth'\n", + "pose_filepath = 'https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230228-fa40054e.pth'" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 4, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230226-8bd9d8df.pth\n" + ] 
+ }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230226-8bd9d8df.pth\" to C:\\Users\\wxDai/.cache\\torch\\hub\\checkpoints\\rgb_only_20230226-8bd9d8df.pth\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230226-fa40054e.pth\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230226-fa40054e.pth\" to C:\\Users\\wxDai/.cache\\torch\\hub\\checkpoints\\pose_only_20230226-fa40054e.pth\n" + ] + } + ], + "source": [ + "rgb_ckpt = _load_checkpoint(rgb_filepath, map_location='cpu')['state_dict']\n", + "pose_ckpt = _load_checkpoint(pose_filepath, map_location='cpu')['state_dict']" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 5, + "outputs": [], + "source": [ + "rgb_ckpt = {k.replace('backbone', 'backbone.rgb_path').replace('fc_cls', 'fc_rgb'): v for k, v in rgb_ckpt.items()}\n", + "pose_ckpt = {k.replace('backbone', 'backbone.pose_path').replace('fc_cls', 'fc_pose'): v for k, v in pose_ckpt.items()}" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [], + "source": [ + "old_ckpt = {}\n", + "old_ckpt.update(rgb_ckpt)\n", + "old_ckpt.update(pose_ckpt)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 7, + "outputs": [], + "source": [ + "# The difference is in dim-1\n", + "def padding(weight, new_shape):\n", + " new_weight = weight.new_zeros(new_shape)\n", + " new_weight[:, :weight.shape[1]] = weight\n", + " return new_weight" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 8, + "outputs": [], + "source": [ + "ckpt = cp.deepcopy(old_ckpt)\n", + "name = 'backbone.rgb_path.layer3.0.conv1.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (256, 640, 3, 1, 1))\n", + "name = 'backbone.rgb_path.layer3.0.downsample.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (1024, 640, 1, 1, 1))\n", + "name = 'backbone.rgb_path.layer4.0.conv1.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (512, 1280, 3, 1, 1))\n", + "name = 'backbone.rgb_path.layer4.0.downsample.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (2048, 1280, 1, 1, 1))\n", + "name = 'backbone.pose_path.layer2.0.conv1.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (64, 160, 3, 1, 1))\n", + "name = 'backbone.pose_path.layer2.0.downsample.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (256, 160, 1, 1, 1))\n", + "name = 'backbone.pose_path.layer3.0.conv1.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (128, 320, 3, 1, 1))\n", + "name = 'backbone.pose_path.layer3.0.downsample.conv.weight'\n", + "ckpt[name] = padding(ckpt[name], (512, 320, 1, 1, 1))\n", + "ckpt = OrderedDict(ckpt)\n", + "torch.save({'state_dict': ckpt}, 'rgbpose_conv3d_init.pth')" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 9, + "outputs": [ + { + "data": { + "text/plain": 
"_IncompatibleKeys(missing_keys=['backbone.rgb_path.layer2_lateral.conv.weight', 'backbone.rgb_path.layer3_lateral.conv.weight', 'backbone.pose_path.layer1_lateral.conv.weight', 'backbone.pose_path.layer1_lateral.bn.weight', 'backbone.pose_path.layer1_lateral.bn.bias', 'backbone.pose_path.layer1_lateral.bn.running_mean', 'backbone.pose_path.layer1_lateral.bn.running_var', 'backbone.pose_path.layer2_lateral.conv.weight', 'backbone.pose_path.layer2_lateral.bn.weight', 'backbone.pose_path.layer2_lateral.bn.bias', 'backbone.pose_path.layer2_lateral.bn.running_mean', 'backbone.pose_path.layer2_lateral.bn.running_var'], unexpected_keys=[])" + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.load_state_dict(ckpt, strict=False)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py b/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py new file mode 100644 index 0000000000..ad413da6a6 --- /dev/null +++ b/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py @@ -0,0 +1,127 @@ +_base_ = '../../../_base_/default_runtime.py' + +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + in_channels=17, + base_channels=32, + num_stages=3, + out_indices=(2, ), + stage_blocks=(4, 6, 3), + conv1_stride_s=1, + pool1_stride_s=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 1), + dilations=(1, 1, 1)), + cls_head=dict( + type='I3DHead', + in_channels=512, + num_classes=60, + dropout_ratio=0.5, + average_clips='prob')) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +left_kp = [1, 3, 5, 7, 9, 11, 13, 15] +right_kp = [2, 4, 6, 8, 10, 12, 14, 16] +train_pipeline = [ + dict(type='UniformSampleFrames', clip_len=32), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(64, 64), keep_ratio=False), + dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), + dict(type='Resize', scale=(56, 56), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp), + dict(type='GeneratePoseTarget', with_kp=True, with_limb=False), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='UniformSampleFrames', clip_len=32, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(64, 64), keep_ratio=False), + dict(type='GeneratePoseTarget', with_kp=True, with_limb=False), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + type='UniformSampleFrames', clip_len=32, num_clips=10, test_mode=True), + dict(type='PoseDecode'), + dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(64, 64), keep_ratio=False), + dict( + type='GeneratePoseTarget', + with_kp=True, + with_limb=False, + left_kp=left_kp, + right_kp=right_kp), + dict(type='FormatShape', 
input_format='NCTHW_Heatmap'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_train', + pipeline=train_pipeline))) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_val', + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_val', + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=18, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=18, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py b/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py new file mode 100644 index 0000000000..331badaf8d --- /dev/null +++ b/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py @@ -0,0 +1,126 @@ +_base_ = '../../../_base_/default_runtime.py' + +model = dict( + type='Recognizer3D', + backbone=dict( + type='ResNet3dSlowOnly', + depth=50, + conv1_kernel=(1, 7, 7), + inflate=(0, 0, 1, 1)), + cls_head=dict( + type='I3DHead', + in_channels=2048, + num_classes=60, + dropout_ratio=0.5, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW')) + +dataset_type = 'PoseDataset' +data_root = 'data/nturgbd_videos/' +ann_file = 'data/skeleton/ntu60_2d.pkl' + +train_pipeline = [ + dict(type='MMUniformSampleFrames', clip_len=dict(RGB=8), num_clips=1), + dict(type='MMDecode'), + dict(type='MMCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(256, 256), keep_ratio=False), + dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict( + type='MMUniformSampleFrames', + clip_len=dict(RGB=8), + num_clips=1, + test_mode=True), + dict(type='MMDecode'), + dict(type='MMCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict( + type='MMUniformSampleFrames', + clip_len=dict(RGB=8), + num_clips=10, + test_mode=True), + dict(type='MMDecode'), + dict(type='MMCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + 
dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=12, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + data_prefix=dict(video=data_root), + split='xsub_train', + pipeline=train_pipeline))) +val_dataloader = dict( + batch_size=12, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + data_prefix=dict(video=data_root), + split='xsub_val', + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + data_prefix=dict(video=data_root), + split='xsub_val', + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=18, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=18, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.15, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (12 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=96) diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py b/configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py new file mode 100644 index 0000000000..d303699f90 --- /dev/null +++ b/configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py @@ -0,0 +1,190 @@ +_base_ = '../../../_base_/default_runtime.py' + +# model_cfg +backbone_cfg = dict( + type='RGBPoseConv3D', + speed_ratio=4, + channel_ratio=4, + rgb_pathway=dict( + num_stages=4, + lateral=True, + lateral_infl=1, + lateral_activate=[0, 0, 1, 1], + fusion_kernel=7, + base_channels=64, + conv1_kernel=(1, 7, 7), + inflate=(0, 0, 1, 1), + with_pool2=False), + pose_pathway=dict( + num_stages=3, + stage_blocks=(4, 6, 3), + lateral=True, + lateral_inv=True, + lateral_infl=16, + lateral_activate=(0, 1, 1), + fusion_kernel=7, + in_channels=17, + base_channels=32, + out_indices=(2, ), + conv1_kernel=(1, 7, 7), + conv1_stride_s=1, + conv1_stride_t=1, + pool1_stride_s=1, + pool1_stride_t=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 1), + dilations=(1, 1, 1), + with_pool2=False)) +head_cfg = dict( + type='RGBPoseHead', + num_classes=60, + in_channels=[2048, 512], + loss_components=['rgb', 'pose'], + loss_weights=[1., 1.], + average_clips='prob') +data_preprocessor = dict( + type='MultiModalDataPreprocessor', + preprocessors=dict( + imgs=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + heatmap_imgs=dict(type='ActionDataPreprocessor'))) +model = dict( + type='MMRecognizer3D', + backbone=backbone_cfg, + cls_head=head_cfg, + data_preprocessor=data_preprocessor) + +dataset_type = 'PoseDataset' +data_root = 'data/nturgbd_videos/' +ann_file = 'data/skeleton/ntu60_2d.pkl' +left_kp = [1, 
3, 5, 7, 9, 11, 13, 15] +right_kp = [2, 4, 6, 8, 10, 12, 14, 16] +train_pipeline = [ + dict( + type='MMUniformSampleFrames', + clip_len=dict(RGB=8, Pose=32), + num_clips=1), + dict(type='MMDecode'), + dict(type='MMCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(256, 256), keep_ratio=False), + dict(type='RandomResizedCrop', area_range=(0.56, 1.0)), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp), + dict( + type='GeneratePoseTarget', + sigma=0.7, + use_score=True, + with_kp=True, + with_limb=False, + scaling=0.25), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs', collect_keys=('imgs', 'heatmap_imgs')) +] +val_pipeline = [ + dict( + type='MMUniformSampleFrames', + clip_len=dict(RGB=8, Pose=32), + num_clips=1, + test_mode=True), + dict(type='MMDecode'), + dict(type='MMCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(256, 256), keep_ratio=False), + dict( + type='GeneratePoseTarget', + sigma=0.7, + use_score=True, + with_kp=True, + with_limb=False, + scaling=0.25), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs', collect_keys=('imgs', 'heatmap_imgs')) +] +test_pipeline = [ + dict( + type='MMUniformSampleFrames', + clip_len=dict(RGB=8, Pose=32), + num_clips=10, + test_mode=True), + dict(type='MMDecode'), + dict(type='MMCompact', hw_ratio=1., allow_imgpad=True), + dict(type='Resize', scale=(256, 256), keep_ratio=False), + dict( + type='GeneratePoseTarget', + sigma=0.7, + use_score=True, + with_kp=True, + with_limb=False, + scaling=0.25), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs', collect_keys=('imgs', 'heatmap_imgs')) +] + +train_dataloader = dict( + batch_size=6, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + data_prefix=dict(video=data_root), + split='xsub_train', + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_val', + data_prefix=dict(video=data_root), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_val', + data_prefix=dict(video=data_root), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.0075, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=40, norm_type=2)) + +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[12, 16], + gamma=0.1) +] + +load_from = 'https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d_init_20230228-09b7684b.pth' # noqa: E501 + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (6 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=48) diff --git a/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py b/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py index 123db1ee1f..e213e3319c 100644 --- a/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py +++ b/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py @@ -28,7 +28,7 @@ test_cfg=None) dataset_type = 'PoseDataset' -ann_file = 'data/posec3d/hmdb51.pkl' +ann_file = 'data/skeleton/hmdb51_2d.pkl' left_kp = [1, 3, 5, 7, 9, 11, 13, 15] right_kp = [2, 4, 6, 8, 10, 12, 14, 16] train_pipeline = [ @@ -45,7 +45,7 @@ use_score=True, with_kp=True, with_limb=False), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] val_pipeline = [ @@ -60,7 +60,7 @@ use_score=True, with_kp=True, with_limb=False), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] test_pipeline = [ @@ -79,7 +79,7 @@ double=True, left_kp=left_kp, right_kp=right_kp), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] diff --git a/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py b/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py index 547f57c052..c100754fa5 100644 --- a/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py +++ b/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py @@ -28,7 +28,7 @@ test_cfg=None) dataset_type = 'PoseDataset' -ann_file = 'data/posec3d/ucf101.pkl' +ann_file = 'data/skeleton/ucf101_2d.pkl' left_kp = [1, 3, 5, 7, 9, 11, 13, 15] right_kp = [2, 4, 6, 8, 10, 12, 14, 16] train_pipeline = [ @@ -45,7 +45,7 @@ use_score=True, with_kp=True, with_limb=False), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] val_pipeline = [ @@ -60,7 +60,7 @@ use_score=True, with_kp=True, with_limb=False), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] test_pipeline = [ @@ -79,7 +79,7 @@ double=True, left_kp=left_kp, right_kp=right_kp), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] diff --git a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py index c893f69df3..8517870d1c 100644 --- a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py +++ b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py @@ -23,13 +23,10 @@ num_classes=99, spatial_type='avg', dropout_ratio=0.5, - average_clips='prob'), - train_cfg=None, - test_cfg=None) + average_clips='prob')) dataset_type = 'PoseDataset' -ann_file_train = 'data/posec3d/gym_train.pkl' -ann_file_val = 'data/posec3d/gym_val.pkl' +ann_file = 'data/skeleton/gym_2d.pkl' left_kp = [1, 3, 5, 7, 9, 11, 13, 15] right_kp = [2, 4, 6, 8, 10, 12, 14, 16] train_pipeline = [ @@ -46,7 +43,7 
@@ use_score=True, with_kp=True, with_limb=False), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] val_pipeline = [ @@ -61,7 +58,7 @@ use_score=True, with_kp=True, with_limb=False), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] test_pipeline = [ @@ -80,7 +77,7 @@ double=True, left_kp=left_kp, right_kp=right_kp), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] @@ -90,7 +87,13 @@ persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( - type=dataset_type, ann_file=ann_file_train, pipeline=train_pipeline)) + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='train', + pipeline=train_pipeline))) val_dataloader = dict( batch_size=16, num_workers=8, @@ -98,7 +101,8 @@ sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, - ann_file=ann_file_val, + ann_file=ann_file, + split='val', pipeline=val_pipeline, test_mode=True)) test_dataloader = dict( @@ -108,7 +112,8 @@ sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, - ann_file=ann_file_val, + ann_file=ann_file, + split='val', pipeline=test_pipeline, test_mode=True)) @@ -116,7 +121,7 @@ test_evaluator = val_evaluator train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=240, val_begin=1, val_interval=10) + type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1) val_cfg = dict(type='ValLoop') test_cfg = dict(type='TestLoop') @@ -124,7 +129,7 @@ dict( type='CosineAnnealingLR', eta_min=0, - T_max=240, + T_max=24, by_epoch=True, convert_to_iter_based=True) ] @@ -132,5 +137,3 @@ optim_wrapper = dict( optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003), clip_grad=dict(max_norm=40, norm_type=2)) - -default_hooks = dict(checkpoint=dict(interval=10, max_keep_ckpts=3)) diff --git a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py index 34764a726e..0ab9263951 100644 --- a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py +++ b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py @@ -23,18 +23,17 @@ num_classes=99, spatial_type='avg', dropout_ratio=0.5, - average_clips='prob'), - train_cfg=None, - test_cfg=None) + average_clips='prob')) dataset_type = 'PoseDataset' -ann_file_train = 'data/posec3d/gym_train.pkl' -ann_file_val = 'data/posec3d/gym_val.pkl' +ann_file = 'data/skeleton/gym_2d.pkl' left_kp = [1, 3, 5, 7, 9, 11, 13, 15] right_kp = [2, 4, 6, 8, 10, 12, 14, 16] skeletons = [[0, 5], [0, 6], [5, 7], [7, 9], [6, 8], [8, 10], [5, 11], [11, 13], [13, 15], [6, 12], [12, 14], [14, 16], [0, 1], [0, 2], [1, 3], [2, 4], [11, 12]] +left_limb = [0, 2, 3, 6, 7, 8, 12, 14] +right_limb = [1, 4, 5, 9, 10, 11, 13, 15] train_pipeline = [ dict(type='UniformSampleFrames', clip_len=48), dict(type='PoseDecode'), @@ -50,7 +49,7 @@ with_kp=False, with_limb=True, skeletons=skeletons), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] val_pipeline = [ @@ -66,7 +65,7 @@ with_kp=False, with_limb=True, skeletons=skeletons), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), 
dict(type='PackActionInputs') ] test_pipeline = [ @@ -85,8 +84,10 @@ skeletons=skeletons, double=True, left_kp=left_kp, - right_kp=right_kp), - dict(type='FormatShape', input_format='NCTHW'), + right_kp=right_kp, + left_limb=left_limb, + right_limb=right_limb), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] @@ -96,7 +97,13 @@ persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( - type=dataset_type, ann_file=ann_file_train, pipeline=train_pipeline)) + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='train', + pipeline=train_pipeline))), val_dataloader = dict( batch_size=16, num_workers=8, @@ -104,7 +111,8 @@ sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, - ann_file=ann_file_val, + ann_file=ann_file, + split='val', pipeline=val_pipeline, test_mode=True)) test_dataloader = dict( @@ -114,7 +122,8 @@ sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, - ann_file=ann_file_val, + ann_file=ann_file, + split='val', pipeline=test_pipeline, test_mode=True)) @@ -122,7 +131,7 @@ test_evaluator = val_evaluator train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=240, val_begin=1, val_interval=10) + type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1) val_cfg = dict(type='ValLoop') test_cfg = dict(type='TestLoop') @@ -130,7 +139,7 @@ dict( type='CosineAnnealingLR', eta_min=0, - T_max=240, + T_max=24, by_epoch=True, convert_to_iter_based=True) ] @@ -138,5 +147,3 @@ optim_wrapper = dict( optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003), clip_grad=dict(max_norm=40, norm_type=2)) - -default_hooks = dict(checkpoint=dict(interval=10, max_keep_ckpts=3)) diff --git a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py index 2194139f5e..c4915d4d2e 100644 --- a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py +++ b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py @@ -21,15 +21,11 @@ type='I3DHead', in_channels=512, num_classes=60, - spatial_type='avg', dropout_ratio=0.5, - average_clips='prob'), - train_cfg=None, - test_cfg=None) + average_clips='prob')) dataset_type = 'PoseDataset' -ann_file_train = 'data/posec3d/ntu60_xsub_train.pkl' -ann_file_val = 'data/posec3d/ntu60_xsub_val.pkl' +ann_file = 'data/skeleton/ntu60_2d.pkl' left_kp = [1, 3, 5, 7, 9, 11, 13, 15] right_kp = [2, 4, 6, 8, 10, 12, 14, 16] train_pipeline = [ @@ -46,7 +42,7 @@ use_score=True, with_kp=True, with_limb=False), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] val_pipeline = [ @@ -61,7 +57,7 @@ use_score=True, with_kp=True, with_limb=False), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] test_pipeline = [ @@ -80,7 +76,7 @@ double=True, left_kp=left_kp, right_kp=right_kp), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] @@ -90,7 +86,13 @@ persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( - type=dataset_type, ann_file=ann_file_train, pipeline=train_pipeline)) + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + 
ann_file=ann_file, + split='xsub_train', + pipeline=train_pipeline))) val_dataloader = dict( batch_size=16, num_workers=8, @@ -98,7 +100,8 @@ sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, - ann_file=ann_file_val, + ann_file=ann_file, + split='xsub_val', pipeline=val_pipeline, test_mode=True)) test_dataloader = dict( @@ -108,15 +111,16 @@ sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, - ann_file=ann_file_val, + ann_file=ann_file, + split='xsub_val', pipeline=test_pipeline, test_mode=True)) -val_evaluator = dict(type='AccMetric') +val_evaluator = [dict(type='AccMetric')] test_evaluator = val_evaluator train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=240, val_begin=1, val_interval=10) + type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1) val_cfg = dict(type='ValLoop') test_cfg = dict(type='TestLoop') @@ -124,7 +128,7 @@ dict( type='CosineAnnealingLR', eta_min=0, - T_max=240, + T_max=24, by_epoch=True, convert_to_iter_based=True) ] @@ -132,5 +136,3 @@ optim_wrapper = dict( optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003), clip_grad=dict(max_norm=40, norm_type=2)) - -default_hooks = dict(checkpoint=dict(max_keep_ckpts=3)) diff --git a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.py b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.py index 7eca1463ee..0f4f11f3a0 100644 --- a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.py +++ b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.py @@ -21,20 +21,18 @@ type='I3DHead', in_channels=512, num_classes=60, - spatial_type='avg', dropout_ratio=0.5, - average_clips='prob'), - train_cfg=None, - test_cfg=None) + average_clips='prob')) dataset_type = 'PoseDataset' -ann_file_train = 'data/posec3d/ntu60_xsub_train.pkl' -ann_file_val = 'data/posec3d/ntu60_xsub_val.pkl' +ann_file = 'data/skeleton/ntu60_2d.pkl' left_kp = [1, 3, 5, 7, 9, 11, 13, 15] right_kp = [2, 4, 6, 8, 10, 12, 14, 16] skeletons = [[0, 5], [0, 6], [5, 7], [7, 9], [6, 8], [8, 10], [5, 11], [11, 13], [13, 15], [6, 12], [12, 14], [14, 16], [0, 1], [0, 2], [1, 3], [2, 4], [11, 12]] +left_limb = [0, 2, 3, 6, 7, 8, 12, 14] +right_limb = [1, 4, 5, 9, 10, 11, 13, 15] train_pipeline = [ dict(type='UniformSampleFrames', clip_len=48), dict(type='PoseDecode'), @@ -50,7 +48,7 @@ with_kp=False, with_limb=True, skeletons=skeletons), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] val_pipeline = [ @@ -66,7 +64,7 @@ with_kp=False, with_limb=True, skeletons=skeletons), - dict(type='FormatShape', input_format='NCTHW'), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] test_pipeline = [ @@ -84,9 +82,9 @@ with_limb=True, skeletons=skeletons, double=True, - left_kp=left_kp, - right_kp=right_kp), - dict(type='FormatShape', input_format='NCTHW'), + left_limb=left_limb, + right_limb=right_limb), + dict(type='FormatShape', input_format='NCTHW_Heatmap'), dict(type='PackActionInputs') ] @@ -96,7 +94,13 @@ persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( - type=dataset_type, ann_file=ann_file_train, pipeline=train_pipeline)) + type='RepeatDataset', + times=10, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + split='xsub_train', + pipeline=train_pipeline))) val_dataloader = dict( batch_size=16, num_workers=8, @@ -104,7 +108,8 @@ 
sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, - ann_file=ann_file_val, + ann_file=ann_file, + split='xsub_val', pipeline=val_pipeline, test_mode=True)) test_dataloader = dict( @@ -114,15 +119,16 @@ sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type=dataset_type, - ann_file=ann_file_val, + ann_file=ann_file, + split='xsub_val', pipeline=test_pipeline, test_mode=True)) -val_evaluator = dict(type='AccMetric') +val_evaluator = [dict(type='AccMetric')] test_evaluator = val_evaluator train_cfg = dict( - type='EpochBasedTrainLoop', max_epochs=240, val_begin=1, val_interval=10) + type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1) val_cfg = dict(type='ValLoop') test_cfg = dict(type='TestLoop') @@ -130,7 +136,7 @@ dict( type='CosineAnnealingLR', eta_min=0, - T_max=240, + T_max=24, by_epoch=True, convert_to_iter_based=True) ] @@ -138,5 +144,3 @@ optim_wrapper = dict( optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003), clip_grad=dict(max_norm=40, norm_type=2)) - -default_hooks = dict(checkpoint=dict(max_keep_ckpts=3)) diff --git a/mmaction/datasets/pose_dataset.py b/mmaction/datasets/pose_dataset.py index 52c2c0b668..a06a7f7c0d 100644 --- a/mmaction/datasets/pose_dataset.py +++ b/mmaction/datasets/pose_dataset.py @@ -1,10 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Callable, List, Optional, Union +import os.path as osp +from typing import Callable, Dict, List, Optional, Union -from mmengine.fileio import exists, load +import mmengine from mmaction.registry import DATASETS -from mmaction.utils import ConfigType from .base import BaseActionDataset @@ -21,38 +21,44 @@ class PoseDataset(BaseActionDataset): Args: ann_file (str): Path to the annotation file. - pipeline (list): A sequence of data transforms. - split (str, optional): The dataset split used. Only applicable to - ``UCF`` or ``HMDB``. Allowed choices are ``train1``, ``test1``, - ``train2``, ``test2``, ``train3``, ``test3``. Defaults to None. - start_index (int): Specify a start index for frames in consideration of - different filename format. Defaults to 0. + pipeline (list[dict | callable]): A sequence of data transforms. + split (str, optional): The dataset split used. For UCF101 and + HMDB51, allowed choices are 'train1', 'test1', 'train2', + 'test2', 'train3', 'test3'. For NTURGB+D, allowed choices + are 'xsub_train', 'xsub_val', 'xview_train', 'xview_val'. + For NTURGB+D 120, allowed choices are 'xsub_train', + 'xsub_val', 'xset_train', 'xset_val'. For FineGYM, + allowed choices are 'train', 'val'. Defaults to None. 
""" def __init__(self, ann_file: str, - pipeline: List[Union[ConfigType, Callable]], + pipeline: List[Union[Dict, Callable]], split: Optional[str] = None, - start_index: int = 0, **kwargs) -> None: - # split, applicable to ``ucf101`` or ``hmdb51`` self.split = split super().__init__( - ann_file, - pipeline=pipeline, - start_index=start_index, - modality='Pose', - **kwargs) + ann_file, pipeline=pipeline, modality='Pose', **kwargs) - def load_data_list(self) -> List[dict]: + def load_data_list(self) -> List[Dict]: """Load annotation file to get skeleton information.""" assert self.ann_file.endswith('.pkl') - exists(self.ann_file) - data_list = load(self.ann_file) + mmengine.exists(self.ann_file) + data_list = mmengine.load(self.ann_file) if self.split is not None: - split, data = data_list['split'], data_list['annotations'] - identifier = 'filename' if 'filename' in data[0] else 'frame_dir' - data_list = [x for x in data if x[identifier] in split[self.split]] + split, annos = data_list['split'], data_list['annotations'] + identifier = 'filename' if 'filename' in annos[0] else 'frame_dir' + split = set(split[self.split]) + data_list = [x for x in annos if x[identifier] in split] + # Sometimes we may need to load video from the file + if 'video' in self.data_prefix: + for item in data_list: + if 'filename' in item: + item['filename'] = osp.join(self.data_prefix['video'], + item['filename']) + if 'frame_dir' in item: + item['frame_dir'] = osp.join(self.data_prefix['video'], + item['frame_dir']) return data_list diff --git a/mmaction/datasets/transforms/__init__.py b/mmaction/datasets/transforms/__init__.py index 198bd8c781..2b83c415f5 100644 --- a/mmaction/datasets/transforms/__init__.py +++ b/mmaction/datasets/transforms/__init__.py @@ -11,7 +11,8 @@ PyAVInit, RawFrameDecode, SampleAVAFrames, SampleFrames, UniformSample, UntrimmedSampleFrames) from .pose_transforms import (GeneratePoseTarget, GenSkeFeat, JointToBone, - LoadKineticsPose, MergeSkeFeat, PadTo, + LoadKineticsPose, MergeSkeFeat, MMCompact, + MMDecode, MMUniformSampleFrames, PadTo, PoseCompact, PoseDecode, PreNormalize2D, PreNormalize3D, ToMotion, UniformSampleFrames) from .processing import (AudioAmplify, CenterCrop, ColorJitter, Flip, Fuse, @@ -36,5 +37,6 @@ 'RandomCrop', 'RandomRescale', 'RandomResizedCrop', 'RawFrameDecode', 'Resize', 'SampleAVAFrames', 'SampleFrames', 'TenCrop', 'ThreeCrop', 'ToMotion', 'TorchVisionWrapper', 'Transpose', 'UniformSample', - 'UniformSampleFrames', 'UntrimmedSampleFrames' + 'UniformSampleFrames', 'UntrimmedSampleFrames', 'MMUniformSampleFrames', + 'MMDecode', 'MMCompact' ] diff --git a/mmaction/datasets/transforms/formatting.py b/mmaction/datasets/transforms/formatting.py index 7477f51080..bdcc75ffb5 100644 --- a/mmaction/datasets/transforms/formatting.py +++ b/mmaction/datasets/transforms/formatting.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, Sequence +from typing import Dict, Optional, Sequence, Tuple import numpy as np import torch @@ -38,9 +38,11 @@ class PackActionInputs(BaseTransform): def __init__( self, + collect_keys: Optional[Tuple[str]] = None, meta_keys: Sequence[str] = ('img_shape', 'img_key', 'video_id', 'timestamp') ) -> None: + self.collect_keys = collect_keys self.meta_keys = meta_keys def transform(self, results: Dict) -> Dict: @@ -53,19 +55,27 @@ def transform(self, results: Dict) -> Dict: dict: The result dict. 
""" packed_results = dict() - if 'imgs' in results: - imgs = results['imgs'] - packed_results['inputs'] = to_tensor(imgs) - elif 'keypoint' in results: - keypoint = results['keypoint'] - packed_results['inputs'] = to_tensor(keypoint) - elif 'audios' in results: - audios = results['audios'] - packed_results['inputs'] = to_tensor(audios) + if self.collect_keys is not None: + packed_results['inputs'] = dict() + for key in self.collect_keys: + packed_results['inputs'][key] = to_tensor(results[key]) else: - raise ValueError( - 'Cannot get `imgs`, `keypoint` or `audios` in the input dict ' - 'of `PackActionInputs`.') + if 'imgs' in results: + imgs = results['imgs'] + packed_results['inputs'] = to_tensor(imgs) + elif 'heatmap_imgs' in results: + heatmap_imgs = results['heatmap_imgs'] + packed_results['inputs'] = to_tensor(heatmap_imgs) + elif 'keypoint' in results: + keypoint = results['keypoint'] + packed_results['inputs'] = to_tensor(keypoint) + elif 'audios' in results: + audios = results['audios'] + packed_results['inputs'] = to_tensor(audios) + else: + raise ValueError( + 'Cannot get `imgs`, `keypoint`, `heatmap_imgs` ' + 'or `audios` in the input dict of `PackActionInputs`.') data_sample = ActionDataSample() @@ -91,7 +101,8 @@ def transform(self, results: Dict) -> Dict: def __repr__(self) -> str: repr_str = self.__class__.__name__ - repr_str += f'(meta_keys={self.meta_keys})' + repr_str += f'(collect_keys={self.collect_keys}, ' + repr_str += f'meta_keys={self.meta_keys})' return repr_str @@ -178,16 +189,20 @@ class FormatShape(BaseTransform): """Format final imgs shape to the given input_format. Required keys: - - imgs + - imgs (optional) + - heatmap_imgs (optional) - num_clips - clip_len Modified Keys: - - img - - input_shape + - imgs (optional) + - input_shape (optional) + + Added Keys: + - heatmap_input_shape (optional) Args: - input_format (str): Define the final imgs format. + input_format (str): Define the final data format. collapse (bool): To collapse input_format N... to ... (NCTHW to CTHW, etc.) if N is 1. Should be set as True when training and testing detectors. Defaults to False. @@ -196,11 +211,13 @@ class FormatShape(BaseTransform): def __init__(self, input_format: str, collapse: bool = False) -> None: self.input_format = input_format self.collapse = collapse - if self.input_format not in ['NCTHW', 'NCHW', 'NCHW_Flow', 'NPTCHW']: + if self.input_format not in [ + 'NCTHW', 'NCHW', 'NCHW_Flow', 'NCTHW_Heatmap', 'NPTCHW' + ]: raise ValueError( f'The input format {self.input_format} is invalid.') - def transform(self, results: dict) -> dict: + def transform(self, results: Dict) -> Dict: """Performs the FormatShape formatting. 
Args: @@ -209,26 +226,69 @@ def transform(self, results: dict) -> dict: """ if not isinstance(results['imgs'], np.ndarray): results['imgs'] = np.array(results['imgs']) - imgs = results['imgs'] + # [M x H x W x C] # M = 1 * N_crops * N_clips * T if self.collapse: assert results['num_clips'] == 1 if self.input_format == 'NCTHW': + if 'imgs' in results: + imgs = results['imgs'] + num_clips = results['num_clips'] + clip_len = results['clip_len'] + if isinstance(clip_len, dict): + clip_len = clip_len['RGB'] + + imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:]) + # N_crops x N_clips x T x H x W x C + imgs = np.transpose(imgs, (0, 1, 5, 2, 3, 4)) + # N_crops x N_clips x C x T x H x W + imgs = imgs.reshape((-1, ) + imgs.shape[2:]) + # M' x C x T x H x W + # M' = N_crops x N_clips + results['imgs'] = imgs + results['input_shape'] = imgs.shape + + if 'heatmap_imgs' in results: + imgs = results['heatmap_imgs'] + num_clips = results['num_clips'] + clip_len = results['clip_len'] + # clip_len must be a dict + clip_len = clip_len['Pose'] + + imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:]) + # N_crops x N_clips x T x C x H x W + imgs = np.transpose(imgs, (0, 1, 3, 2, 4, 5)) + # N_crops x N_clips x C x T x H x W + imgs = imgs.reshape((-1, ) + imgs.shape[2:]) + # M' x C x T x H x W + # M' = N_crops x N_clips + results['heatmap_imgs'] = imgs + results['heatmap_input_shape'] = imgs.shape + + elif self.input_format == 'NCTHW_Heatmap': num_clips = results['num_clips'] clip_len = results['clip_len'] + imgs = results['imgs'] imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:]) - # N_crops x N_clips x T x H x W x C - imgs = np.transpose(imgs, (0, 1, 5, 2, 3, 4)) + # N_crops x N_clips x T x C x H x W + imgs = np.transpose(imgs, (0, 1, 3, 2, 4, 5)) # N_crops x N_clips x C x T x H x W imgs = imgs.reshape((-1, ) + imgs.shape[2:]) # M' x C x T x H x W # M' = N_crops x N_clips + results['imgs'] = imgs + results['input_shape'] = imgs.shape + elif self.input_format == 'NCHW': + imgs = results['imgs'] imgs = np.transpose(imgs, (0, 3, 1, 2)) # M x C x H x W + results['imgs'] = imgs + results['input_shape'] = imgs.shape + elif self.input_format == 'NCHW_Flow': num_imgs = len(results['imgs']) assert num_imgs % 2 == 0 @@ -252,26 +312,31 @@ def transform(self, results: dict) -> dict: # M' x C' x H x W # M' = N_crops x N_clips # C' = T x C + results['imgs'] = imgs + results['input_shape'] = imgs.shape + elif self.input_format == 'NPTCHW': num_proposals = results['num_proposals'] num_clips = results['num_clips'] clip_len = results['clip_len'] + imgs = results['imgs'] imgs = imgs.reshape((num_proposals, num_clips * clip_len) + imgs.shape[1:]) # P x M x H x W x C # M = N_clips x T imgs = np.transpose(imgs, (0, 1, 4, 2, 3)) # P x M x C x H x W + results['imgs'] = imgs + results['input_shape'] = imgs.shape if self.collapse: - assert imgs.shape[0] == 1 - imgs = imgs.squeeze(0) + assert results['imgs'].shape[0] == 1 + results['imgs'] = results['imgs'].squeeze(0) + results['input_shape'] = results['imgs'].shape - results['imgs'] = imgs - results['input_shape'] = imgs.shape return results - def __repr__(self): + def __repr__(self) -> str: repr_str = self.__class__.__name__ repr_str += f"(input_format='{self.input_format}')" return repr_str diff --git a/mmaction/datasets/transforms/loading.py b/mmaction/datasets/transforms/loading.py index 9e66cd7f3f..10309b2516 100644 --- a/mmaction/datasets/transforms/loading.py +++ b/mmaction/datasets/transforms/loading.py @@ -4,7 +4,7 @@ import os import 
os.path as osp import shutil -from typing import Optional, Union +from typing import Dict, List, Optional, Union import mmcv import numpy as np @@ -1077,29 +1077,35 @@ class DecordInit(BaseTransform): Decord: https://github.com/dmlc/decord - Required keys are "filename", - added or modified keys are "video_reader" and "total_frames". + Required Keys: + + - filename + + Added Keys: + + - video_reader + - total_frames + - fps Args: io_backend (str): io backend where frames are store. - Default: 'disk'. - num_threads (int): Number of thread to decode the video. Default: 1. + Defaults to ``'disk'``. + num_threads (int): Number of thread to decode the video. Defaults to 1. kwargs (dict): Args for file client. """ - def __init__(self, io_backend='disk', num_threads=1, **kwargs): + def __init__(self, + io_backend: str = 'disk', + num_threads: int = 1, + **kwargs) -> None: self.io_backend = io_backend self.num_threads = num_threads self.kwargs = kwargs self.file_client = None - def transform(self, results): - """Perform the Decord initialization. - - Args: - results (dict): The resulting dict to be modified and passed - to the next transform in pipeline. - """ + def _get_video_reader(self, filename: str) -> object: + if osp.splitext(filename)[0] == filename: + filename = filename + '.mp4' try: import decord except ImportError: @@ -1108,15 +1114,27 @@ def transform(self, results): if self.file_client is None: self.file_client = FileClient(self.io_backend, **self.kwargs) - - file_obj = io.BytesIO(self.file_client.get(results['filename'])) + file_obj = io.BytesIO(self.file_client.get(filename)) container = decord.VideoReader(file_obj, num_threads=self.num_threads) - results['avg_fps'] = container.get_avg_fps() - results['video_reader'] = container + return container + + def transform(self, results: Dict) -> Dict: + """Perform the Decord initialization. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + container = self._get_video_reader(results['filename']) results['total_frames'] = len(container) + + results['video_reader'] = container + results['avg_fps'] = container.get_avg_fps() return results - def __repr__(self): + def __repr__(self) -> str: repr_str = (f'{self.__class__.__name__}(' f'io_backend={self.io_backend}, ' f'num_threads={self.num_threads})') @@ -1129,35 +1147,32 @@ class DecordDecode(BaseTransform): Decord: https://github.com/dmlc/decord - Required keys are "video_reader", "filename" and "frame_inds", - added or modified keys are "imgs" and "original_shape". + Required Keys: + + - video_reader + - frame_inds + + Added Keys: + + - imgs + - original_shape + - img_shape Args: mode (str): Decoding mode. Options are 'accurate' and 'efficient'. If set to 'accurate', it will decode videos into accurate frames. If set to 'efficient', it will adopt fast seeking but only return key frames, which may be duplicated and inaccurate, and more - suitable for large scene-based video datasets. Default: 'accurate'. + suitable for large scene-based video datasets. + Defaults to ``'accurate'``. """ - def __init__(self, mode='accurate'): + def __init__(self, mode: str = 'accurate') -> None: self.mode = mode assert mode in ['accurate', 'efficient'] - def transform(self, results): - """Perform the Decord decoding. - - Args: - results (dict): The resulting dict to be modified and passed - to the next transform in pipeline. 
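Editor's note: a hedged usage sketch of the reader helper factored out above. It assumes decord is installed and that the placeholder path points at a readable mp4; the bytes-in-memory route mirrors what `DecordInit` does through its file client.

```python
import io

import decord

video_path = 'demo/demo.mp4'  # placeholder path
with open(video_path, 'rb') as f:
    file_obj = io.BytesIO(f.read())

container = decord.VideoReader(file_obj, num_threads=1)
results = dict(
    video_reader=container,
    total_frames=len(container),
    avg_fps=container.get_avg_fps(),
)
print(results['total_frames'], results['avg_fps'])
```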
- """ - container = results['video_reader'] - - if results['frame_inds'].ndim != 1: - results['frame_inds'] = np.squeeze(results['frame_inds']) - - frame_inds = results['frame_inds'] - + def _decord_load_frames(self, container: object, + frame_inds: np.ndarray) -> List[np.ndarray]: if self.mode == 'accurate': imgs = container.get_batch(frame_inds).asnumpy() imgs = list(imgs) @@ -1169,6 +1184,24 @@ def transform(self, results): container.seek(idx) frame = container.next() imgs.append(frame.asnumpy()) + return imgs + + def transform(self, results: Dict) -> Dict: + """Perform the Decord decoding. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + container = results['video_reader'] + + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + frame_inds = results['frame_inds'] + imgs = self._decord_load_frames(container, frame_inds) results['video_reader'] = None del container @@ -1179,7 +1212,7 @@ def transform(self, results): return results - def __repr__(self): + def __repr__(self) -> str: repr_str = f'{self.__class__.__name__}(mode={self.mode})' return repr_str diff --git a/mmaction/datasets/transforms/pose_transforms.py b/mmaction/datasets/transforms/pose_transforms.py index cff9f90112..76e09dacd8 100644 --- a/mmaction/datasets/transforms/pose_transforms.py +++ b/mmaction/datasets/transforms/pose_transforms.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy as cp import pickle -from typing import Dict, List, Tuple +from typing import Dict, List, Optional, Tuple, Union import numpy as np from mmcv.transforms import BaseTransform, KeyMapper @@ -11,7 +11,8 @@ from torch.nn.modules.utils import _pair from mmaction.registry import TRANSFORMS -from .processing import Flip, _combine_quadruple +from .loading import DecordDecode, DecordInit +from .processing import _combine_quadruple @TRANSFORMS.register_module() @@ -172,42 +173,65 @@ def __repr__(self): class GeneratePoseTarget(BaseTransform): """Generate pseudo heatmaps based on joint coordinates and confidence. - Required keys are "keypoint", "img_shape", "keypoint_score" (optional), - added or modified keys are "imgs". + Required Keys: + + - keypoint + - keypoint_score (optional) + - img_shape + + Added Keys: + + - imgs (optional) + - heatmap_imgs (optional) Args: - sigma (float): The sigma of the generated gaussian map. Default: 0.6. + sigma (float): The sigma of the generated gaussian map. + Defaults to 0.6. use_score (bool): Use the confidence score of keypoints as the maximum - of the gaussian maps. Default: True. - with_kp (bool): Generate pseudo heatmaps for keypoints. Default: True. + of the gaussian maps. Defaults to True. + with_kp (bool): Generate pseudo heatmaps for keypoints. + Defaults to True. with_limb (bool): Generate pseudo heatmaps for limbs. At least one of - 'with_kp' and 'with_limb' should be True. Default: False. + 'with_kp' and 'with_limb' should be True. Defaults to False. skeletons (tuple[tuple]): The definition of human skeletons. - Default: ((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7), (7, 9), - (0, 6), (6, 8), (8, 10), (5, 11), (11, 13), (13, 15), - (6, 12), (12, 14), (14, 16), (11, 12)), + Defaults to ``((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7), + (7, 9), (0, 6), (6, 8), (8, 10), (5, 11), (11, 13), + (13, 15), (6, 12), (12, 14), (14, 16), (11, 12))``, which is the definition of COCO-17p skeletons. double (bool): Output both original heatmaps and flipped heatmaps. - Default: False. 
+ Defaults to False. left_kp (tuple[int]): Indexes of left keypoints, which is used when - flipping heatmaps. Default: (1, 3, 5, 7, 9, 11, 13, 15), + flipping heatmaps. Defaults to (1, 3, 5, 7, 9, 11, 13, 15), which is left keypoints in COCO-17p. right_kp (tuple[int]): Indexes of right keypoints, which is used when - flipping heatmaps. Default: (2, 4, 6, 8, 10, 12, 14, 16), + flipping heatmaps. Defaults to (2, 4, 6, 8, 10, 12, 14, 16), which is right keypoints in COCO-17p. + left_limb (tuple[int]): Indexes of left limbs, which is used when + flipping heatmaps. Defaults to (0, 2, 4, 5, 6, 10, 11, 12), + which is left limbs of skeletons we defined for COCO-17p. + right_limb (tuple[int]): Indexes of right limbs, which is used when + flipping heatmaps. Defaults to (1, 3, 7, 8, 9, 13, 14, 15), + which is right limbs of skeletons we defined for COCO-17p. + scaling (float): The ratio to scale the heatmaps. Defaults to 1. """ def __init__(self, - sigma=0.6, - use_score=True, - with_kp=True, - with_limb=False, - skeletons=((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7), - (7, 9), (0, 6), (6, 8), (8, 10), (5, 11), (11, 13), - (13, 15), (6, 12), (12, 14), (14, 16), (11, 12)), - double=False, - left_kp=(1, 3, 5, 7, 9, 11, 13, 15), - right_kp=(2, 4, 6, 8, 10, 12, 14, 16)): + sigma: float = 0.6, + use_score: bool = True, + with_kp: bool = True, + with_limb: bool = False, + skeletons: Tuple[Tuple[int]] = ((0, 1), (0, 2), (1, 3), + (2, 4), (0, 5), (5, 7), + (7, 9), (0, 6), (6, 8), + (8, 10), (5, 11), (11, 13), + (13, 15), (6, 12), (12, 14), + (14, 16), (11, 12)), + double: bool = False, + left_kp: Tuple[int] = (1, 3, 5, 7, 9, 11, 13, 15), + right_kp: Tuple[int] = (2, 4, 6, 8, 10, 12, 14, 16), + left_limb: Tuple[int] = (0, 2, 4, 5, 6, 10, 11, 12), + right_limb: Tuple[int] = (1, 3, 7, 8, 9, 13, 14, 15), + scaling: float = 1.) -> None: self.sigma = sigma self.use_score = use_score @@ -224,29 +248,30 @@ def __init__(self, self.left_kp = left_kp self.right_kp = right_kp self.skeletons = skeletons + self.left_limb = left_limb + self.right_limb = right_limb + self.scaling = scaling - def generate_a_heatmap(self, img_h, img_w, centers, sigma, max_values): + def generate_a_heatmap(self, arr: np.ndarray, centers: np.ndarray, + max_values: np.ndarray) -> None: """Generate pseudo heatmap for one keypoint in one frame. Args: - img_h (int): The height of the heatmap. - img_w (int): The width of the heatmap. + arr (np.ndarray): The array to store the generated heatmaps. + Shape: img_h * img_w. centers (np.ndarray): The coordinates of corresponding keypoints - (of multiple persons). - sigma (float): The sigma of generated gaussian. - max_values (np.ndarray): The max values of each keypoint. - - Returns: - np.ndarray: The generated pseudo heatmap. + (of multiple persons). Shape: M * 2. + max_values (np.ndarray): The max values of each keypoint. Shape: M. 
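Editor's note: for readers wiring this into a pipeline, a hedged config sketch showing where the newly documented arguments (`left_limb`, `right_limb`, `scaling`) sit. The values simply echo the documented defaults and are not a tuned recipe.

```python
gen_pose_target = dict(
    type='GeneratePoseTarget',
    sigma=0.6,
    use_score=True,
    with_kp=True,
    with_limb=False,
    double=False,
    left_kp=(1, 3, 5, 7, 9, 11, 13, 15),
    right_kp=(2, 4, 6, 8, 10, 12, 14, 16),
    left_limb=(0, 2, 4, 5, 6, 10, 11, 12),
    right_limb=(1, 3, 7, 8, 9, 13, 14, 15),
    scaling=1.0)
```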
""" - heatmap = np.zeros([img_h, img_w], dtype=np.float32) + sigma = self.sigma + img_h, img_w = arr.shape for center, max_value in zip(centers, max_values): - mu_x, mu_y = center[0], center[1] if max_value < self.eps: continue + mu_x, mu_y = center[0], center[1] st_x = max(int(mu_x - 3 * sigma), 0) ed_x = min(int(mu_x + 3 * sigma) + 1, img_w) st_y = max(int(mu_y - 3 * sigma), 0) @@ -261,34 +286,29 @@ def generate_a_heatmap(self, img_h, img_w, centers, sigma, max_values): patch = np.exp(-((x - mu_x)**2 + (y - mu_y)**2) / 2 / sigma**2) patch = patch * max_value - heatmap[st_y:ed_y, - st_x:ed_x] = np.maximum(heatmap[st_y:ed_y, st_x:ed_x], - patch) - - return heatmap + arr[st_y:ed_y, st_x:ed_x] = \ + np.maximum(arr[st_y:ed_y, st_x:ed_x], patch) - def generate_a_limb_heatmap(self, img_h, img_w, starts, ends, sigma, - start_values, end_values): + def generate_a_limb_heatmap(self, arr: np.ndarray, starts: np.ndarray, + ends: np.ndarray, start_values: np.ndarray, + end_values: np.ndarray) -> None: """Generate pseudo heatmap for one limb in one frame. Args: - img_h (int): The height of the heatmap. - img_w (int): The width of the heatmap. + arr (np.ndarray): The array to store the generated heatmaps. + Shape: img_h * img_w. starts (np.ndarray): The coordinates of one keypoint in the - corresponding limbs (of multiple persons). + corresponding limbs. Shape: M * 2. ends (np.ndarray): The coordinates of the other keypoint in the - corresponding limbs (of multiple persons). - sigma (float): The sigma of generated gaussian. + corresponding limbs. Shape: M * 2. start_values (np.ndarray): The max values of one keypoint in the - corresponding limbs. - end_values (np.ndarray): The max values of the other keypoint in - the corresponding limbs. - - Returns: - np.ndarray: The generated pseudo heatmap. + corresponding limbs. Shape: M. + end_values (np.ndarray): The max values of the other keypoint + in the corresponding limbs. Shape: M. """ - heatmap = np.zeros([img_h, img_w], dtype=np.float32) + sigma = self.sigma + img_h, img_w = arr.shape for start, end, start_value, end_value in zip(starts, ends, start_values, @@ -325,9 +345,7 @@ def generate_a_limb_heatmap(self, img_h, img_w, starts, ends, sigma, d2_ab = ((start[0] - end[0])**2 + (start[1] - end[1])**2) if d2_ab < 1: - full_map = self.generate_a_heatmap(img_h, img_w, [start], - sigma, [start_value]) - heatmap = np.maximum(heatmap, full_map) + self.generate_a_heatmap(arr, start[None], start_value[None]) continue coeff = (d2_start - d2_end + d2_ab) / 2. / d2_ab @@ -348,61 +366,50 @@ def generate_a_limb_heatmap(self, img_h, img_w, starts, ends, sigma, patch = np.exp(-d2_seg / 2. / sigma**2) patch = patch * value_coeff - heatmap[min_y:max_y, min_x:max_x] = np.maximum( - heatmap[min_y:max_y, min_x:max_x], patch) - - return heatmap + arr[min_y:max_y, min_x:max_x] = \ + np.maximum(arr[min_y:max_y, min_x:max_x], patch) - def generate_heatmap(self, img_h, img_w, kps, sigma, max_values): + def generate_heatmap(self, arr: np.ndarray, kps: np.ndarray, + max_values: np.ndarray) -> None: """Generate pseudo heatmap for all keypoints and limbs in one frame (if needed). Args: - img_h (int): The height of the heatmap. - img_w (int): The width of the heatmap. + arr (np.ndarray): The array to store the generated heatmaps. + Shape: V * img_h * img_w. kps (np.ndarray): The coordinates of keypoints in this frame. - sigma (float): The sigma of generated gaussian. + Shape: M * V * 2. max_values (np.ndarray): The confidence score of each keypoint. 
- - Returns: - np.ndarray: The generated pseudo heatmap. + Shape: M * V. """ - heatmaps = [] if self.with_kp: num_kp = kps.shape[1] for i in range(num_kp): - heatmap = self.generate_a_heatmap(img_h, img_w, kps[:, i], - sigma, max_values[:, i]) - heatmaps.append(heatmap) + self.generate_a_heatmap(arr[i], kps[:, i], max_values[:, i]) if self.with_limb: - for limb in self.skeletons: + for i, limb in enumerate(self.skeletons): start_idx, end_idx = limb starts = kps[:, start_idx] ends = kps[:, end_idx] start_values = max_values[:, start_idx] end_values = max_values[:, end_idx] - heatmap = self.generate_a_limb_heatmap(img_h, img_w, starts, - ends, sigma, - start_values, - end_values) - heatmaps.append(heatmap) - - return np.stack(heatmaps, axis=-1) + self.generate_a_limb_heatmap(arr[i], starts, ends, + start_values, end_values) - def gen_an_aug(self, results): + def gen_an_aug(self, results: Dict) -> np.ndarray: """Generate pseudo heatmaps for all frames. Args: results (dict): The dictionary that contains all info of a sample. Returns: - list[np.ndarray]: The generated pseudo heatmaps. + np.ndarray: The generated pseudo heatmaps. """ - all_kps = results['keypoint'] + all_kps = results['keypoint'].astype(np.float32) kp_shape = all_kps.shape if 'keypoint_score' in results: @@ -411,43 +418,54 @@ def gen_an_aug(self, results): all_kpscores = np.ones(kp_shape[:-1], dtype=np.float32) img_h, img_w = results['img_shape'] + + # scale img_h, img_w and kps + img_h = int(img_h * self.scaling + 0.5) + img_w = int(img_w * self.scaling + 0.5) + all_kps[..., :2] *= self.scaling + num_frame = kp_shape[1] + num_c = 0 + if self.with_kp: + num_c += all_kps.shape[2] + if self.with_limb: + num_c += len(self.skeletons) + + ret = np.zeros([num_frame, num_c, img_h, img_w], dtype=np.float32) - imgs = [] for i in range(num_frame): - sigma = self.sigma + # M, V, C kps = all_kps[:, i] - kpscores = all_kpscores[:, i] - - max_values = np.ones(kpscores.shape, dtype=np.float32) - if self.use_score: - max_values = kpscores - - hmap = self.generate_heatmap(img_h, img_w, kps, sigma, max_values) - imgs.append(hmap) + # M, C + kpscores = all_kpscores[:, i] if self.use_score else \ + np.ones_like(all_kpscores[:, i]) - return imgs + self.generate_heatmap(ret[i], kps, kpscores) + return ret - def transform(self, results): + def transform(self, results: Dict) -> Dict: """Generate pseudo heatmaps based on joint coordinates and confidence. Args: results (dict): The resulting dict to be modified and passed to the next transform in pipeline. 
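Editor's note: `gen_an_aug` now pre-allocates a single `(T, C, H, W)` volume and lets the per-frame generators write into it, after scaling both the canvas and the keypoint coordinates. The sketch below only reproduces that bookkeeping; all sizes and the tiny skeleton subset are illustrative.

```python
import numpy as np

with_kp, with_limb = True, False
scaling = 0.5
skeletons = ((0, 1), (0, 2))          # illustrative subset of COCO-17p limbs
num_person, num_frame, num_kp = 2, 8, 17

all_kps = np.random.rand(num_person, num_frame, num_kp, 2).astype(np.float32) * 64
all_kpscores = np.ones((num_person, num_frame, num_kp), dtype=np.float32)
img_h, img_w = 64, 64

# Scale the canvas and the coordinates together so the Gaussians stay aligned.
img_h, img_w = int(img_h * scaling + 0.5), int(img_w * scaling + 0.5)
all_kps[..., :2] *= scaling

num_c = (num_kp if with_kp else 0) + (len(skeletons) if with_limb else 0)
ret = np.zeros((num_frame, num_c, img_h, img_w), dtype=np.float32)
print(ret.shape)  # (8, 17, 32, 32)
```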
""" - if not self.double: - results['imgs'] = np.stack(self.gen_an_aug(results)) - else: - results_ = cp.deepcopy(results) - flip = Flip( - flip_ratio=1, left_kp=self.left_kp, right_kp=self.right_kp) - results_ = flip(results_) - results['imgs'] = np.concatenate( - [self.gen_an_aug(results), - self.gen_an_aug(results_)]) + heatmap = self.gen_an_aug(results) + key = 'heatmap_imgs' if 'imgs' in results else 'imgs' + + if self.double: + indices = np.arange(heatmap.shape[1], dtype=np.int64) + left, right = (self.left_kp, self.right_kp) if self.with_kp else ( + self.left_limb, self.right_limb) + for l, r in zip(left, right): # noqa: E741 + indices[l] = r + indices[r] = l + heatmap_flip = heatmap[..., ::-1][:, indices] + heatmap = np.concatenate([heatmap, heatmap_flip]) + results[key] = heatmap return results - def __repr__(self): + def __repr__(self) -> str: repr_str = (f'{self.__class__.__name__}(' f'sigma={self.sigma}, ' f'use_score={self.use_score}, ' @@ -456,7 +474,10 @@ def __repr__(self): f'skeletons={self.skeletons}, ' f'double={self.double}, ' f'left_kp={self.left_kp}, ' - f'right_kp={self.right_kp})') + f'right_kp={self.right_kp}, ' + f'left_limb={self.left_limb}, ' + f'right_limb={self.right_limb}, ' + f'scaling={self.scaling})') return repr_str @@ -468,30 +489,38 @@ class PoseCompact(BaseTransform): example, if 'padding == 0.25', then the expanded box has unchanged center, and 1.25x width and height. - Required keys in results are "img_shape", "keypoint", add or modified keys - are "img_shape", "keypoint", "crop_quadruple". + Required Keys: + + - keypoint + - img_shape + + Modified Keys: + + - img_shape + - keypoint + + Added Keys: + + - crop_quadruple Args: - padding (float): The padding size. Default: 0.25. + padding (float): The padding size. Defaults to 0.25. threshold (int): The threshold for the tight bounding box. If the width or height of the tight bounding box is smaller than the threshold, - we do not perform the compact operation. Default: 10. + we do not perform the compact operation. Defaults to 10. hw_ratio (float | tuple[float] | None): The hw_ratio of the expanded box. Float indicates the specific ratio and tuple indicates a ratio range. If set as None, it means there is no requirement on - hw_ratio. Default: None. + hw_ratio. Defaults to None. allow_imgpad (bool): Whether to allow expanding the box outside the - image to meet the hw_ratio requirement. Default: True. - - Returns: - type: Description of returned object. + image to meet the hw_ratio requirement. Defaults to True. """ def __init__(self, - padding=0.25, - threshold=10, - hw_ratio=None, - allow_imgpad=True): + padding: float = 0.25, + threshold: int = 10, + hw_ratio: Optional[Union[float, Tuple[float]]] = None, + allow_imgpad: bool = True) -> None: self.padding = padding self.threshold = threshold @@ -503,7 +532,7 @@ def __init__(self, self.allow_imgpad = allow_imgpad assert self.padding >= 0 - def transform(self, results): + def transform(self, results: Dict) -> Dict: """Convert the coordinates of keypoints to make it more compact. 
Args: @@ -561,7 +590,7 @@ def transform(self, results): results['crop_quadruple'] = crop_quadruple return results - def __repr__(self): + def __repr__(self) -> str: repr_str = (f'{self.__class__.__name__}(padding={self.padding}, ' f'threshold={self.threshold}, ' f'hw_ratio={self.hw_ratio}, ' @@ -1167,7 +1196,7 @@ def transform(self, results: Dict) -> Dict: results['num_clips'] = self.num_clips return results - def __repr__(self): + def __repr__(self) -> str: repr_str = (f'{self.__class__.__name__}(' f'clip_len={self.clip_len}, ' f'num_clips={self.num_clips}, ' @@ -1253,6 +1282,17 @@ class PoseDecode(BaseTransform): - keypoint_score (optional) """ + @staticmethod + def _load_kp(kp: np.ndarray, frame_inds: np.ndarray) -> np.ndarray: + """Load keypoints according to sampled indexes.""" + return kp[:, frame_inds].astype(np.float32) + + @staticmethod + def _load_kpscore(kpscore: np.ndarray, + frame_inds: np.ndarray) -> np.ndarray: + """Load keypoint scores according to sampled indexes.""" + return kpscore[:, frame_inds].astype(np.float32) + def transform(self, results: Dict) -> Dict: """The transform function of :class:`PoseDecode`. @@ -1274,16 +1314,256 @@ def transform(self, results: Dict) -> Dict: offset = results.get('offset', 0) frame_inds = results['frame_inds'] + offset - results['keypoint'] = results['keypoint'][:, frame_inds].astype( - np.float32) - if 'keypoint_score' in results: - kpscore = results['keypoint_score'] - results['keypoint_score'] = kpscore[:, - frame_inds].astype(np.float32) + results['keypoint_score'] = self._load_kpscore( + results['keypoint_score'], frame_inds) + + results['keypoint'] = self._load_kp(results['keypoint'], frame_inds) return results def __repr__(self) -> str: repr_str = f'{self.__class__.__name__}()' return repr_str + + +@TRANSFORMS.register_module() +class MMUniformSampleFrames(UniformSampleFrames): + """Uniformly sample frames from the multi-modal data.""" + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`MMUniformSampleFrames`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + num_frames = results['total_frames'] + modalities = [] + for modality, clip_len in self.clip_len.items(): + if self.test_mode: + inds = self._get_test_clips(num_frames, clip_len) + else: + inds = self._get_train_clips(num_frames, clip_len) + inds = np.mod(inds, num_frames) + results[f'{modality}_inds'] = inds.astype(np.int) + modalities.append(modality) + results['clip_len'] = self.clip_len + results['frame_interval'] = None + results['num_clips'] = self.num_clips + if not isinstance(results['modality'], list): + # should override + results['modality'] = modalities + return results + + +@TRANSFORMS.register_module() +class MMDecode(DecordInit, DecordDecode, PoseDecode): + """Decode RGB videos and skeletons.""" + + def __init__(self, io_backend: str = 'disk', **kwargs) -> None: + DecordInit.__init__(self, io_backend=io_backend, **kwargs) + DecordDecode.__init__(self) + self.io_backend = io_backend + self.kwargs = kwargs + self.file_client = None + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`MMDecode`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. 
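Editor's note: `MMUniformSampleFrames` gives each modality its own clip length and its own `<modality>_inds`. The sketch below is a hedged stand-in; the even-spacing sampler is a simplification of `UniformSampleFrames._get_test_clips`, which is not shown in this hunk.

```python
import numpy as np


def uniform_sample(total_frames: int, clip_len: int) -> np.ndarray:
    """Pick `clip_len` roughly evenly spaced frame indices (simplified)."""
    inds = np.linspace(0, total_frames, clip_len, endpoint=False)
    return np.mod(inds.astype(np.int64), total_frames)


results = dict(total_frames=120)
clip_len = dict(RGB=8, Pose=32)

for modality, length in clip_len.items():
    results[f'{modality}_inds'] = uniform_sample(results['total_frames'], length)

results.update(clip_len=clip_len, num_clips=1, modality=list(clip_len))
print(results['RGB_inds'], results['Pose_inds'][:8])
```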
+ """ + for mod in results['modality']: + if results[f'{mod}_inds'].ndim != 1: + results[f'{mod}_inds'] = np.squeeze(results[f'{mod}_inds']) + frame_inds = results[f'{mod}_inds'] + if mod == 'RGB': + if 'filename' not in results: + results['filename'] = results['frame_dir'] + '.mp4' + video_reader = self._get_video_reader(results['filename']) + imgs = self._decord_load_frames(video_reader, frame_inds) + del video_reader + results['imgs'] = imgs + elif mod == 'Pose': + assert 'keypoint' in results + if 'keypoint_score' not in results: + keypoint_score = [ + np.ones(keypoint.shape[:-1], dtype=np.float32) + for keypoint in results['keypoint'] + ] + results['keypoint_score'] = np.stack(keypoint_score) + results['keypoint'] = self._load_kp(results['keypoint'], + frame_inds) + results['keypoint_score'] = self._load_kpscore( + results['keypoint_score'], frame_inds) + else: + raise NotImplementedError( + f'MMDecode: Modality {mod} not supported') + + # We need to scale human keypoints to the new image size + if 'imgs' in results and 'keypoint' in results: + real_img_shape = results['imgs'][0].shape[:2] + if real_img_shape != results['img_shape']: + oh, ow = results['img_shape'] + nh, nw = real_img_shape + + assert results['keypoint'].shape[-1] in [2, 3] + results['keypoint'][..., 0] *= (nw / ow) + results['keypoint'][..., 1] *= (nh / oh) + results['img_shape'] = real_img_shape + results['original_shape'] = real_img_shape + + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'io_backend={self.io_backend})') + return repr_str + + +@TRANSFORMS.register_module() +class MMCompact(BaseTransform): + """Convert the coordinates of keypoints and crop the images to make them + more compact. + + Required Keys: + + - imgs + - keypoint + - img_shape + + Modified Keys: + + - imgs + - keypoint + - img_shape + + Args: + padding (float): The padding size. Defaults to 0.25. + threshold (int): The threshold for the tight bounding box. If the width + or height of the tight bounding box is smaller than the threshold, + we do not perform the compact operation. Defaults to 10. + hw_ratio (float | tuple[float]): The hw_ratio of the expanded + box. Float indicates the specific ratio and tuple indicates a + ratio range. If set as None, it means there is no requirement on + hw_ratio. Defaults to 1. + allow_imgpad (bool): Whether to allow expanding the box outside the + image to meet the hw_ratio requirement. Defaults to True. 
+ """ + + def __init__(self, + padding: float = 0.25, + threshold: int = 10, + hw_ratio: Union[float, Tuple[float]] = 1, + allow_imgpad: bool = True) -> None: + + self.padding = padding + self.threshold = threshold + if hw_ratio is not None: + hw_ratio = _pair(hw_ratio) + self.hw_ratio = hw_ratio + self.allow_imgpad = allow_imgpad + assert self.padding >= 0 + + def _get_box(self, keypoint: np.ndarray, img_shape: Tuple[int]) -> Tuple: + """Calculate the bounding box surrounding all joints in the frames.""" + h, w = img_shape + + kp_x = keypoint[..., 0] + kp_y = keypoint[..., 1] + + min_x = np.min(kp_x[kp_x != 0], initial=np.Inf) + min_y = np.min(kp_y[kp_y != 0], initial=np.Inf) + max_x = np.max(kp_x[kp_x != 0], initial=-np.Inf) + max_y = np.max(kp_y[kp_y != 0], initial=-np.Inf) + + # The compact area is too small + if max_x - min_x < self.threshold or max_y - min_y < self.threshold: + return 0, 0, w, h + + center = ((max_x + min_x) / 2, (max_y + min_y) / 2) + half_width = (max_x - min_x) / 2 * (1 + self.padding) + half_height = (max_y - min_y) / 2 * (1 + self.padding) + + if self.hw_ratio is not None: + half_height = max(self.hw_ratio[0] * half_width, half_height) + half_width = max(1 / self.hw_ratio[1] * half_height, half_width) + + min_x, max_x = center[0] - half_width, center[0] + half_width + min_y, max_y = center[1] - half_height, center[1] + half_height + + # hot update + if not self.allow_imgpad: + min_x, min_y = int(max(0, min_x)), int(max(0, min_y)) + max_x, max_y = int(min(w, max_x)), int(min(h, max_y)) + else: + min_x, min_y = int(min_x), int(min_y) + max_x, max_y = int(max_x), int(max_y) + return min_x, min_y, max_x, max_y + + def _compact_images(self, imgs: List[np.ndarray], img_shape: Tuple[int], + box: Tuple[int]) -> List: + """Crop the images acoordding the bounding box.""" + h, w = img_shape + min_x, min_y, max_x, max_y = box + pad_l, pad_u, pad_r, pad_d = 0, 0, 0, 0 + if min_x < 0: + pad_l = -min_x + min_x, max_x = 0, max_x + pad_l + w += pad_l + if min_y < 0: + pad_u = -min_y + min_y, max_y = 0, max_y + pad_u + h += pad_u + if max_x > w: + pad_r = max_x - w + w = max_x + if max_y > h: + pad_d = max_y - h + h = max_y + + if pad_l > 0 or pad_r > 0 or pad_u > 0 or pad_d > 0: + imgs = [ + np.pad(img, ((pad_u, pad_d), (pad_l, pad_r), (0, 0))) + for img in imgs + ] + imgs = [img[min_y:max_y, min_x:max_x] for img in imgs] + return imgs + + def transform(self, results: Dict) -> Dict: + """The transform function of :class:`MMCompact`. + + Args: + results (dict): The result dict. + + Returns: + dict: The result dict. + """ + img_shape = results['img_shape'] + kp = results['keypoint'] + # Make NaN zero + kp[np.isnan(kp)] = 0. + min_x, min_y, max_x, max_y = self._get_box(kp, img_shape) + + kp_x, kp_y = kp[..., 0], kp[..., 1] + kp_x[kp_x != 0] -= min_x + kp_y[kp_y != 0] -= min_y + + new_shape = (max_y - min_y, max_x - min_x) + results['img_shape'] = new_shape + results['imgs'] = self._compact_images(results['imgs'], img_shape, + (min_x, min_y, max_x, max_y)) + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(padding={self.padding}, ' + f'threshold={self.threshold}, ' + f'hw_ratio={self.hw_ratio}, ' + f'allow_imgpad={self.allow_imgpad})') + return repr_str diff --git a/mmaction/evaluation/metrics/acc_metric.py b/mmaction/evaluation/metrics/acc_metric.py index 488e28aa14..ca6b4623f8 100644 --- a/mmaction/evaluation/metrics/acc_metric.py +++ b/mmaction/evaluation/metrics/acc_metric.py @@ -1,12 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import copy from collections import OrderedDict -from typing import Any, Optional, Sequence, Tuple, Union +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union import numpy as np from mmengine.evaluator import BaseMetric -from mmaction.evaluation import (mean_average_precision, mean_class_accuracy, +from mmaction.evaluation import (get_weighted_score, mean_average_precision, + mean_class_accuracy, mmit_mean_average_precision, top_k_accuracy) from mmaction.registry import METRICS @@ -22,7 +23,7 @@ def __init__( Tuple[str]]] = ('top_k_accuracy', 'mean_class_accuracy'), collect_device: str = 'cpu', - metric_options: Optional[dict] = dict( + metric_options: Optional[Dict] = dict( top_k_accuracy=dict(topk=(1, 5))), prefix: Optional[str] = None, num_classes: Optional[int] = None): @@ -56,38 +57,84 @@ def __init__( self.metric_options = metric_options self.num_classes = num_classes - def process(self, data_batch: Sequence[Tuple[Any, dict]], - data_samples: Sequence[dict]) -> None: + def process(self, data_batch: Sequence[Tuple[Any, Dict]], + data_samples: Sequence[Dict]) -> None: """Process one batch of data samples and data_samples. The processed results should be stored in ``self.results``, which will be used to compute the metrics when all batches have been processed. Args: - data_batch (Sequence[Tuple[Any, dict]]): A batch of data - from the dataloader. - data_samples (Sequence[dict]): A batch of outputs from - the model. + data_batch (Sequence[dict]): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. """ + data_samples = copy.deepcopy(data_samples) for data_sample in data_samples: result = dict() pred = data_sample['pred_scores'] label = data_sample['gt_labels'] - result['pred'] = pred['item'].cpu().numpy() + for item_name, score in pred.items(): + pred[item_name] = score.cpu().numpy() + result['pred'] = pred result['label'] = label['item'].item() self.results.append(result) - def compute_metrics(self, results: list) -> dict: + def compute_metrics(self, results: List) -> Dict: """Compute the metrics from processed results. Args: results (list): The processed results of each batch. + Returns: dict: The computed metrics. The keys are the names of the metrics, and the values are corresponding results. """ - preds = [x['pred'] for x in results] labels = [x['label'] for x in results] + if len(results[0]['pred']) == 1: + preds = [x['pred']['item'] for x in results] + return self.calculate(preds, labels) + + eval_results = dict() + for item_name in results[0]['pred'].keys(): + preds = [x['pred'][item_name] for x in results] + eval_result = self.calculate(preds, labels) + eval_results.update( + {f'{item_name}_{k}': v + for k, v in eval_result.items()}) + + # Ad-hoc for RGBPoseConv3D + if len(results[0]['pred']) == 2 and \ + 'rgb' in results[0]['pred'] and \ + 'pose' in results[0]['pred']: + + rgb = [x['pred']['rgb'] for x in results] + pose = [x['pred']['pose'] for x in results] + + preds = { + '1:1': get_weighted_score([rgb, pose], [1, 1]), + '2:1': get_weighted_score([rgb, pose], [2, 1]), + '1:2': get_weighted_score([rgb, pose], [1, 2]) + } + for k in preds: + eval_result = self.calculate(preds[k], labels) + eval_results.update({ + f'RGBPose_{k}_{key}': v + for key, v in eval_result.items() + }) + + return eval_results + + def calculate(self, preds: List[np.ndarray], labels: List[int]) -> Dict: + """Compute the metrics from processed results. + + Args: + preds (list[np.ndarray]): List of the prediction scores. 
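Editor's note: the RGB/Pose branch of `compute_metrics` fuses the two heads with `get_weighted_score` before accuracy is computed. The stand-in below reproduces that weighted late fusion so the 1:1 / 2:1 / 1:2 entries are easy to interpret; it is a sketch, not the mmaction implementation.

```python
from typing import List

import numpy as np


def weighted_score(score_lists: List[List[np.ndarray]],
                   coefficients: List[float]) -> List[np.ndarray]:
    """Fuse several lists of per-sample class scores with scalar weights."""
    assert len(score_lists) == len(coefficients)
    num_samples = len(score_lists[0])
    return [
        sum(c * scores[i] for scores, c in zip(score_lists, coefficients))
        for i in range(num_samples)
    ]


rgb = [np.array([0.7, 0.2, 0.1]), np.array([0.1, 0.8, 0.1])]
pose = [np.array([0.3, 0.4, 0.3]), np.array([0.2, 0.6, 0.2])]
for name, w in {'1:1': (1, 1), '2:1': (2, 1), '1:2': (1, 2)}.items():
    print(name, [int(s.argmax()) for s in weighted_score([rgb, pose], list(w))])
```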
+ labels (list[int]): List of the labels. + + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ eval_results = OrderedDict() metric_options = copy.deepcopy(self.metric_options) for metric in self.metrics: diff --git a/mmaction/models/backbones/__init__.py b/mmaction/models/backbones/__init__.py index 066ba18535..2f4eb4a7e3 100644 --- a/mmaction/models/backbones/__init__.py +++ b/mmaction/models/backbones/__init__.py @@ -15,6 +15,7 @@ from .resnet_omni import OmniResNet from .resnet_tin import ResNetTIN from .resnet_tsm import ResNetTSM +from .rgbposeconv3d import RGBPoseConv3D from .stgcn import STGCN from .swin import SwinTransformer3D from .tanet import TANet @@ -29,5 +30,6 @@ 'OmniResNet', 'ResNet', 'ResNet2Plus1d', 'ResNet3d', 'ResNet3dCSN', 'ResNet3dLayer', 'ResNet3dSlowFast', 'ResNet3dSlowOnly', 'ResNetAudio', 'ResNetTIN', 'ResNetTSM', 'STGCN', 'SwinTransformer3D', 'TANet', - 'TimeSformer', 'UniFormer', 'UniFormerV2', 'VisionTransformer', 'X3D' + 'TimeSformer', 'UniFormer', 'UniFormerV2', 'VisionTransformer', 'X3D', + 'RGBPoseConv3D' ] diff --git a/mmaction/models/backbones/resnet3d.py b/mmaction/models/backbones/resnet3d.py index 50435c3064..cbaa4e18ca 100644 --- a/mmaction/models/backbones/resnet3d.py +++ b/mmaction/models/backbones/resnet3d.py @@ -1,22 +1,23 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings from collections import OrderedDict -from typing import List, Optional, Sequence, Tuple, Union +from typing import Dict, List, Optional, Sequence, Tuple, Union +import torch +import torch.nn as nn import torch.utils.checkpoint as cp from mmcv.cnn import ConvModule, NonLocal3d, build_activation_layer from mmengine.logging import MMLogger +from mmengine.model import BaseModule, Sequential from mmengine.model.weight_init import constant_init, kaiming_init from mmengine.runner.checkpoint import _load_checkpoint, load_checkpoint from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm -from torch import Tensor, nn from torch.nn.modules.utils import _ntuple, _triple from mmaction.registry import MODELS -from mmaction.utils import ConfigType, OptConfigType -class BasicBlock3d(nn.Module): +class BasicBlock3d(BaseModule): """BasicBlock 3d block for ResNet3D. Args: @@ -28,22 +29,24 @@ class BasicBlock3d(nn.Module): Defaults to 1. dilation (int): Spacing between kernel elements. Defaults to 1. downsample (nn.Module or None): Downsample layer. Defaults to None. - style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, the + style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the stride-two layer is the 3x3 conv layer, otherwise the stride-two - layer is the first 1x1 conv layer. Defaults to ``pytorch``. + layer is the first 1x1 conv layer. Defaults to ``'pytorch'``. inflate (bool): Whether to inflate kernel. Defaults to True. non_local (bool): Determine whether to apply non-local module in this block. Defaults to False. - non_local_cfg (dict or ConfigDict): Config for non-local module. + non_local_cfg (dict): Config for non-local module. Defaults to ``dict()``. - conv_cfg (dict or ConfigDict): Config dict for convolution layer. + conv_cfg (dict): Config dict for convolution layer. Defaults to ``dict(type='Conv3d')``. - norm_cfg (dict or ConfigDict): Config for norm layers. + norm_cfg (dict): Config for norm layers. Required keys are ``type``. Defaults to ``dict(type='BN3d')``. - act_cfg (dict or ConfigDict): Config dict for activation layer. 
+ act_cfg (dict): Config dict for activation layer. Defaults to ``dict(type='ReLU')``. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Defaults to False. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. """ expansion = 1 @@ -57,13 +60,14 @@ def __init__(self, style: str = 'pytorch', inflate: bool = True, non_local: bool = False, - non_local_cfg: ConfigType = dict(), - conv_cfg: ConfigType = dict(type='Conv3d'), - norm_cfg: ConfigType = dict(type='BN3d'), - act_cfg: ConfigType = dict(type='ReLU'), + non_local_cfg: Dict = dict(), + conv_cfg: Dict = dict(type='Conv3d'), + norm_cfg: Dict = dict(type='BN3d'), + act_cfg: Dict = dict(type='ReLU'), with_cp: bool = False, + init_cfg: Optional[Union[Dict, List[Dict]]] = None, **kwargs) -> None: - super().__init__() + super().__init__(init_cfg=init_cfg) assert style in ['pytorch', 'caffe'] # make sure that only ``inflate_style`` is passed into kwargs assert set(kwargs).issubset(['inflate_style']) @@ -130,7 +134,7 @@ def __init__(self, self.non_local_block = NonLocal3d(self.conv2.norm.num_features, **self.non_local_cfg) - def forward(self, x: Tensor) -> Tensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: """Defines the computation performed at every call.""" def _inner_forward(x): @@ -158,7 +162,7 @@ def _inner_forward(x): return out -class Bottleneck3d(nn.Module): +class Bottleneck3d(BaseModule): """Bottleneck 3d block for ResNet3D. Args: @@ -170,25 +174,27 @@ class Bottleneck3d(nn.Module): Defaults to 1. dilation (int): Spacing between kernel elements. Defaults to 1. downsample (nn.Module, optional): Downsample layer. Defaults to None. - style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, the + style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the stride-two layer is the 3x3 conv layer, otherwise the stride-two - layer is the first 1x1 conv layer. Defaults to ``pytorch``. + layer is the first 1x1 conv layer. Defaults to ``'pytorch'``. inflate (bool): Whether to inflate kernel. Defaults to True. - inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the + inflate_style (str): '3x1x1' or '3x3x3'. which determines the kernel sizes and padding strides for conv1 and conv2 in each block. - Defaults to ``3x1x1``. + Defaults to ``'3x1x1'``. non_local (bool): Determine whether to apply non-local module in this block. Defaults to False. - non_local_cfg (dict or ConfigDict): Config for non-local module. + non_local_cfg (dict): Config for non-local module. Defaults to ``dict()``. - conv_cfg (dict or ConfigDict): Config dict for convolution layer. + conv_cfg (dict): Config dict for convolution layer. Defaults to ``dict(type='Conv3d')``. - norm_cfg (dict or ConfigDict): Config for norm layers. required + norm_cfg (dict): Config for norm layers. required keys are ``type``. Defaults to ``dict(type='BN3d')``. - act_cfg (dict or ConfigDict): Config dict for activation layer. + act_cfg (dict): Config dict for activation layer. Defaults to ``dict(type='ReLU')``. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Defaults to False. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
""" expansion = 4 @@ -203,12 +209,13 @@ def __init__(self, inflate: bool = True, inflate_style: str = '3x1x1', non_local: bool = False, - non_local_cfg: ConfigType = dict(), - conv_cfg: ConfigType = dict(type='Conv3d'), - norm_cfg: ConfigType = dict(type='BN3d'), - act_cfg: ConfigType = dict(type='ReLU'), - with_cp: bool = False) -> None: - super().__init__() + non_local_cfg: Dict = dict(), + conv_cfg: Dict = dict(type='Conv3d'), + norm_cfg: Dict = dict(type='BN3d'), + act_cfg: Dict = dict(type='ReLU'), + with_cp: bool = False, + init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None: + super().__init__(init_cfg=init_cfg) assert style in ['pytorch', 'caffe'] assert inflate_style in ['3x1x1', '3x3x3'] @@ -297,7 +304,7 @@ def __init__(self, self.non_local_block = NonLocal3d(self.conv3.norm.num_features, **self.non_local_cfg) - def forward(self, x: Tensor) -> Tensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: """Defines the computation performed at every call.""" def _inner_forward(x): @@ -327,23 +334,23 @@ def _inner_forward(x): @MODELS.register_module() -class ResNet3d(nn.Module): +class ResNet3d(BaseModule): """ResNet 3d backbone. Args: - depth (int): Depth of resnet, from - {``18``, ``34``, ``50``, ``101``, ``152``}. + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + Defaults to 50. pretrained (str, optional): Name of pretrained model. Defaults to None. stage_blocks (tuple, optional): Set number of stages for each res layer. Defaults to None. pretrained2d (bool): Whether to load pretrained 2D model. Defaults to True. in_channels (int): Channel num of input features. Defaults to 3. + num_stages (int): Resnet stages. Defaults to 4. base_channels (int): Channel num of stem output features. Defaults to 64. out_indices (Sequence[int]): Indices of output feature. - Defaults to ```(3, )``. - num_stages (int): Resnet stages. Defaults to 4. + Defaults to ``(3, )``. spatial_strides (Sequence[int]): Spatial strides of residual blocks of each stage. Defaults to ``(1, 2, 2, 2)``. @@ -363,9 +370,9 @@ class ResNet3d(nn.Module): pool1_stride_t (int): Temporal stride of the first pooling layer. Defaults to 1. with_pool2 (bool): Whether to use pool2. Defaults to True. - style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, the + style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the stride-two layer is the 3x3 conv layer, otherwise the stride-two - layer is the first 1x1 conv layer. Defaults to ``pytorch``. + layer is the first 1x1 conv layer. Defaults to ``'pytorch'``. frozen_stages (int): Stages to be frozen (all param fixed). -1 means not freezing any parameters. Defaults to -1. inflate (Sequence[int]): Inflate Dims of each block. @@ -373,12 +380,12 @@ class ResNet3d(nn.Module): inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the kernel sizes and padding strides for conv1 and conv2 in each block. Defaults to ``3x1x1``. - conv_cfg (dict or ConfigDict): Config for conv layers. + conv_cfg (dict): Config for conv layers. Required keys are ``type``. Defaults to ``dict(type='Conv3d')``. - norm_cfg (dict or ConfigDict): Config for norm layers. + norm_cfg (dict): Config for norm layers. Required keys are ``type`` and ``requires_grad``. Defaults to ``dict(type='BN3d', requires_grad=True)``. - act_cfg (dict or ConfigDict): Config dict for activation layer. + act_cfg (dict): Config dict for activation layer. Defaults to ``dict(type='ReLU', inplace=True)``. 
norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze running stats (``mean`` and ``var``). Defaults to False. @@ -387,11 +394,13 @@ class ResNet3d(nn.Module): non_local (Sequence[int]): Determine whether to apply non-local module in the corresponding block of each stages. Defaults to ``(0, 0, 0, 0)``. - non_local_cfg (dict or ConfigDict): Config for non-local module. + non_local_cfg (dict): Config for non-local module. Defaults to ``dict()``. zero_init_residual (bool): Whether to use zero initialization for residual block, Defaults to True. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. """ arch_settings = { @@ -403,7 +412,7 @@ class ResNet3d(nn.Module): } def __init__(self, - depth: int, + depth: int = 50, pretrained: Optional[str] = None, stage_blocks: Optional[Tuple] = None, pretrained2d: bool = True, @@ -425,16 +434,17 @@ def __init__(self, frozen_stages: int = -1, inflate: Sequence[int] = (1, 1, 1, 1), inflate_style: str = '3x1x1', - conv_cfg: ConfigType = dict(type='Conv3d'), - norm_cfg: ConfigType = dict(type='BN3d', requires_grad=True), - act_cfg: ConfigType = dict(type='ReLU', inplace=True), + conv_cfg: Dict = dict(type='Conv3d'), + norm_cfg: Dict = dict(type='BN3d', requires_grad=True), + act_cfg: Dict = dict(type='ReLU', inplace=True), norm_eval: bool = False, with_cp: bool = False, non_local: Sequence[int] = (0, 0, 0, 0), - non_local_cfg: ConfigType = dict(), + non_local_cfg: Dict = dict(), zero_init_residual: bool = True, + init_cfg: Optional[Union[Dict, List[Dict]]] = None, **kwargs) -> None: - super().__init__() + super().__init__(init_cfg=init_cfg) if depth not in self.arch_settings: raise KeyError(f'invalid depth {depth} for resnet') self.depth = depth @@ -486,6 +496,8 @@ def __init__(self, self._make_stem_layer() self.res_layers = [] + lateral_inplanes = getattr(self, 'lateral_inplanes', [0, 0, 0, 0]) + for i, num_blocks in enumerate(self.stage_blocks): spatial_stride = spatial_strides[i] temporal_stride = temporal_strides[i] @@ -493,7 +505,7 @@ def __init__(self, planes = self.base_channels * 2**i res_layer = self.make_res_layer( self.block, - self.inplanes, + self.inplanes + lateral_inplanes[i], planes, num_blocks, spatial_stride=spatial_stride, @@ -514,8 +526,8 @@ def __init__(self, self.add_module(layer_name, res_layer) self.res_layers.append(layer_name) - self.feat_dim = self.block.expansion * self.base_channels * 2**( - len(self.stage_blocks) - 1) + self.feat_dim = self.block.expansion * \ + self.base_channels * 2 ** (len(self.stage_blocks) - 1) @staticmethod def make_res_layer(block: nn.Module, @@ -529,11 +541,11 @@ def make_res_layer(block: nn.Module, inflate: Union[int, Sequence[int]] = 1, inflate_style: str = '3x1x1', non_local: Union[int, Sequence[int]] = 0, - non_local_cfg: ConfigType = dict(), - norm_cfg: OptConfigType = None, - act_cfg: OptConfigType = None, - conv_cfg: OptConfigType = None, - with_cp: Optional[bool] = False, + non_local_cfg: Dict = dict(), + norm_cfg: Optional[Dict] = None, + act_cfg: Optional[Dict] = None, + conv_cfg: Optional[Dict] = None, + with_cp: bool = False, **kwargs) -> nn.Module: """Build residual layer for ResNet3D. @@ -549,25 +561,25 @@ def make_res_layer(block: nn.Module, temporal_stride (int | Sequence[int]): Temporal strides in residual and conv layers. Defaults to 1. dilation (int): Spacing between kernel elements. Defaults to 1. - style (str): ``pytorch`` or ``caffe``. 
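Editor's note: the new `lateral_inplanes` hook widens each residual stage's input by the channels contributed by lateral fusion. The sketch below only traces that channel bookkeeping for a ResNet-50-style pathway; the lateral channel numbers are illustrative, not taken from the patch.

```python
block_expansion = 4
base_channels = 64
stage_blocks = (3, 4, 6, 3)
lateral_inplanes = [8, 32, 64, 128]   # all zeros when there are no lateral connections

inplanes = base_channels
for i, num_blocks in enumerate(stage_blocks):
    planes = base_channels * 2 ** i
    stage_in = inplanes + lateral_inplanes[i]   # fused channels widen the stage input
    print(f'layer{i + 1}: blocks={num_blocks}, in={stage_in}, '
          f'out={planes * block_expansion}')
    inplanes = planes * block_expansion
```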
If set to ``pytorch``, - the stride-two layer is the 3x3 conv layer, otherwise - the stride-two layer is the first 1x1 conv layer. - Default: ``pytorch``. + style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the + stride-two layer is the 3x3 conv layer,otherwise the + stride-two layer is the first 1x1 conv layer. + Defaults to ``'pytorch'``. inflate (int | Sequence[int]): Determine whether to inflate for each block. Defaults to 1. inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the kernel sizes and padding strides for conv1 and conv2 - in each block. Default: ``3x1x1``. + in each block. Default: ``'3x1x1'``. non_local (int | Sequence[int]): Determine whether to apply non-local module in the corresponding block of each stages. Defaults to 0. non_local_cfg (dict): Config for non-local module. Defaults to ``dict()``. - conv_cfg (dict or ConfigDict, optional): Config for conv layers. + conv_cfg (dict, optional): Config for conv layers. Defaults to None. - norm_cfg (dict or ConfigDict, optional): Config for norm layers. + norm_cfg (dict, optional): Config for norm layers. Defaults to None. - act_cfg (dict or ConfigDict, optional): Config for activate layers. + act_cfg (dict, optional): Config for activate layers. Defaults to None. with_cp (bool, optional): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. @@ -576,10 +588,10 @@ def make_res_layer(block: nn.Module, Returns: nn.Module: A residual layer for the given config. """ - inflate = inflate if not isinstance(inflate, - int) else (inflate, ) * blocks - non_local = non_local if not isinstance( - non_local, int) else (non_local, ) * blocks + inflate = inflate if not isinstance(inflate, int) \ + else (inflate,) * blocks + non_local = non_local if not isinstance(non_local, int) \ + else (non_local,) * blocks assert len(inflate) == blocks and len(non_local) == blocks downsample = None if spatial_stride != 1 or inplanes != planes * block.expansion: @@ -632,7 +644,7 @@ def make_res_layer(block: nn.Module, with_cp=with_cp, **kwargs)) - return nn.Sequential(*layers) + return Sequential(*layers) @staticmethod def _inflate_conv_params(conv3d: nn.Module, state_dict_2d: OrderedDict, @@ -645,7 +657,7 @@ def _inflate_conv_params(conv3d: nn.Module, state_dict_2d: OrderedDict, state_dict_2d (OrderedDict): The state dict of pretrained 2d model. module_name_2d (str): The name of corresponding conv module in the 2d model. - inflated_param_names (List[str]): List of parameters that have been + inflated_param_names (list[str]): List of parameters that have been inflated. """ weight_2d_name = module_name_2d + '.weight' @@ -674,7 +686,7 @@ def _inflate_bn_params(bn3d: nn.Module, state_dict_2d: OrderedDict, state_dict_2d (OrderedDict): The state dict of pretrained 2d model. module_name_2d (str): The name of corresponding bn module in the 2d model. - inflated_param_names (List[str]): List of parameters that have been + inflated_param_names (list[str]): List of parameters that have been inflated. """ for param_name, param in bn3d.named_parameters(): @@ -811,7 +823,7 @@ def _init_weights(self, pretrained: Optional[str] = None) -> None: Args: pretrained (str | None): The path of the pretrained weight. Will override the original `pretrained` if set. The arg is added to - be compatible with mmdet. Default: None. + be compatible with mmdet. Defaults to None. 
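Editor's note: `_inflate_conv_params` follows the usual I3D-style inflation, copying a pretrained 2D kernel along the temporal axis and rescaling by the temporal kernel size. The PyTorch sketch below illustrates that idea under those assumptions; it is a simplified stand-in, not the exact mmaction code.

```python
import torch
import torch.nn as nn

conv2d = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
conv3d = nn.Conv3d(3, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2),
                   padding=(2, 3, 3), bias=False)

# Repeat the 2D kernel over time and divide by the temporal size so the
# inflated filter produces activations of comparable magnitude.
kernel_t = conv3d.weight.shape[2]
inflated = conv2d.weight.data.unsqueeze(2).expand_as(conv3d.weight) / kernel_t
conv3d.weight.data.copy_(inflated)

x = torch.randn(1, 3, 8, 56, 56)
print(conv3d(x).shape)  # torch.Size([1, 64, 8, 28, 28])
```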
""" if pretrained: self.pretrained = pretrained @@ -822,7 +834,6 @@ def _init_weights(self, pretrained: Optional[str] = None) -> None: if self.pretrained2d: # Inflate 2D model into 3D model. self.inflate_weights(logger) - else: # Directly load 3D model. load_checkpoint( @@ -848,15 +859,16 @@ def init_weights(self, pretrained: Optional[str] = None) -> None: """Initialize weights.""" self._init_weights(self, pretrained) - def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor]]: + def forward(self, x: torch.Tensor) \ + -> Union[torch.Tensor, Tuple[torch.Tensor]]: """Defines the computation performed at every call. Args: - x (Tensor): The input data. + x (torch.Tensor): The input data. Returns: - Tensor or Tuple[Tensor]: The feature of the input - samples extracted by the backbone. + torch.Tensor or tuple[torch.Tensor]: The feature of the input + samples extracted by the backbone. """ x = self.conv1(x) if self.with_pool1: @@ -885,12 +897,11 @@ def train(self, mode: bool = True) -> None: @MODELS.register_module() -class ResNet3dLayer(nn.Module): +class ResNet3dLayer(BaseModule): """ResNet 3d Layer. Args: - depth (int): Depth of resnet, - from {``18``, ``34``, ``50``, ``101``, ``152``}. + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. pretrained (str, optional): Name of pretrained model. Defaults to None. pretrained2d (bool): Whether to load pretrained 2D model. Defaults to True. @@ -902,20 +913,20 @@ class ResNet3dLayer(nn.Module): temporal_stride (int): The 1st res block's temporal stride. Defaults to 1. dilation (int): The dilation. Defaults to 1. - style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, the + style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the stride-two layer is the 3x3 conv layer, otherwise the stride-two - layer is the first 1x1 conv layer. Defaults to ``pytorch``. + layer is the first 1x1 conv layer. Defaults to ``'pytorch'``. all_frozen (bool): Frozen all modules in the layer. Defaults to False. inflate (int): Inflate dims of each block. Defaults to 1. inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the kernel sizes and padding strides for conv1 and conv2 in each block. - Defaults to ``3x1x1``. - conv_cfg (dict or ConfigDict): Config for conv layers. + Defaults to ``'3x1x1'``. + conv_cfg (dict): Config for conv layers. Required keys are ``type``. Defaults to ``dict(type='Conv3d')``. - norm_cfg (dict or ConfigDict): Config for norm layers. + norm_cfg (dict): Config for norm layers. Required keys are ``type`` and ``requires_grad``. Defaults to ``dict(type='BN3d', requires_grad=True)``. - act_cfg (dict or ConfigDict): Config dict for activation layer. + act_cfg (dict): Config dict for activation layer. Defaults to ``dict(type='ReLU', inplace=True)``. norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze running stats (``mean`` and ``var``). Defaults to False. @@ -924,6 +935,8 @@ class ResNet3dLayer(nn.Module): zero_init_residual (bool): Whether to use zero initialization for residual block, Defaults to True. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
""" def __init__(self, @@ -939,14 +952,15 @@ def __init__(self, all_frozen: bool = False, inflate: int = 1, inflate_style: str = '3x1x1', - conv_cfg: ConfigType = dict(type='Conv3d'), - norm_cfg: ConfigType = dict(type='BN3d', requires_grad=True), - act_cfg: ConfigType = dict(type='ReLU', inplace=True), + conv_cfg: Dict = dict(type='Conv3d'), + norm_cfg: Dict = dict(type='BN3d', requires_grad=True), + act_cfg: Dict = dict(type='ReLU', inplace=True), norm_eval: bool = False, with_cp: bool = False, zero_init_residual: bool = True, + init_cfg: Optional[Union[Dict, List[Dict]]] = None, **kwargs) -> None: - super().__init__() + super().__init__(init_cfg=init_cfg) self.arch_settings = ResNet3d.arch_settings assert depth in self.arch_settings @@ -1022,15 +1036,15 @@ def init_weights(self, pretrained: Optional[str] = None) -> None: """Initialize weights.""" self._init_weights(self, pretrained) - def forward(self, x: Tensor) -> Tensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: """Defines the computation performed at every call. Args: - x (Tensor): The input data. + x (torch.Tensor): The input data. Returns: - Tensor: The feature of the input - samples extracted by the resisual layer. + torch.Tensor: The feature of the input + samples extracted by the residual layer. """ res_layer = getattr(self, self.layer_name) out = res_layer(x) diff --git a/mmaction/models/backbones/resnet3d_slowfast.py b/mmaction/models/backbones/resnet3d_slowfast.py index 4417882c4b..c4ca8b8032 100644 --- a/mmaction/models/backbones/resnet3d_slowfast.py +++ b/mmaction/models/backbones/resnet3d_slowfast.py @@ -1,27 +1,88 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings from collections import OrderedDict -from typing import List, Optional, Sequence, Union +from typing import Dict, List, Optional, Tuple, Union import torch import torch.nn as nn from mmcv.cnn import ConvModule from mmengine.logging import MMLogger, print_log +from mmengine.model import BaseModule from mmengine.model.weight_init import kaiming_init from mmengine.runner.checkpoint import _load_checkpoint, load_checkpoint -from torch import Tensor from mmaction.registry import MODELS -from mmaction.utils import ConfigType, OptConfigType from .resnet3d import ResNet3d +class DeConvModule(BaseModule): + """A deconv module that bundles deconv/norm/activation layers. + + Args: + in_channels (int): Number of channels in the input feature map. + out_channels (int): Number of channels produced by the convolution. + kernel_size (int | tuple[int]): Size of the convolving kernel. + stride (int | tuple[int]): Stride of the convolution. + padding (int | tuple[int]): Zero-padding added to both sides of + the input. + bias (bool): Whether to add a learnable bias to the output. + Defaults to False. + with_bn (bool): Whether to add a BN layer. Defaults to True. + with_relu (bool): Whether to add a ReLU layer. Defaults to True. 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: Union[int, Tuple[int]] = (1, 1, 1), + padding: Union[int, Tuple[int]] = 0, + bias: bool = False, + with_bn: bool = True, + with_relu: bool = True) -> None: + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.bias = bias + self.with_bn = with_bn + self.with_relu = with_relu + + self.conv = nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + bias=bias) + self.bn = nn.BatchNorm3d(out_channels) + self.relu = nn.ReLU() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Defines the computation performed at every call.""" + # x should be a 5-d tensor + assert len(x.shape) == 5 + N, C, T, H, W = x.shape + out_shape = (N, self.out_channels, self.stride[0] * T, + self.stride[1] * H, self.stride[2] * W) + x = self.conv(x, output_size=out_shape) + if self.with_bn: + x = self.bn(x) + if self.with_relu: + x = self.relu(x) + return x + + class ResNet3dPathway(ResNet3d): """A pathway of Slowfast based on ResNet3d. Args: lateral (bool): Determines whether to enable the lateral connection from another pathway. Defaults to False. + lateral_inv (bool): Whether to use deconv to upscale the time + dimension of features from another pathway. Defaults to False. lateral_norm (bool): Determines whether to enable the lateral norm in lateral layers. Defaults to False. speed_ratio (int): Speed ratio indicating the ratio between time @@ -32,181 +93,112 @@ class ResNet3dPathway(ResNet3d): Defaults to 8. fusion_kernel (int): The kernel size of lateral fusion. Defaults to 5. + lateral_infl (int): The ratio of the inflated channels. + Defaults to 2. + lateral_activate (list[int]): Flags for activating the lateral + connection. Defaults to ``[1, 1, 1, 1]``. 
""" def __init__(self, - *args, lateral: bool = False, + lateral_inv: bool = False, lateral_norm: bool = False, speed_ratio: int = 8, channel_ratio: int = 8, fusion_kernel: int = 5, + lateral_infl: int = 2, + lateral_activate: List[int] = [1, 1, 1, 1], **kwargs) -> None: self.lateral = lateral + self.lateral_inv = lateral_inv self.lateral_norm = lateral_norm self.speed_ratio = speed_ratio self.channel_ratio = channel_ratio self.fusion_kernel = fusion_kernel - super().__init__(*args, **kwargs) + self.lateral_infl = lateral_infl + self.lateral_activate = lateral_activate + self._calculate_lateral_inplanes(kwargs) + + super().__init__(**kwargs) self.inplanes = self.base_channels - if self.lateral: - self.conv1_lateral = ConvModule( - self.inplanes // self.channel_ratio, - # https://arxiv.org/abs/1812.03982, the - # third type of lateral connection has out_channel: - # 2 * \beta * C - self.inplanes * 2 // self.channel_ratio, - kernel_size=(fusion_kernel, 1, 1), - stride=(self.speed_ratio, 1, 1), - padding=((fusion_kernel - 1) // 2, 0, 0), - bias=False, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg if self.lateral_norm else None, - act_cfg=self.act_cfg if self.lateral_norm else None) + if self.lateral and self.lateral_activate[0] == 1: + if self.lateral_inv: + self.conv1_lateral = DeConvModule( + self.inplanes * self.channel_ratio, + self.inplanes * self.channel_ratio // lateral_infl, + kernel_size=(fusion_kernel, 1, 1), + stride=(self.speed_ratio, 1, 1), + padding=((fusion_kernel - 1) // 2, 0, 0), + with_bn=True, + with_relu=True) + else: + self.conv1_lateral = ConvModule( + self.inplanes // self.channel_ratio, + self.inplanes * lateral_infl // self.channel_ratio, + kernel_size=(fusion_kernel, 1, 1), + stride=(self.speed_ratio, 1, 1), + padding=((fusion_kernel - 1) // 2, 0, 0), + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg if self.lateral_norm else None, + act_cfg=self.act_cfg if self.lateral_norm else None) self.lateral_connections = [] for i in range(len(self.stage_blocks)): planes = self.base_channels * 2**i self.inplanes = planes * self.block.expansion - if lateral and i != self.num_stages - 1: + if lateral and i != self.num_stages - 1 \ + and self.lateral_activate[i + 1]: # no lateral connection needed in final stage lateral_name = f'layer{(i + 1)}_lateral' - setattr( - self, lateral_name, - ConvModule( + if self.lateral_inv: + conv_module = DeConvModule( + self.inplanes * self.channel_ratio, + self.inplanes * self.channel_ratio // lateral_infl, + kernel_size=(fusion_kernel, 1, 1), + stride=(self.speed_ratio, 1, 1), + padding=((fusion_kernel - 1) // 2, 0, 0), + bias=False, + with_bn=True, + with_relu=True) + else: + conv_module = ConvModule( self.inplanes // self.channel_ratio, - self.inplanes * 2 // self.channel_ratio, + self.inplanes * lateral_infl // self.channel_ratio, kernel_size=(fusion_kernel, 1, 1), stride=(self.speed_ratio, 1, 1), padding=((fusion_kernel - 1) // 2, 0, 0), bias=False, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg if self.lateral_norm else None, - act_cfg=self.act_cfg if self.lateral_norm else None)) + act_cfg=self.act_cfg if self.lateral_norm else None) + setattr(self, lateral_name, conv_module) self.lateral_connections.append(lateral_name) - def make_res_layer(self, - block: nn.Module, - inplanes: int, - planes: int, - blocks: int, - spatial_stride: Union[int, Sequence[int]] = 1, - temporal_stride: Union[int, Sequence[int]] = 1, - dilation: int = 1, - style: str = 'pytorch', - inflate: Union[int, Sequence[int]] = 1, - inflate_style: str = 
'3x1x1', - non_local: Union[int, Sequence[int]] = 0, - non_local_cfg: ConfigType = dict(), - norm_cfg: OptConfigType = None, - act_cfg: OptConfigType = None, - conv_cfg: OptConfigType = None, - with_cp: Optional[bool] = False, - **kwargs) -> nn.Module: - """Build residual layer for SlowFast. - - Args: - block (nn.Module): Residual module to be built. - inplanes (int): Number of channels for the input feature - in each block. - planes (int): Number of channels for the output feature - in each block. - blocks (int): Number of residual blocks. - spatial_stride (int | Sequence[int]): Spatial strides in - residual and conv layers. Defaults to 1. - temporal_stride (int | Sequence[int]): Temporal strides in - residual and conv layers. Defaults to 1. - dilation (int): Spacing between kernel elements. Defaults to 1. - style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, - the stride-two layer is the 3x3 conv layer, otherwise - the stride-two layer is the first 1x1 conv layer. - Default: ``pytorch``. - inflate (int | Sequence[int]): Determine whether to inflate - for each block. Defaults to 1. - inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines - the kernel sizes and padding strides for conv1 and conv2 - in each block. Default: ``3x1x1``. - non_local (int | Sequence[int]): Determine whether to apply - non-local module in the corresponding block of each stages. - Defaults to 0. - non_local_cfg (dict): Config for non-local module. - Defaults to ``dict()``. - conv_cfg (dict or ConfigDict, optional): Config for conv layers. - Defaults to None. - norm_cfg (dict or ConfigDict, optional): Config for norm layers. - Defaults to None. - act_cfg (dict or ConfigDict, optional): Config for activate layers. - Defaults to None. - with_cp (bool, optional): Use checkpoint or not. Using checkpoint - will save some memory while slowing down the training speed. - Defaults to False. - - Returns: - nn.Module: A residual layer for the given config. 
- """ - inflate = inflate if not isinstance(inflate, - int) else (inflate, ) * blocks - non_local = non_local if not isinstance( - non_local, int) else (non_local, ) * blocks - assert len(inflate) == blocks and len(non_local) == blocks - if self.lateral: - lateral_inplanes = inplanes * 2 // self.channel_ratio - else: - lateral_inplanes = 0 - if (spatial_stride != 1 - or (inplanes + lateral_inplanes) != planes * block.expansion): - downsample = ConvModule( - inplanes + lateral_inplanes, - planes * block.expansion, - kernel_size=1, - stride=(temporal_stride, spatial_stride, spatial_stride), - bias=False, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=None) - else: - downsample = None - - layers = [] - layers.append( - block( - inplanes + lateral_inplanes, - planes, - spatial_stride, - temporal_stride, - dilation, - downsample, - style=style, - inflate=(inflate[0] == 1), - inflate_style=inflate_style, - non_local=(non_local[0] == 1), - non_local_cfg=non_local_cfg, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - with_cp=with_cp)) - inplanes = planes * block.expansion - - for i in range(1, blocks): - layers.append( - block( - inplanes, - planes, - 1, - 1, - dilation, - style=style, - inflate=(inflate[i] == 1), - inflate_style=inflate_style, - non_local=(non_local[i] == 1), - non_local_cfg=non_local_cfg, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - with_cp=with_cp)) - - return nn.Sequential(*layers) + def _calculate_lateral_inplanes(self, kwargs): + """Calculate inplanes for lateral connection.""" + depth = kwargs.get('depth', 50) + expansion = 1 if depth < 50 else 4 + base_channels = kwargs.get('base_channels', 64) + lateral_inplanes = [] + for i in range(kwargs.get('num_stages', 4)): + if expansion % 2 == 0: + planes = base_channels * (2 ** i) * \ + ((expansion // 2) ** (i > 0)) + else: + planes = base_channels * (2**i) // (2**(i > 0)) + if self.lateral and self.lateral_activate[i]: + if self.lateral_inv: + lateral_inplane = planes * \ + self.channel_ratio // self.lateral_infl + else: + lateral_inplane = planes * \ + self.lateral_infl // self.channel_ratio + else: + lateral_inplane = 0 + lateral_inplanes.append(lateral_inplane) + self.lateral_inplanes = lateral_inplanes def inflate_weights(self, logger: MMLogger) -> None: """Inflate the resnet2d parameters to resnet3d pathway. @@ -280,7 +272,7 @@ def _inflate_conv_params(self, conv3d: nn.Module, state_dict_2d (OrderedDict): The state dict of pretrained 2d model. module_name_2d (str): The name of corresponding conv module in the 2d model. - inflated_param_names (List[str]): List of parameters that have been + inflated_param_names (list[str]): List of parameters that have been inflated. """ weight_2d_name = module_name_2d + '.weight' @@ -358,11 +350,11 @@ def init_weights(self, pretrained: Optional[str] = None) -> None: } -def build_pathway(cfg: ConfigType, *args, **kwargs) -> nn.Module: +def build_pathway(cfg: Dict, *args, **kwargs) -> nn.Module: """Build pathway. Args: - cfg (dict or ConfigDict): cfg should contain: + cfg (dict): cfg should contain: - type (str): identify backbone type. Returns: @@ -383,7 +375,7 @@ def build_pathway(cfg: ConfigType, *args, **kwargs) -> nn.Module: @MODELS.register_module() -class ResNet3dSlowFast(nn.Module): +class ResNet3dSlowFast(BaseModule): """Slowfast backbone. 
This module is proposed in `SlowFast Networks for Video Recognition @@ -403,57 +395,43 @@ class ResNet3dSlowFast(nn.Module): channel_ratio (int): Reduce the channel number of fast pathway by ``channel_ratio``, corresponding to :math:`\\beta` in the paper. Defaults to 8. - slow_pathway (dict or ConfigDict): Configuration of slow branch, should - contain necessary arguments for building the specific type of - pathway and: - type (str): type of backbone the pathway bases on. - lateral (bool): determine whether to build lateral connection - for the pathway. Defaults to - - .. code-block:: Python - - dict(type='ResNetPathway', - lateral=True, depth=50, pretrained=None, - conv1_kernel=(1, 7, 7), dilations=(1, 1, 1, 1), - conv1_stride_t=1, pool1_stride_t=1, inflate=(0, 0, 1, 1)) - - fast_pathway (dict or ConfigDict): Configuration of fast branch, - similar to ``slow_pathway``. Defaults to - - .. code-block:: Python - - dict(type='ResNetPathway', - lateral=False, depth=50, pretrained=None, base_channels=8, - conv1_kernel=(5, 7, 7), conv1_stride_t=1, pool1_stride_t=1) + slow_pathway (dict): Configuration of slow branch. Defaults to + ``dict(type='resnet3d', depth=50, pretrained=None, lateral=True, + conv1_kernel=(1, 7, 7), conv1_stride_t=1, pool1_stride_t=1, + inflate=(0, 0, 1, 1))``. + fast_pathway (dict): Configuration of fast branch. Defaults to + ``dict(type='resnet3d', depth=50, pretrained=None, lateral=False, + base_channels=8, conv1_kernel=(5, 7, 7), conv1_stride_t=1, + pool1_stride_t=1)``. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. """ - def __init__( - self, - pretrained, - resample_rate: int = 8, - speed_ratio: int = 8, - channel_ratio: int = 8, - slow_pathway: ConfigType = dict( - type='resnet3d', - depth=50, - pretrained=None, - lateral=True, - conv1_kernel=(1, 7, 7), - dilations=(1, 1, 1, 1), - conv1_stride_t=1, - pool1_stride_t=1, - inflate=(0, 0, 1, 1)), - fast_pathway: ConfigType = dict( - type='resnet3d', - depth=50, - pretrained=None, - lateral=False, - base_channels=8, - conv1_kernel=(5, 7, 7), - conv1_stride_t=1, - pool1_stride_t=1) - ) -> None: - super().__init__() + def __init__(self, + pretrained: Optional[str] = None, + resample_rate: int = 8, + speed_ratio: int = 8, + channel_ratio: int = 8, + slow_pathway: Dict = dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=True, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1)), + fast_pathway: Dict = dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1), + init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None: + super().__init__(init_cfg=init_cfg) self.pretrained = pretrained self.resample_rate = resample_rate self.speed_ratio = speed_ratio @@ -485,15 +463,15 @@ def init_weights(self, pretrained: Optional[str] = None) -> None: else: raise TypeError('pretrained must be a str or None') - def forward(self, x: Tensor) -> tuple: + def forward(self, x: torch.Tensor) -> tuple: """Defines the computation performed at every call. Args: - x (Tensor): The input data. + x (torch.Tensor): The input data. Returns: - Tuple[Tensor]: The feature of the input samples extracted - by the backbone. + tuple[torch.Tensor]: The feature of the input samples + extracted by the backbone. 
""" x_slow = nn.functional.interpolate( x, diff --git a/mmaction/models/backbones/resnet3d_slowonly.py b/mmaction/models/backbones/resnet3d_slowonly.py index 819063c0cd..3a2a3a3ac0 100644 --- a/mmaction/models/backbones/resnet3d_slowonly.py +++ b/mmaction/models/backbones/resnet3d_slowonly.py @@ -16,8 +16,6 @@ class ResNet3dSlowOnly(ResNet3dPathway): """SlowOnly backbone based on ResNet3dPathway. Args: - lateral (bool): Determines whether to enable the lateral connection - from another pathway. Defaults to False. conv1_kernel (Sequence[int]): Kernel size of the first conv layer. Defaults to ``(1, 7, 7)``. conv1_stride_t (int): Temporal stride of the first conv layer. @@ -30,8 +28,6 @@ class ResNet3dSlowOnly(ResNet3dPathway): """ def __init__(self, - *args, - lateral: bool = False, conv1_kernel: Sequence[int] = (1, 7, 7), conv1_stride_t: int = 1, pool1_stride_t: int = 1, @@ -39,8 +35,6 @@ def __init__(self, with_pool2: bool = False, **kwargs) -> None: super().__init__( - *args, - lateral=lateral, conv1_kernel=conv1_kernel, conv1_stride_t=conv1_stride_t, pool1_stride_t=pool1_stride_t, diff --git a/mmaction/models/backbones/rgbposeconv3d.py b/mmaction/models/backbones/rgbposeconv3d.py new file mode 100644 index 0000000000..6f54e3b6b5 --- /dev/null +++ b/mmaction/models/backbones/rgbposeconv3d.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Union + +import torch +import torch.nn as nn +from mmengine.logging import MMLogger, print_log +from mmengine.model import BaseModule +from mmengine.model.weight_init import constant_init, kaiming_init +from mmengine.runner.checkpoint import load_checkpoint +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmaction.registry import MODELS +from .resnet3d_slowfast import ResNet3dPathway + + +@MODELS.register_module() +class RGBPoseConv3D(BaseModule): + """RGBPoseConv3D backbone. + + Args: + pretrained (str): The file path to a pretrained model. + Defaults to None. + speed_ratio (int): Speed ratio indicating the ratio between time + dimension of the fast and slow pathway, corresponding to the + :math:`\\alpha` in the paper. Defaults to 4. + channel_ratio (int): Reduce the channel number of fast pathway + by ``channel_ratio``, corresponding to :math:`\\beta` in the paper. + Defaults to 4. + rgb_detach (bool): Whether to detach the gradients from the pose path. + Defaults to False. + pose_detach (bool): Whether to detach the gradients from the rgb path. + Defaults to False. + rgb_drop_path (float): The drop rate for dropping the features from + the pose path. Defaults to 0. + pose_drop_path (float): The drop rate for dropping the features from + the rgb path. Defaults to 0. + rgb_pathway (dict): Configuration of rgb branch. Defaults to + ``dict(num_stages=4, lateral=True, lateral_infl=1, + lateral_activate=(0, 0, 1, 1), fusion_kernel=7, base_channels=64, + conv1_kernel=(1, 7, 7), inflate=(0, 0, 1, 1), with_pool2=False)``. + pose_pathway (dict): Configuration of pose branch. Defaults to + ``dict(num_stages=3, stage_blocks=(4, 6, 3), lateral=True, + lateral_inv=True, lateral_infl=16, lateral_activate=(0, 1, 1), + fusion_kernel=7, in_channels=17, base_channels=32, + out_indices=(2, ), conv1_kernel=(1, 7, 7), conv1_stride_s=1, + conv1_stride_t=1, pool1_stride_s=1, pool1_stride_t=1, + inflate=(0, 1, 1), spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 1), with_pool2=False)``. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + pretrained: Optional[str] = None, + speed_ratio: int = 4, + channel_ratio: int = 4, + rgb_detach: bool = False, + pose_detach: bool = False, + rgb_drop_path: float = 0, + pose_drop_path: float = 0, + rgb_pathway: Dict = dict( + num_stages=4, + lateral=True, + lateral_infl=1, + lateral_activate=(0, 0, 1, 1), + fusion_kernel=7, + base_channels=64, + conv1_kernel=(1, 7, 7), + inflate=(0, 0, 1, 1), + with_pool2=False), + pose_pathway: Dict = dict( + num_stages=3, + stage_blocks=(4, 6, 3), + lateral=True, + lateral_inv=True, + lateral_infl=16, + lateral_activate=(0, 1, 1), + fusion_kernel=7, + in_channels=17, + base_channels=32, + out_indices=(2, ), + conv1_kernel=(1, 7, 7), + conv1_stride_s=1, + conv1_stride_t=1, + pool1_stride_s=1, + pool1_stride_t=1, + inflate=(0, 1, 1), + spatial_strides=(2, 2, 2), + temporal_strides=(1, 1, 1), + dilations=(1, 1, 1), + with_pool2=False), + init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.pretrained = pretrained + self.speed_ratio = speed_ratio + self.channel_ratio = channel_ratio + + if rgb_pathway['lateral']: + rgb_pathway['speed_ratio'] = speed_ratio + rgb_pathway['channel_ratio'] = channel_ratio + + if pose_pathway['lateral']: + pose_pathway['speed_ratio'] = speed_ratio + pose_pathway['channel_ratio'] = channel_ratio + + self.rgb_path = ResNet3dPathway(**rgb_pathway) + self.pose_path = ResNet3dPathway(**pose_pathway) + self.rgb_detach = rgb_detach + self.pose_detach = pose_detach + assert 0 <= rgb_drop_path <= 1 + assert 0 <= pose_drop_path <= 1 + self.rgb_drop_path = rgb_drop_path + self.pose_drop_path = pose_drop_path + + def init_weights(self) -> None: + """Initiate the parameters either from existing checkpoint or from + scratch.""" + for m in self.modules(): + if isinstance(m, nn.Conv3d): + kaiming_init(m) + elif isinstance(m, _BatchNorm): + constant_init(m, 1) + + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + msg = f'load model from: {self.pretrained}' + print_log(msg, logger=logger) + load_checkpoint(self, self.pretrained, strict=True, logger=logger) + elif self.pretrained is None: + # Init two branch separately. + self.rgb_path.init_weights() + self.pose_path.init_weights() + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, imgs: torch.Tensor, heatmap_imgs: torch.Tensor) -> tuple: + """Defines the computation performed at every call. + + Args: + imgs (torch.Tensor): The input data. + heatmap_imgs (torch.Tensor): The input data. + + Returns: + tuple[torch.Tensor]: The feature of the input + samples extracted by the backbone. + """ + if self.training: + rgb_drop_path = torch.rand(1) < self.rgb_drop_path + pose_drop_path = torch.rand(1) < self.pose_drop_path + else: + rgb_drop_path, pose_drop_path = False, False + # We assume base_channel for RGB and Pose are 64 and 32. 
+ x_rgb = self.rgb_path.conv1(imgs) + x_rgb = self.rgb_path.maxpool(x_rgb) + # N x 64 x 8 x 56 x 56 + x_pose = self.pose_path.conv1(heatmap_imgs) + x_pose = self.pose_path.maxpool(x_pose) + + x_rgb = self.rgb_path.layer1(x_rgb) + x_rgb = self.rgb_path.layer2(x_rgb) + x_pose = self.pose_path.layer1(x_pose) + + if hasattr(self.rgb_path, 'layer2_lateral'): + feat = x_pose.detach() if self.rgb_detach else x_pose + x_pose_lateral = self.rgb_path.layer2_lateral(feat) + if rgb_drop_path: + x_pose_lateral = x_pose_lateral.new_zeros(x_pose_lateral.shape) + + if hasattr(self.pose_path, 'layer1_lateral'): + feat = x_rgb.detach() if self.pose_detach else x_rgb + x_rgb_lateral = self.pose_path.layer1_lateral(feat) + if pose_drop_path: + x_rgb_lateral = x_rgb_lateral.new_zeros(x_rgb_lateral.shape) + + if hasattr(self.rgb_path, 'layer2_lateral'): + x_rgb = torch.cat((x_rgb, x_pose_lateral), dim=1) + + if hasattr(self.pose_path, 'layer1_lateral'): + x_pose = torch.cat((x_pose, x_rgb_lateral), dim=1) + + x_rgb = self.rgb_path.layer3(x_rgb) + x_pose = self.pose_path.layer2(x_pose) + + if hasattr(self.rgb_path, 'layer3_lateral'): + feat = x_pose.detach() if self.rgb_detach else x_pose + x_pose_lateral = self.rgb_path.layer3_lateral(feat) + if rgb_drop_path: + x_pose_lateral = x_pose_lateral.new_zeros(x_pose_lateral.shape) + + if hasattr(self.pose_path, 'layer2_lateral'): + feat = x_rgb.detach() if self.pose_detach else x_rgb + x_rgb_lateral = self.pose_path.layer2_lateral(feat) + if pose_drop_path: + x_rgb_lateral = x_rgb_lateral.new_zeros(x_rgb_lateral.shape) + + if hasattr(self.rgb_path, 'layer3_lateral'): + x_rgb = torch.cat((x_rgb, x_pose_lateral), dim=1) + + if hasattr(self.pose_path, 'layer2_lateral'): + x_pose = torch.cat((x_pose, x_rgb_lateral), dim=1) + + x_rgb = self.rgb_path.layer4(x_rgb) + x_pose = self.pose_path.layer3(x_pose) + + return x_rgb, x_pose diff --git a/mmaction/models/data_preprocessors/__init__.py b/mmaction/models/data_preprocessors/__init__.py index feccb87e2b..241f9b901a 100644 --- a/mmaction/models/data_preprocessors/__init__.py +++ b/mmaction/models/data_preprocessors/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from .data_preprocessor import ActionDataPreprocessor +from .multimodal_data_preprocessor import MultiModalDataPreprocessor -__all__ = ['ActionDataPreprocessor'] +__all__ = ['ActionDataPreprocessor', 'MultiModalDataPreprocessor'] diff --git a/mmaction/models/data_preprocessors/data_preprocessor.py b/mmaction/models/data_preprocessors/data_preprocessor.py index d2641bb6ab..5a11eefd3b 100644 --- a/mmaction/models/data_preprocessors/data_preprocessor.py +++ b/mmaction/models/data_preprocessors/data_preprocessor.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional, Sequence, Tuple, Union +from typing import List, Optional, Sequence, Tuple, Union import torch from mmengine.model import BaseDataPreprocessor, stack_batch from mmaction.registry import MODELS +from mmaction.utils.typing import SampleList @MODELS.register_module() @@ -12,13 +13,10 @@ class ActionDataPreprocessor(BaseDataPreprocessor): """Data pre-processor for action recognition tasks. Args: - mean (Sequence[float or int, optional): The pixel mean of channels + mean (Sequence[float or int], optional): The pixel mean of channels of images or stacked optical flow. Defaults to None. std (Sequence[float or int], optional): The pixel standard deviation of channels of images or stacked optical flow. Defaults to None. 
- pad_size_divisor (int): The size of padded image should be - divisible by ``pad_size_divisor``. Defaults to 1. - pad_value (float or int): The padded pixel value. Defaults to 0. to_rgb (bool): Whether to convert image from BGR to RGB. Defaults to False. blending (dict, optional): Config for batch blending. @@ -30,14 +28,10 @@ class ActionDataPreprocessor(BaseDataPreprocessor): def __init__(self, mean: Optional[Sequence[Union[float, int]]] = None, std: Optional[Sequence[Union[float, int]]] = None, - pad_size_divisor: int = 1, - pad_value: Union[float, int] = 0, to_rgb: bool = False, blending: Optional[dict] = None, format_shape: str = 'NCHW') -> None: super().__init__() - self.pad_size_divisor = pad_size_divisor - self.pad_value = pad_value self.to_rgb = to_rgb self.format_shape = format_shape @@ -49,7 +43,7 @@ def __init__(self, self._enable_normalize = True if self.format_shape == 'NCHW': normalizer_shape = (-1, 1, 1) - elif self.format_shape in ['NCTHW', 'NCTVM', 'MIX2d3d']: + elif self.format_shape in ['NCTHW', 'MIX2d3d']: normalizer_shape = (-1, 1, 1, 1) else: raise ValueError(f'Invalid format shape: {format_shape}') @@ -81,21 +75,21 @@ def forward(self, training (bool): Whether to enable training time augmentation. Returns: - dict or Tuple[dict]: Data in the same format as the model - input. + dict or Tuple[dict]: Data in the same format as the model input. """ + data = self.cast_data(data) if isinstance(data, dict): - return self.forward_onesample(data, training) + return self.forward_onesample(data, training=training) elif isinstance(data, tuple): outputs = [] for data_sample in data: - output = self.forward_onesample(data_sample, training) + output = self.forward_onesample(data_sample, training=training) outputs.append(output) return tuple(outputs) else: - raise TypeError('Unsupported data type for `data`!') + raise TypeError(f'Unsupported data type: {type(data)}!') - def forward_onesample(self, data: dict, training: bool = False) -> dict: + def forward_onesample(self, data, training: bool = False) -> dict: """Perform normalization, padding, bgr2rgb conversion and batch augmentation on one data sample. @@ -107,12 +101,18 @@ def forward_onesample(self, data: dict, training: bool = False) -> dict: dict: Data in the same format as the model input. """ - data = self.cast_data(data) inputs, data_samples = data['inputs'], data['data_samples'] + inputs, data_samples = self.preprocess(inputs, data_samples, training) + data['inputs'] = inputs + data['data_samples'] = data_samples + return data + def preprocess(self, + inputs: List[torch.Tensor], + data_samples: SampleList, + training: bool = False) -> Tuple: # --- Pad and stack -- - batch_inputs = stack_batch(inputs, self.pad_size_divisor, - self.pad_value) + batch_inputs = stack_batch(inputs) if self.format_shape == 'MIX2d3d': if batch_inputs.ndim == 4: @@ -147,5 +147,4 @@ def forward_onesample(self, data: dict, training: bool = False) -> dict: batch_inputs, data_samples = self.blending(batch_inputs, data_samples) - data['inputs'] = batch_inputs - return data + return batch_inputs, data_samples diff --git a/mmaction/models/data_preprocessors/multimodal_data_preprocessor.py b/mmaction/models/data_preprocessors/multimodal_data_preprocessor.py new file mode 100644 index 0000000000..1353c811d4 --- /dev/null +++ b/mmaction/models/data_preprocessors/multimodal_data_preprocessor.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
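# ---- Illustrative sketch (not part of this patch) -------------------------
# The MultiModalDataPreprocessor defined below dispatches every modality in
# ``inputs`` to its own preprocessor through the ``preprocess`` hook that
# ActionDataPreprocessor now exposes. A hypothetical config; the modality keys
# mirror the ``imgs``/``heatmap_imgs`` inputs used elsewhere in this patch and
# the normalization values are the usual ImageNet statistics, not values taken
# from a released config:
data_preprocessor = dict(
    type='MultiModalDataPreprocessor',
    preprocessors=dict(
        imgs=dict(
            type='ActionDataPreprocessor',
            mean=[123.675, 116.28, 103.53],
            std=[58.395, 57.12, 57.375],
            format_shape='NCTHW'),
        heatmap_imgs=dict(
            type='ActionDataPreprocessor',
            format_shape='NCTHW')))
# ----------------------------------------------------------------------------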
+from typing import Dict + +from mmengine.model import BaseDataPreprocessor, ModuleDict + +from mmaction.registry import MODELS + + +@MODELS.register_module() +class MultiModalDataPreprocessor(BaseDataPreprocessor): + """Multi-Modal data pre-processor for action recognition tasks.""" + + def __init__(self, preprocessors: Dict) -> None: + super().__init__() + self.preprocessors = ModuleDict() + for name, pre_cfg in preprocessors.items(): + assert 'type' in pre_cfg, ( + 'Each data preprocessor should contain the key type, ' + f'but got {pre_cfg}') + self.preprocessors[name] = MODELS.build(pre_cfg) + + def forward(self, data: Dict, training: bool = False) -> Dict: + """Preprocesses the data into the model input format. + + Args: + data (dict): Data returned by dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. + """ + data = self.cast_data(data) + inputs, data_samples = data['inputs'], data['data_samples'] + for modality, modality_data in inputs.items(): + preprocessor = self.preprocessors[modality] + modality_data, data_samples = preprocessor.preprocess( + modality_data, data_samples, training) + inputs[modality] = modality_data + + data['inputs'] = inputs + data['data_samples'] = data_samples + return data diff --git a/mmaction/models/heads/__init__.py b/mmaction/models/heads/__init__.py index 964f7b45e4..4cc8d20a4d 100644 --- a/mmaction/models/heads/__init__.py +++ b/mmaction/models/heads/__init__.py @@ -4,6 +4,7 @@ from .i3d_head import I3DHead from .mvit_head import MViTHead from .omni_head import OmniHead +from .rgbpose_head import RGBPoseHead from .slowfast_head import SlowFastHead from .timesformer_head import TimeSformerHead from .tpn_head import TPNHead @@ -16,5 +17,5 @@ __all__ = [ 'BaseHead', 'GCNHead', 'I3DHead', 'MViTHead', 'OmniHead', 'SlowFastHead', 'TPNHead', 'TRNHead', 'TSMHead', 'TSNAudioHead', 'TSNHead', - 'TimeSformerHead', 'X3DHead' + 'TimeSformerHead', 'X3DHead', 'RGBPoseHead' ] diff --git a/mmaction/models/heads/base.py b/mmaction/models/heads/base.py index 10ceae3dbb..8eafdc2cf2 100644 --- a/mmaction/models/heads/base.py +++ b/mmaction/models/heads/base.py @@ -1,18 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta, abstractmethod -from typing import Tuple, Union +from typing import Dict, Optional, Tuple, Union import torch import torch.nn as nn import torch.nn.functional as F from mmengine.model import BaseModule from mmengine.structures import LabelData -from torch import Tensor from mmaction.evaluation import top_k_accuracy from mmaction.registry import MODELS -from mmaction.utils import (ConfigType, LabelList, OptConfigType, - OptMultiConfig, SampleList) +from mmaction.utils import ForwardResults, SampleList class AvgConsensus(nn.Module): @@ -20,14 +18,14 @@ class AvgConsensus(nn.Module): Args: dim (int): Decide which dim consensus function to apply. - Default: 1. + Defaults to 1. """ def __init__(self, dim: int = 1) -> None: super().__init__() self.dim = dim - def forward(self, x: Tensor) -> Tensor: + def forward(self, x: torch.Tensor) -> torch.Tensor: """Defines the computation performed at every call.""" return x.mean(dim=self.dim, keepdim=True) @@ -37,35 +35,34 @@ class BaseHead(BaseModule, metaclass=ABCMeta): All Head should subclass it. All subclass should overwrite: - - :meth:`init_weights`, initializing weights in some modules. - :meth:`forward`, supporting to forward both for training and testing. 
Args: num_classes (int): Number of classes to be classified. in_channels (int): Number of channels in input feature. - loss_cls (dict or ConfigDict): Config for building loss. - Default: dict(type='CrossEntropyLoss', loss_weight=1.0). + loss_cls (dict): Config for building loss. + Defaults to ``dict(type='CrossEntropyLoss', loss_weight=1.0)``. multi_class (bool): Determines whether it is a multi-class - recognition task. Default: False. + recognition task. Defaults to False. label_smooth_eps (float): Epsilon used in label smooth. - Reference: arxiv.org/abs/1906.02629. Default: 0. - topk (int or tuple): Top-k accuracy. Default: (1, 5). - average_clips (dict or ConfigDict, optional): Config for - averaging class scores over multiple clips. Default: None. - init_cfg (dict or ConfigDict, optional): Config to control the - initialization. Defaults to None. + Reference: arxiv.org/abs/1906.02629. Defaults to 0. + topk (int or tuple): Top-k accuracy. Defaults to ``(1, 5)``. + average_clips (dict, optional): Config for averaging class + scores over multiple clips. Defaults to None. + init_cfg (dict, optional): Config to control the initialization. + Defaults to None. """ def __init__(self, num_classes: int, in_channels: int, - loss_cls: ConfigType = dict( + loss_cls: Dict = dict( type='CrossEntropyLoss', loss_weight=1.0), multi_class: bool = False, label_smooth_eps: float = 0.0, topk: Union[int, Tuple[int]] = (1, 5), - average_clips: OptConfigType = None, - init_cfg: OptMultiConfig = None) -> None: + average_clips: Optional[Dict] = None, + init_cfg: Optional[Dict] = None) -> None: super(BaseHead, self).__init__(init_cfg=init_cfg) self.num_classes = num_classes self.in_channels = in_channels @@ -81,18 +78,19 @@ def __init__(self, self.topk = topk @abstractmethod - def forward(self, x, **kwargs) -> Tensor: + def forward(self, x, **kwargs) -> ForwardResults: """Defines the computation performed at every call.""" raise NotImplementedError - def loss(self, feats: Union[Tensor, Tuple[Tensor]], - data_samples: SampleList, **kwargs) -> dict: + def loss(self, feats: Union[torch.Tensor, Tuple[torch.Tensor]], + data_samples: SampleList, **kwargs) -> Dict: """Perform forward propagation of head and loss calculation on the features of the upstream network. Args: - feats (Tensor or Tuple[Tensor]): Features from upstream network. - data_samples (List[:obj:`ActionDataSample`]): The batch + feats (torch.Tensor | tuple[torch.Tensor]): Features from + upstream network. + data_samples (list[:obj:`ActionDataSample`]): The batch data samples. Returns: @@ -101,14 +99,14 @@ def loss(self, feats: Union[Tensor, Tuple[Tensor]], cls_scores = self(feats, **kwargs) return self.loss_by_feat(cls_scores, data_samples) - def loss_by_feat(self, cls_scores: Union[Tensor, Tuple[Tensor]], - data_samples: SampleList) -> dict: + def loss_by_feat(self, cls_scores: torch.Tensor, + data_samples: SampleList) -> Dict: """Calculate the loss based on the features extracted by the head. Args: - cls_scores (Tensor): Classification prediction results of + cls_scores (torch.Tensor): Classification prediction results of all class, has shape (batch_size, num_classes). - data_samples (List[:obj:`ActionDataSample`]): The batch + data_samples (list[:obj:`ActionDataSample`]): The batch data samples. 
Returns: @@ -149,32 +147,33 @@ def loss_by_feat(self, cls_scores: Union[Tensor, Tuple[Tensor]], losses['loss_cls'] = loss_cls return losses - def predict(self, feats: Union[Tensor, Tuple[Tensor]], - data_samples: SampleList, **kwargs) -> LabelList: + def predict(self, feats: Union[torch.Tensor, Tuple[torch.Tensor]], + data_samples: SampleList, **kwargs) -> SampleList: """Perform forward propagation of head and predict recognition results on the features of the upstream network. Args: - feats (Tensor or Tuple[Tensor]): Features from upstream network. - data_samples (List[:obj:`ActionDataSample`]): The batch + feats (torch.Tensor | tuple[torch.Tensor]): Features from + upstream network. + data_samples (list[:obj:`ActionDataSample`]): The batch data samples. Returns: - List[:obj:`ActionDataSample`]: Recognition results wrapped + list[:obj:`ActionDataSample`]: Recognition results wrapped by :obj:`ActionDataSample`. """ cls_scores = self(feats, **kwargs) return self.predict_by_feat(cls_scores, data_samples) - def predict_by_feat(self, cls_scores: Tensor, - data_samples: SampleList) -> LabelList: + def predict_by_feat(self, cls_scores: torch.Tensor, + data_samples: SampleList) -> SampleList: """Transform a batch of output features extracted from the head into prediction results. Args: - cls_scores (Tensor): Classification scores, has a shape - (num_classes, ) - data_samples (List[:obj:`ActionDataSample`]): The + cls_scores (torch.Tensor): Classification scores, has a shape + (B*num_segs, num_classes) + data_samples (list[:obj:`ActionDataSample`]): The annotation data of every samples. It usually includes information such as `gt_labels`. @@ -186,15 +185,17 @@ def predict_by_feat(self, cls_scores: Tensor, cls_scores = self.average_clip(cls_scores, num_segs=num_segs) pred_labels = cls_scores.argmax(dim=-1, keepdim=True).detach() - for data_sample, score, pred_lable in zip(data_samples, cls_scores, + for data_sample, score, pred_label in zip(data_samples, cls_scores, pred_labels): prediction = LabelData(item=score) - pred_label = LabelData(item=pred_lable) + pred_label = LabelData(item=pred_label) data_sample.pred_scores = prediction data_sample.pred_labels = pred_label return data_samples - def average_clip(self, cls_scores: Tensor, num_segs: int = 1) -> Tensor: + def average_clip(self, + cls_scores: torch.Tensor, + num_segs: int = 1) -> torch.Tensor: """Averaging class scores over multiple clips. Using different averaging types ('score' or 'prob' or None, @@ -202,11 +203,11 @@ def average_clip(self, cls_scores: Tensor, num_segs: int = 1) -> Tensor: class score. Only called in test mode. Args: - cls_scores (Tensor): Class scores to be averaged. + cls_scores (torch.Tensor): Class scores to be averaged. num_segs (int): Number of clips for each input sample. Returns: - Tensor: Averaged class scores. + torch.Tensor: Averaged class scores. """ if self.average_clips not in ['score', 'prob', None]: diff --git a/mmaction/models/heads/rgbpose_head.py b/mmaction/models/heads/rgbpose_head.py new file mode 100644 index 0000000000..69da4efed9 --- /dev/null +++ b/mmaction/models/heads/rgbpose_head.py @@ -0,0 +1,240 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Dict, List, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model.weight_init import normal_init +from mmengine.structures import LabelData + +from mmaction.evaluation import top_k_accuracy +from mmaction.registry import MODELS +from mmaction.utils import SampleList +from .base import BaseHead + + +@MODELS.register_module() +class RGBPoseHead(BaseHead): + """The classification head for RGBPoseConv3D. + + Args: + num_classes (int): Number of classes to be classified. + in_channels (tuple[int]): Number of channels in input feature. + loss_cls (dict): Config for building loss. + Defaults to ``dict(type='CrossEntropyLoss')``. + loss_components (list[str]): The components of the loss. + Defaults to ``['rgb', 'pose']``. + loss_weights (float or tuple[float]): The weights of the losses. + Defaults to 1. + dropout (float): Probability of dropout layer. Default: 0.5. + init_std (float): Std value for Initiation. Default: 0.01. + """ + + def __init__(self, + num_classes: int, + in_channels: Tuple[int], + loss_cls: Dict = dict(type='CrossEntropyLoss'), + loss_components: List[str] = ['rgb', 'pose'], + loss_weights: Union[float, Tuple[float]] = 1., + dropout: float = 0.5, + init_std: float = 0.01, + **kwargs) -> None: + super().__init__(num_classes, in_channels, loss_cls, **kwargs) + if isinstance(dropout, float): + dropout = {'rgb': dropout, 'pose': dropout} + assert isinstance(dropout, dict) + + if loss_components is not None: + self.loss_components = loss_components + if isinstance(loss_weights, float): + loss_weights = [loss_weights] * len(loss_components) + assert len(loss_weights) == len(loss_components) + self.loss_weights = loss_weights + + self.dropout = dropout + self.init_std = init_std + + self.dropout_rgb = nn.Dropout(p=self.dropout['rgb']) + self.dropout_pose = nn.Dropout(p=self.dropout['pose']) + + self.fc_rgb = nn.Linear(self.in_channels[0], num_classes) + self.fc_pose = nn.Linear(self.in_channels[1], num_classes) + self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1)) + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + normal_init(self.fc_rgb, std=self.init_std) + normal_init(self.fc_pose, std=self.init_std) + + def forward(self, x: Tuple[torch.Tensor]) -> Dict: + """Defines the computation performed at every call.""" + x_rgb, x_pose = self.avg_pool(x[0]), self.avg_pool(x[1]) + x_rgb = x_rgb.view(x_rgb.size(0), -1) + x_pose = x_pose.view(x_pose.size(0), -1) + + x_rgb = self.dropout_rgb(x_rgb) + x_pose = self.dropout_pose(x_pose) + + cls_scores = dict() + cls_scores['rgb'] = self.fc_rgb(x_rgb) + cls_scores['pose'] = self.fc_pose(x_pose) + + return cls_scores + + def loss(self, feats: Tuple[torch.Tensor], data_samples: SampleList, + **kwargs) -> Dict: + """Perform forward propagation of head and loss calculation on the + features of the upstream network. + + Args: + feats (tuple[torch.Tensor]): Features from upstream network. + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + dict: A dictionary of loss components. + """ + cls_scores = self(feats, **kwargs) + return self.loss_by_feat(cls_scores, data_samples) + + def loss_by_feat(self, cls_scores: Dict[str, torch.Tensor], + data_samples: SampleList) -> Dict: + """Calculate the loss based on the features extracted by the head. + + Args: + cls_scores (dict[str, torch.Tensor]): The dict of + classification scores, + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. 
+ + Returns: + dict: A dictionary of loss components. + """ + labels = torch.stack([x.gt_labels.item for x in data_samples]) + labels = labels.squeeze() + + if labels.shape == torch.Size([]): + labels = labels.unsqueeze(0) + elif labels.dim() == 1 and labels.size()[0] == self.num_classes \ + and cls_scores.size()[0] == 1: + # Fix a bug when training with soft labels and batch size is 1. + # When using soft labels, `labels` and `cls_score` share the same + # shape. + labels = labels.unsqueeze(0) + + losses = dict() + for loss_name, weight in zip(self.loss_components, self.loss_weights): + cls_score = cls_scores[loss_name] + loss_cls = self.loss_by_scores(cls_score, labels) + loss_cls = {loss_name + '_' + k: v for k, v in loss_cls.items()} + loss_cls[f'{loss_name}_loss_cls'] *= weight + losses.update(loss_cls) + return losses + + def loss_by_scores(self, cls_scores: torch.Tensor, + labels: torch.Tensor) -> Dict: + """Calculate the loss based on the features extracted by the head. + + Args: + cls_scores (torch.Tensor): Classification prediction + results of all class, has shape (batch_size, num_classes). + labels (torch.Tensor): The labels used to calculate the loss. + + Returns: + dict: A dictionary of loss components. + """ + losses = dict() + if cls_scores.size() != labels.size(): + top_k_acc = top_k_accuracy(cls_scores.detach().cpu().numpy(), + labels.detach().cpu().numpy(), + self.topk) + for k, a in zip(self.topk, top_k_acc): + losses[f'top{k}_acc'] = torch.tensor( + a, device=cls_scores.device) + if self.label_smooth_eps != 0: + if cls_scores.size() != labels.size(): + labels = F.one_hot(labels, num_classes=self.num_classes) + labels = ((1 - self.label_smooth_eps) * labels + + self.label_smooth_eps / self.num_classes) + + loss_cls = self.loss_cls(cls_scores, labels) + # loss_cls may be dictionary or single tensor + if isinstance(loss_cls, dict): + losses.update(loss_cls) + else: + losses['loss_cls'] = loss_cls + return losses + + def predict(self, feats: Tuple[torch.Tensor], data_samples: SampleList, + **kwargs) -> SampleList: + """Perform forward propagation of head and predict recognition results + on the features of the upstream network. + + Args: + feats (tuple[torch.Tensor]): Features from upstream network. + data_samples (list[:obj:`ActionDataSample`]): The batch + data samples. + + Returns: + list[:obj:`ActionDataSample`]: Recognition results wrapped + by :obj:`ActionDataSample`. + """ + cls_scores = self(feats, **kwargs) + return self.predict_by_feat(cls_scores, data_samples) + + def predict_by_feat(self, cls_scores: Dict[str, torch.Tensor], + data_samples: SampleList) -> SampleList: + """Transform a batch of output features extracted from the head into + prediction results. + + Args: + cls_scores (dict[str, torch.Tensor]): The dict of + classification scores, + data_samples (list[:obj:`ActionDataSample`]): The + annotation data of every samples. It usually includes + information such as `gt_labels`. + + Returns: + list[:obj:`ActionDataSample`]: Recognition results wrapped + by :obj:`ActionDataSample`. 
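# ---- Illustrative sketch (not part of this patch) -------------------------
# RGBPoseHead keeps one dropout + fc pair per stream and returns a dict of
# class scores, as shown above. A rough shape check, assuming this patch is
# applied; the channel pair (2048, 512) and num_classes=60 are example values:
import torch
from mmaction.models.heads.rgbpose_head import RGBPoseHead

head = RGBPoseHead(num_classes=60, in_channels=(2048, 512))
head.init_weights()
feats = (torch.randn(2, 2048, 8, 7, 7), torch.randn(2, 512, 32, 7, 7))
scores = head(feats)
assert scores['rgb'].shape == (2, 60) and scores['pose'].shape == (2, 60)
# ----------------------------------------------------------------------------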
+ """ + pred_scores = [LabelData() for _ in range(len(data_samples))] + pred_labels = [LabelData() for _ in range(len(data_samples))] + + for name in self.loss_components: + cls_score = cls_scores[name] + cls_score, pred_label = \ + self.predict_by_scores(cls_score, data_samples) + for pred_score, pred_label, score, label in zip( + pred_scores, pred_labels, cls_score, pred_label): + pred_score.set_data({f'{name}': score}) + pred_label.set_data({f'{name}': label}) + + for data_sample, pred_score, pred_label in zip(data_samples, + pred_scores, + pred_labels): + data_sample.pred_scores = pred_score + data_sample.pred_labels = pred_label + + return data_samples + + def predict_by_scores(self, cls_scores: torch.Tensor, + data_samples: SampleList) -> Tuple: + """Transform a batch of output features extracted from the head into + prediction results. + + Args: + cls_scores (torch.Tensor): Classification scores, has a shape + (B*num_segs, num_classes) + data_samples (list[:obj:`ActionDataSample`]): The annotation + data of every samples. + + Returns: + tuple: A tuple of the averaged classification scores and + prediction labels. + """ + + num_segs = cls_scores.shape[0] // len(data_samples) + cls_scores = self.average_clip(cls_scores, num_segs=num_segs) + pred_labels = cls_scores.argmax(dim=-1, keepdim=True).detach() + return cls_scores, pred_labels diff --git a/mmaction/models/recognizers/__init__.py b/mmaction/models/recognizers/__init__.py index 1b7db21451..447f6333dc 100644 --- a/mmaction/models/recognizers/__init__.py +++ b/mmaction/models/recognizers/__init__.py @@ -2,11 +2,12 @@ from .base import BaseRecognizer from .recognizer2d import Recognizer2D from .recognizer3d import Recognizer3D +from .recognizer3d_mm import MMRecognizer3D from .recognizer_audio import RecognizerAudio from .recognizer_gcn import RecognizerGCN from .recognizer_omni import RecognizerOmni __all__ = [ 'BaseRecognizer', 'RecognizerGCN', 'Recognizer2D', 'Recognizer3D', - 'RecognizerAudio', 'RecognizerOmni' + 'RecognizerAudio', 'RecognizerOmni', 'MMRecognizer3D' ] diff --git a/mmaction/models/recognizers/recognizer3d_mm.py b/mmaction/models/recognizers/recognizer3d_mm.py new file mode 100644 index 0000000000..1d7099b3c3 --- /dev/null +++ b/mmaction/models/recognizers/recognizer3d_mm.py @@ -0,0 +1,50 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Tuple + +import torch + +from mmaction.registry import MODELS +from mmaction.utils.typing import OptSampleList +from .base import BaseRecognizer + + +@MODELS.register_module() +class MMRecognizer3D(BaseRecognizer): + """Multi-modal 3D recognizer model framework.""" + + def extract_feat(self, + inputs: Dict[str, torch.Tensor], + stage: str = 'backbone', + data_samples: OptSampleList = None, + test_mode: bool = False) -> Tuple: + """Extract features. + + Args: + inputs (dict[str, torch.Tensor]): The multi-modal input data. + stage (str): Which stage to output the feature. + Defaults to ``'backbone'``. + data_samples (list[:obj:`ActionDataSample`], optional): Action data + samples, which are only needed in training. Defaults to None. + test_mode (bool): Whether in test mode. Defaults to False. + + Returns: + tuple[torch.Tensor]: The extracted features. + dict: A dict recording the kwargs for downstream + pipeline. 
+ """ + # [N, num_views, C, T, H, W] -> + # [N * num_views, C, T, H, W] + for m, m_data in inputs.items(): + m_data = m_data.reshape((-1, ) + m_data.shape[2:]) + inputs[m] = m_data + + # Record the kwargs required by `loss` and `predict` + loss_predict_kwargs = dict() + + x = self.backbone(**inputs) + if stage == 'backbone': + return x, loss_predict_kwargs + + if self.with_cls_head and stage == 'head': + x = self.cls_head(x, **loss_predict_kwargs) + return x, loss_predict_kwargs diff --git a/mmaction/models/utils/blending_utils.py b/mmaction/models/utils/blending_utils.py index 64808d32f7..babea75d05 100644 --- a/mmaction/models/utils/blending_utils.py +++ b/mmaction/models/utils/blending_utils.py @@ -1,11 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta, abstractmethod -from typing import Union +from typing import List, Optional, Tuple, Union import numpy as np import torch import torch.nn.functional as F -from torch import Tensor from torch.distributions.beta import Beta from mmaction.registry import MODELS @@ -25,38 +24,39 @@ def __init__(self, num_classes: int) -> None: self.num_classes = num_classes @abstractmethod - def do_blending(self, imgs: Tensor, label: Tensor, **kwargs) -> tuple: + def do_blending(self, imgs: torch.Tensor, label: torch.Tensor, + **kwargs) -> Tuple: """Blending images process.""" raise NotImplementedError - def __call__(self, imgs: Tensor, batch_data_samples: SampleList, - **kwargs) -> tuple: + def __call__(self, imgs: torch.Tensor, batch_data_samples: SampleList, + **kwargs) -> Tuple: """Blending data in a mini-batch. Images are float tensors with the shape of (B, N, C, H, W) for 2D recognizers or (B, N, C, T, H, W) for 3D recognizers. Besides, labels are converted from hard labels to soft labels. - Hard labels are integer tensors with the shape of (B, 1) and all of the + Hard labels are integer tensors with the shape of (B, ) and all of the elements are in the range [0, num_classes - 1]. - Soft labels (probablity distribution over classes) are float tensors - with the shape of (B, 1, num_classes) and all of the elements are in + Soft labels (probability distribution over classes) are float tensors + with the shape of (B, num_classes) and all of the elements are in the range [0, 1]. Args: - imgs (Tensor): Model input images, float tensor with the + imgs (torch.Tensor): Model input images, float tensor with the shape of (B, N, C, H, W) or (B, N, C, T, H, W). batch_data_samples (List[:obj:`ActionDataSample`]): The batch data samples. It usually includes information such as `gt_labels`. Returns: - mixed_imgs (Tensor): Blending images, float tensor with the + mixed_imgs (torch.Tensor): Blending images, float tensor with the same shape of the input imgs. batch_data_samples (List[:obj:`ActionDataSample`]): The modified batch data samples. ``gt_labels`` in each data sample are converted from a hard label to a blended soft label, float - tensor with the shape of (1, num_classes) and all elements are + tensor with the shape of (num_classes, ) and all elements are in range [0, 1]. """ label = [x.gt_labels.item for x in batch_data_samples] @@ -90,13 +90,14 @@ def __init__(self, num_classes: int, alpha: float = .2) -> None: super().__init__(num_classes=num_classes) self.beta = Beta(alpha, alpha) - def do_blending(self, imgs: Tensor, label: Tensor, **kwargs) -> tuple: + def do_blending(self, imgs: torch.Tensor, label: torch.Tensor, + **kwargs) -> Tuple: """Blending images with mixup. 
Args: - imgs (Tensor): Model input images, float tensor with the + imgs (torch.Tensor): Model input images, float tensor with the shape of (B, N, C, H, W) or (B, N, C, T, H, W). - label (Tensor): One hot labels, integer tensor with the shape + label (torch.Tensor): One hot labels, integer tensor with the shape of (B, num_classes). Returns: @@ -132,7 +133,7 @@ def __init__(self, num_classes: int, alpha: float = .2) -> None: self.beta = Beta(alpha, alpha) @staticmethod - def rand_bbox(img_size: torch.Size, lam: Tensor) -> tuple: + def rand_bbox(img_size: torch.Size, lam: torch.Tensor) -> Tuple: """Generate a random boudning box.""" w = img_size[-1] h = img_size[-2] @@ -151,13 +152,14 @@ def rand_bbox(img_size: torch.Size, lam: Tensor) -> tuple: return bbx1, bby1, bbx2, bby2 - def do_blending(self, imgs: Tensor, label: Tensor, **kwargs) -> tuple: + def do_blending(self, imgs: torch.Tensor, label: torch.Tensor, + **kwargs) -> Tuple: """Blending images with cutmix. Args: - imgs (Tensor): Model input images, float tensor with the + imgs (torch.Tensor): Model input images, float tensor with the shape of (B, N, C, H, W) or (B, N, C, T, H, W). - label (Tensor): One hot labels, integer tensor with the shape + label (torch.Tensor): One hot labels, integer tensor with the shape of (B, num_classes). Returns: @@ -209,7 +211,9 @@ class RandomBatchAugment(BaseMiniBatchBlending): and to do nothing is 0.2. """ - def __init__(self, augments: Union[dict, list], probs=None): + def __init__(self, + augments: Union[dict, list], + probs: Optional[Union[float, List[float]]] = None) -> None: if not isinstance(augments, (tuple, list)): augments = [augments] @@ -235,7 +239,8 @@ def __init__(self, augments: Union[dict, list], probs=None): self.probs = probs - def do_blending(self, imgs: Tensor, label: Tensor, **kwargs) -> tuple: + def do_blending(self, imgs: torch.Tensor, label: torch.Tensor, + **kwargs) -> Tuple: """Randomly apply batch augmentations to the batch inputs and batch data samples.""" aug_index = np.random.choice(len(self.augments), p=self.probs) diff --git a/tests/datasets/transforms/test_formating.py b/tests/datasets/transforms/test_formating.py index 842d2dbf27..8e741c24e5 100644 --- a/tests/datasets/transforms/test_formating.py +++ b/tests/datasets/transforms/test_formating.py @@ -101,8 +101,8 @@ def test_repr(self): type='PackActionInputs', meta_keys=['flip_direction', 'img_shape']) transform = TRANSFORMS.build(cfg) self.assertEqual( - repr(transform), - "PackActionInputs(meta_keys=['flip_direction', 'img_shape'])") + repr(transform), 'PackActionInputs(collect_keys=None, ' + "meta_keys=['flip_direction', 'img_shape'])") class TestPackLocalizationInputs(unittest.TestCase): @@ -184,8 +184,24 @@ def test_format_shape(): target_keys = ['imgs', 'input_shape'] assert assert_dict_has_keys(results, target_keys) - assert repr(format_shape) == format_shape.__class__.__name__ + \ - "(input_format='NCTHW')" + # `NCTHW` input format with imgs and heatmap_imgs + results = dict( + imgs=np.random.randn(6, 224, 224, 3), + heatmap_imgs=np.random.randn(12, 17, 56, 56), + num_clips=2, + clip_len=dict(RGB=3, Pose=6)) + + results = format_shape(results) + assert results['input_shape'] == (2, 3, 3, 224, 224) + assert results['heatmap_input_shape'] == (2, 17, 6, 56, 56) + + assert repr(format_shape) == "FormatShape(input_format='NCTHW')" + + # `NCTHW_Heatmap` input format + results = dict( + imgs=np.random.randn(12, 17, 56, 56), num_clips=2, clip_len=6) + format_shape = FormatShape('NCTHW_Heatmap') + assert 
format_shape(results)['input_shape'] == (2, 17, 6, 56, 56) # `NCHW_Flow` input format results = dict(imgs=np.random.randn(6, 224, 224), num_clips=1, clip_len=3) diff --git a/tests/datasets/transforms/test_loading.py b/tests/datasets/transforms/test_loading.py index 5413475a92..035a2213cc 100644 --- a/tests/datasets/transforms/test_loading.py +++ b/tests/datasets/transforms/test_loading.py @@ -260,21 +260,23 @@ def test_pims_decode(self): video_result['frame_inds']), 256, 340, 3) def test_decord_init(self): - target_keys = ['video_reader', 'total_frames'] + target_keys = ['video_reader', 'total_frames', 'avg_fps'] video_result = copy.deepcopy(self.video_results) decord_init = DecordInit() decord_init_result = decord_init(video_result) assert assert_dict_has_keys(decord_init_result, target_keys) assert decord_init_result['total_frames'] == len( decord_init_result['video_reader']) + assert decord_init_result['avg_fps'] == 30 + assert repr(decord_init) == (f'{decord_init.__class__.__name__}(' f'io_backend=disk, ' - f'num_threads={1})') + f'num_threads=1)') def test_decord_decode(self): target_keys = ['frame_inds', 'imgs', 'original_shape'] - # test Decord with 2 dim input and start_index = 0 + # test Decord with 2 dim input using accurate mode video_result = copy.deepcopy(self.video_results) video_result['frame_inds'] = np.arange(0, self.total_frames, 3)[:, np.newaxis] @@ -289,7 +291,7 @@ def test_decord_decode(self): assert np.shape(decord_decode_result['imgs']) == (len( video_result['frame_inds']), 256, 340, 3) - # test Decord with 1 dim input and start_index = 0 + # test Decord with 1 dim input using accurate mode video_result = copy.deepcopy(self.video_results) video_result['frame_inds'] = np.arange(0, self.total_frames, 3) decord_init = DecordInit() @@ -303,7 +305,7 @@ def test_decord_decode(self): assert np.shape(decord_decode_result['imgs']) == (len( video_result['frame_inds']), 256, 340, 3) - # test Decord with 2 dim input and start_index = 0 + # test Decord with 2 dim input using efficient mode video_result = copy.deepcopy(self.video_results) video_result['frame_inds'] = np.arange(0, self.total_frames, 3)[:, np.newaxis] @@ -311,14 +313,14 @@ def test_decord_decode(self): decord_init_result = decord_init(video_result) video_result['video_reader'] = decord_init_result['video_reader'] - decord_decode = DecordDecode() + decord_decode = DecordDecode(mode='efficient') decord_decode_result = decord_decode(video_result) assert assert_dict_has_keys(decord_decode_result, target_keys) assert decord_decode_result['original_shape'] == (256, 340) assert np.shape(decord_decode_result['imgs']) == (len( video_result['frame_inds']), 256, 340, 3) - # test Decord with 1 dim input + # test Decord with 1 dim input using efficient mode video_result = copy.deepcopy(self.video_results) video_result['frame_inds'] = np.arange(1, self.total_frames, 3) decord_init = DecordInit() diff --git a/tests/datasets/transforms/test_pose_transforms.py b/tests/datasets/transforms/test_pose_transforms.py index d65d450124..913447f938 100644 --- a/tests/datasets/transforms/test_pose_transforms.py +++ b/tests/datasets/transforms/test_pose_transforms.py @@ -13,10 +13,11 @@ from mmaction.datasets.transforms import (GeneratePoseTarget, GenSkeFeat, JointToBone, LoadKineticsPose, - MergeSkeFeat, PadTo, PoseCompact, - PoseDecode, PreNormalize2D, - PreNormalize3D, ToMotion, - UniformSampleFrames) + MergeSkeFeat, MMCompact, MMDecode, + MMUniformSampleFrames, PadTo, + PoseCompact, PoseDecode, + PreNormalize2D, PreNormalize3D, + ToMotion, 
UniformSampleFrames) class TestPoseTransforms: @@ -126,23 +127,29 @@ def test_generate_pose_target(): modality='Pose') generate_pose_target = GeneratePoseTarget( - sigma=1, with_kp=True, left_kp=(0, ), right_kp=(1, ), skeletons=()) + sigma=1, + with_kp=True, + left_kp=(1, ), + right_kp=(2, ), + left_limb=(0, ), + right_limb=(1, ), + skeletons=()) assert str(generate_pose_target) == ('GeneratePoseTarget(sigma=1, ' 'use_score=True, with_kp=True, ' 'with_limb=False, skeletons=(), ' - 'double=False, left_kp=(0,), ' - 'right_kp=(1,))') - return_results = generate_pose_target(results) - assert return_results['imgs'].shape == (8, 64, 64, 3) + 'double=False, left_kp=(1,), ' + 'right_kp=(2,), left_limb=(0,), ' + 'right_limb=(1,), scaling=1.0)') + return_results = generate_pose_target(copy.deepcopy(results)) + assert return_results['imgs'].shape == (8, 3, 64, 64) assert_array_almost_equal(return_results['imgs'][0], return_results['imgs'][1]) results = dict(img_shape=img_shape, keypoint=kp, modality='Pose') - generate_pose_target = GeneratePoseTarget( - sigma=1, with_kp=True, left_kp=(0, ), right_kp=(1, ), skeletons=()) - return_results = generate_pose_target(results) - assert return_results['imgs'].shape == (8, 64, 64, 3) + generate_pose_target = GeneratePoseTarget(sigma=1, with_kp=True) + return_results = generate_pose_target(copy.deepcopy(results)) + assert return_results['imgs'].shape == (8, 3, 64, 64) assert_array_almost_equal(return_results['imgs'][0], return_results['imgs'][1]) @@ -150,37 +157,23 @@ def test_generate_pose_target(): sigma=1, with_kp=False, with_limb=True, - left_kp=(0, ), - right_kp=(1, ), skeletons=((0, 1), (1, 2), (0, 2))) - return_results = generate_pose_target(results) - assert return_results['imgs'].shape == (8, 64, 64, 3) + return_results = generate_pose_target(copy.deepcopy(results)) + assert return_results['imgs'].shape == (8, 3, 64, 64) assert_array_almost_equal(return_results['imgs'][0], return_results['imgs'][1]) generate_pose_target = GeneratePoseTarget( sigma=1, - with_kp=True, - with_limb=True, - left_kp=(0, ), - right_kp=(1, ), - skeletons=((0, 1), (1, 2), (0, 2))) - return_results = generate_pose_target(results) - assert return_results['imgs'].shape == (8, 64, 64, 6) - assert_array_almost_equal(return_results['imgs'][0], - return_results['imgs'][1]) - - generate_pose_target = GeneratePoseTarget( - sigma=1, - with_kp=True, + with_kp=False, with_limb=True, double=True, - left_kp=(0, ), - right_kp=(1, ), + left_limb=(0, ), + right_limb=(1, ), skeletons=((0, 1), (1, 2), (0, 2))) - return_results = generate_pose_target(results) + return_results = generate_pose_target(copy.deepcopy(results)) imgs = return_results['imgs'] - assert imgs.shape == (16, 64, 64, 6) + assert imgs.shape == (16, 3, 64, 64) assert_array_almost_equal(imgs[0], imgs[1]) assert_array_almost_equal(imgs[:8, 2], imgs[8:, 2, :, ::-1]) assert_array_almost_equal(imgs[:8, 0], imgs[8:, 1, :, ::-1]) @@ -197,8 +190,8 @@ def test_generate_pose_target(): keypoint_score=kpscore, modality='Pose') generate_pose_target = GeneratePoseTarget( - sigma=1, with_kp=True, left_kp=(0, ), right_kp=(1, ), skeletons=()) - return_results = generate_pose_target(results) + sigma=1, with_kp=True, skeletons=()) + return_results = generate_pose_target(copy.deepcopy(results)) assert_array_almost_equal(return_results['imgs'], 0) img_shape = (64, 64) @@ -215,10 +208,8 @@ def test_generate_pose_target(): sigma=1, with_kp=False, with_limb=True, - left_kp=(0, ), - right_kp=(1, ), skeletons=((0, 1), (1, 2), (0, 2))) - return_results = 
generate_pose_target(results) + return_results = generate_pose_target(copy.deepcopy(results)) assert_array_almost_equal(return_results['imgs'], 0) img_shape = (64, 64) @@ -231,13 +222,12 @@ def test_generate_pose_target(): keypoint=kp, keypoint_score=kpscore, modality='Pose') - generate_pose_target = GeneratePoseTarget( - sigma=1, with_kp=True, left_kp=(0, ), right_kp=(1, ), skeletons=()) - return_results = generate_pose_target(results) + generate_pose_target = GeneratePoseTarget(sigma=1, with_kp=True) + return_results = generate_pose_target(copy.deepcopy(results)) assert_array_almost_equal(return_results['imgs'], 0) img_shape = (64, 64) - kp = np.array([[[[124, 124], [140, 140], [124, 140]]]]) + kp = np.array([[[[124., 124.], [140., 140.], [124., 140.]]]]) kpscore = np.array([[[0., 0., 0.]]]) kp = np.concatenate([kp] * 8, axis=1) kpscore = np.concatenate([kpscore] * 8, axis=1) @@ -250,8 +240,6 @@ def test_generate_pose_target(): sigma=1, with_kp=False, with_limb=True, - left_kp=(0, ), - right_kp=(1, ), skeletons=((0, 1), (1, 2), (0, 2))) return_results = generate_pose_target(results) assert_array_almost_equal(return_results['imgs'], 0) @@ -587,3 +575,143 @@ def test_pose_decode(): decode_results = pose_decode(results) assert_array_almost_equal(decode_results['keypoint'], kp) assert_array_almost_equal(decode_results['keypoint_score'], kpscore) + + @staticmethod + def test_mm_uniform_sample_frames(): + results = dict(total_frames=64, modality='Pose') + sampling = MMUniformSampleFrames( + clip_len=dict(RGB=8, Pose=32), num_clips=1, test_mode=True, seed=0) + assert repr(sampling) == ('MMUniformSampleFrames(' + "clip_len={'RGB': 8, 'Pose': 32}, " + 'num_clips=1, test_mode=True, seed=0)') + + sampling_results = sampling(results) + assert sampling_results['clip_len'] == dict(RGB=8, Pose=32) + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert sampling_results['modality'] == ['RGB', 'Pose'] + assert_array_equal(sampling_results['RGB_inds'], + np.array([4, 15, 21, 24, 35, 43, 51, 63])) + assert_array_equal( + sampling_results['Pose_inds'], + np.array([ + 0, 3, 5, 6, 9, 11, 13, 15, 17, 19, 21, 22, 24, 27, 28, 30, 32, + 34, 36, 39, 40, 43, 45, 46, 48, 51, 53, 55, 57, 58, 61, 62 + ])) + + results = dict(total_frames=64, modality='Pose') + sampling = MMUniformSampleFrames( + clip_len=dict(RGB=8, Pose=32), + num_clips=10, + test_mode=True, + seed=0) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == dict(RGB=8, Pose=32) + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 10 + assert sampling_results['modality'] == ['RGB', 'Pose'] + assert len(sampling_results['RGB_inds']) == 80 + assert len(sampling_results['Pose_inds']) == 320 + + results = dict(total_frames=64, modality='Pose') + sampling = MMUniformSampleFrames( + clip_len=dict(RGB=8, Pose=32), num_clips=1, test_mode=False) + sampling_results = sampling(results) + assert sampling_results['clip_len'] == dict(RGB=8, Pose=32) + assert sampling_results['frame_interval'] is None + assert sampling_results['num_clips'] == 1 + assert len(sampling_results['RGB_inds']) == 8 + assert len(sampling_results['Pose_inds']) == 32 + + @staticmethod + def test_mm_decode(): + mm_decode = MMDecode() + + # Pose only test + pose_raw_results = dict( + modality=['Pose'], + Pose_inds=np.array([2, 4, 6, 8, 10]), + keypoint=np.random.random([1, 16, 17, 2]), + img_shape=(1080, 1920)) + rgb_raw_results = dict( + modality=['RGB'], + RGB_inds=np.array([2, 4, 
6, 8, 10]), + frame_dir=osp.join(osp.dirname(__file__), '../../data/test')) + + # test pose w/o `keypoint_score` + mm_decode(copy.deepcopy(pose_raw_results)) + + # test pose with `keypoint_score` + pose_raw_results['keypoint_score'] = np.random.random([1, 16, 17]) + pose_results = mm_decode(copy.deepcopy(pose_raw_results)) + + # test rgb + rgb_results = mm_decode(copy.deepcopy(rgb_raw_results)) + + # test pose and rgb + pose_rgb_raw_results = { + **rgb_raw_results, + **pose_raw_results, 'modality': ['RGB', 'Pose'] + } + pose_rgb_results = mm_decode(copy.deepcopy(pose_rgb_raw_results)) + + assert_array_equal(pose_rgb_results['keypoint_score'], + pose_results['keypoint_score']) + scaled_keypoint = copy.deepcopy(pose_results['keypoint']) + oh, ow = pose_results['img_shape'] + nh, nw = pose_rgb_results['img_shape'] + scaled_keypoint[..., 0] *= (nw / ow) + scaled_keypoint[..., 1] *= (nh / oh) + assert_array_equal(pose_rgb_results['keypoint'], scaled_keypoint) + assert_array_equal(pose_rgb_results['imgs'], rgb_results['imgs']) + assert assert_dict_has_keys( + pose_rgb_results, ['filename', 'img_shape', 'original_shape']) + assert repr(mm_decode) == 'MMDecode(io_backend=disk)' + + @staticmethod + def test_mm_compact(): + results = {} + results['img_shape'] = (100, 100) + fake_kp = np.zeros([1, 4, 2, 2]) + fake_kp[:, :, 0] = [10, 10] + fake_kp[:, :, 1] = [90, 90] + results['keypoint'] = fake_kp + results['imgs'] = list(np.zeros([3, 100, 100, 3])) + + pose_compact = MMCompact( + padding=0, threshold=0, hw_ratio=1, allow_imgpad=False) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (80, 80) + assert ret['imgs'][0].shape[:-1] == (80, 80) + assert str(pose_compact) == ( + 'MMCompact(padding=0, threshold=0, hw_ratio=(1, 1), ' + 'allow_imgpad=False)') + + pose_compact = MMCompact( + padding=0.3, threshold=0, hw_ratio=1, allow_imgpad=False) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (100, 100) + assert ret['imgs'][0].shape[:-1] == (100, 100) + + pose_compact = MMCompact( + padding=0.3, threshold=0, hw_ratio=1, allow_imgpad=True) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (104, 104) + assert ret['imgs'][0].shape[:-1] == (104, 104) + + pose_compact = MMCompact( + padding=0, threshold=100, hw_ratio=1, allow_imgpad=False) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (100, 100) + assert ret['imgs'][0].shape[:-1] == (100, 100) + + pose_compact = MMCompact( + padding=0, threshold=0, hw_ratio=0.75, allow_imgpad=True) + inp = copy.deepcopy(results) + ret = pose_compact(inp) + assert ret['img_shape'] == (80, 106) + assert ret['imgs'][0].shape[:-1] == (80, 106) diff --git a/tests/models/backbones/test_resnet3d_slowfast.py b/tests/models/backbones/test_resnet3d_slowfast.py index a3de73a620..d91e183583 100644 --- a/tests/models/backbones/test_resnet3d_slowfast.py +++ b/tests/models/backbones/test_resnet3d_slowfast.py @@ -11,18 +11,13 @@ def test_slowfast_backbone(): """Test SlowFast backbone.""" with pytest.raises(TypeError): # cfg should be a dict - ResNet3dSlowFast(None, slow_pathway=list(['foo', 'bar'])) - with pytest.raises(TypeError): - # pretrained should be a str - sf_50 = ResNet3dSlowFast(dict(foo='bar')) - sf_50.init_weights() + ResNet3dSlowFast(slow_pathway=list(['foo', 'bar'])) with pytest.raises(KeyError): # pathway type should be implemented - ResNet3dSlowFast(None, slow_pathway=dict(type='resnext')) + 
ResNet3dSlowFast(slow_pathway=dict(type='resnext')) # test slowfast with slow inflated sf_50_inflate = ResNet3dSlowFast( - None, slow_pathway=dict( type='resnet3d', depth=50, @@ -56,14 +51,7 @@ def test_slowfast_backbone(): # slowfast w/o lateral connection inference test input_shape = (1, 3, 8, 64, 64) imgs = generate_backbone_demo_inputs(input_shape) - # parrots 3dconv is only implemented on gpu - if torch.__version__ == 'parrots': - if torch.cuda.is_available(): - sf_50_wo_lateral = sf_50_wo_lateral.cuda() - imgs_gpu = imgs.cuda() - feat = sf_50_wo_lateral(imgs_gpu) - else: - feat = sf_50_wo_lateral(imgs) + feat = sf_50_wo_lateral(imgs) assert isinstance(feat, tuple) assert feat[0].shape == torch.Size([1, 2048, 1, 2, 2]) @@ -104,21 +92,14 @@ def test_slowfast_backbone(): assert param.requires_grad is True # test slowfast with normal config - sf_50 = ResNet3dSlowFast(None) + sf_50 = ResNet3dSlowFast() sf_50.init_weights() sf_50.train() # slowfast inference test input_shape = (1, 3, 8, 64, 64) imgs = generate_backbone_demo_inputs(input_shape) - # parrots 3dconv is only implemented on gpu - if torch.__version__ == 'parrots': - if torch.cuda.is_available(): - sf_50 = sf_50.cuda() - imgs_gpu = imgs.cuda() - feat = sf_50(imgs_gpu) - else: - feat = sf_50(imgs) + feat = sf_50(imgs) assert isinstance(feat, tuple) assert feat[0].shape == torch.Size([1, 2048, 1, 2, 2]) diff --git a/tests/models/backbones/test_resnet3d_slowonly.py b/tests/models/backbones/test_resnet3d_slowonly.py index 9603469c37..47c7036451 100644 --- a/tests/models/backbones/test_resnet3d_slowonly.py +++ b/tests/models/backbones/test_resnet3d_slowonly.py @@ -10,7 +10,7 @@ def test_slowonly_backbone(): """Test SlowOnly backbone.""" with pytest.raises(AssertionError): # SlowOnly should contain no lateral connection - ResNet3dSlowOnly(50, None, lateral=True) + ResNet3dSlowOnly(depth=50, pretrained=None, lateral=True) # test SlowOnly for PoseC3D so_50 = ResNet3dSlowOnly( @@ -31,7 +31,7 @@ def test_slowonly_backbone(): so_50.train() # test SlowOnly with normal config - so_50 = ResNet3dSlowOnly(50, None) + so_50 = ResNet3dSlowOnly(depth=50, pretrained=None) so_50.init_weights() so_50.train() diff --git a/tests/models/backbones/test_rgbposeconv3d.py b/tests/models/backbones/test_rgbposeconv3d.py new file mode 100644 index 0000000000..848a73ab45 --- /dev/null +++ b/tests/models/backbones/test_rgbposeconv3d.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch + +from mmaction.models import RGBPoseConv3D +from mmaction.testing import generate_backbone_demo_inputs + + +def test_rgbposeconv3d(): + """Test RGBPoseConv3D backbone.""" + + with pytest.raises(AssertionError): + RGBPoseConv3D(pose_drop_path=1.1, rgb_drop_path=1.1) + + rgbposec3d = RGBPoseConv3D() + rgbposec3d.init_weights() + rgbposec3d.train() + + imgs_shape = (1, 3, 8, 224, 224) + heatmap_imgs_shape = (1, 17, 32, 56, 56) + imgs = generate_backbone_demo_inputs(imgs_shape) + heatmap_imgs = generate_backbone_demo_inputs(heatmap_imgs_shape) + + (x_rgb, x_pose) = rgbposec3d(imgs, heatmap_imgs) + + assert x_rgb.shape == torch.Size([1, 2048, 8, 7, 7]) + assert x_pose.shape == torch.Size([1, 512, 32, 7, 7]) diff --git a/tests/models/data_preprocessors/__init__.py b/tests/models/data_preprocessors/__init__.py new file mode 100644 index 0000000000..ef101fec61 --- /dev/null +++ b/tests/models/data_preprocessors/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
diff --git a/tests/models/data_preprocessors/test_data_preprocessor.py b/tests/models/data_preprocessors/test_data_preprocessor.py new file mode 100644 index 0000000000..a4a3d851d7 --- /dev/null +++ b/tests/models/data_preprocessors/test_data_preprocessor.py @@ -0,0 +1,97 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from copy import deepcopy + +import pytest +import torch +from numpy.testing import assert_array_equal + +from mmaction.models import ActionDataPreprocessor +from mmaction.structures import ActionDataSample +from mmaction.utils import register_all_modules + + +def generate_dummy_data(batch_size, input_shape): + data = { + 'inputs': + [torch.randint(0, 255, input_shape) for _ in range(batch_size)], + 'data_samples': + [ActionDataSample().set_gt_labels(2) for _ in range(batch_size)] + } + return data + + +def test_data_preprocessor(): + with pytest.raises(ValueError): + ActionDataPreprocessor( + mean=[1, 1], std=[0, 0], format_shape='NCTHW_Heatmap') + with pytest.raises(ValueError): + psr = ActionDataPreprocessor(format_shape='NCTHW_Heatmap', to_rgb=True) + psr(generate_dummy_data(1, (3, 224, 224))) + + raw_data = generate_dummy_data(2, (1, 3, 8, 224, 224)) + psr = ActionDataPreprocessor( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW') + data = psr(deepcopy(raw_data)) + assert data['inputs'].shape == (2, 1, 3, 8, 224, 224) + assert_array_equal(data['inputs'][0], + (raw_data['inputs'][0] - psr.mean) / psr.std) + assert_array_equal(data['inputs'][1], + (raw_data['inputs'][1] - psr.mean) / psr.std) + + psr = ActionDataPreprocessor(format_shape='NCTHW', to_rgb=True) + data = psr(deepcopy(raw_data)) + assert data['inputs'].shape == (2, 1, 3, 8, 224, 224) + assert_array_equal(data['inputs'][0], raw_data['inputs'][0][:, [2, 1, 0]]) + assert_array_equal(data['inputs'][1], raw_data['inputs'][1][:, [2, 1, 0]]) + + register_all_modules() + psr = ActionDataPreprocessor( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW', + blending=dict(type='MixupBlending', num_classes=5)) + data = psr(deepcopy(raw_data), training=True) + assert data['data_samples'][0].gt_labels.item.shape == (5, ) + assert data['data_samples'][1].gt_labels.item.shape == (5, ) + + raw_data = generate_dummy_data(2, (1, 3, 224, 224)) + psr = ActionDataPreprocessor( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW', + to_rgb=True) + data = psr(deepcopy(raw_data)) + assert_array_equal(data['inputs'][0], + (raw_data['inputs'][0][:, [2, 1, 0]] - psr.mean) / + psr.std) + assert_array_equal(data['inputs'][1], + (raw_data['inputs'][1][:, [2, 1, 0]] - psr.mean) / + psr.std) + + psr = ActionDataPreprocessor() + data = psr(deepcopy(raw_data)) + assert data['inputs'].shape == (2, 1, 3, 224, 224) + assert_array_equal(data['inputs'][0], raw_data['inputs'][0]) + assert_array_equal(data['inputs'][1], raw_data['inputs'][1]) + + raw_2d_data = generate_dummy_data(2, (3, 224, 224)) + raw_3d_data = generate_dummy_data(2, (1, 3, 8, 224, 224)) + raw_data = (raw_2d_data, raw_3d_data) + + psr = ActionDataPreprocessor( + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='MIX2d3d') + data = psr(raw_data) + assert_array_equal(data[0]['inputs'][0], + (raw_2d_data['inputs'][0] - psr.mean.view(-1, 1, 1)) / + psr.std.view(-1, 1, 1)) + assert_array_equal(data[0]['inputs'][1], + (raw_2d_data['inputs'][1] - psr.mean.view(-1, 1, 1)) / + psr.std.view(-1, 1, 1)) + assert_array_equal(data[1]['inputs'][0], + 
(raw_3d_data['inputs'][0] - psr.mean) / psr.std) + assert_array_equal(data[1]['inputs'][1], + (raw_3d_data['inputs'][1] - psr.mean) / psr.std) diff --git a/tests/models/data_preprocessors/test_multimodal_data_preprocessor.py b/tests/models/data_preprocessors/test_multimodal_data_preprocessor.py new file mode 100644 index 0000000000..35483bd5d9 --- /dev/null +++ b/tests/models/data_preprocessors/test_multimodal_data_preprocessor.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import pytest +import torch +from numpy.testing import assert_array_equal + +from mmaction.models import MultiModalDataPreprocessor +from mmaction.structures import ActionDataSample +from mmaction.utils import register_all_modules + + +def generate_dummy_data(batch_size, input_keys, input_shapes): + data = dict() + data['data_samples'] = [ + ActionDataSample().set_gt_labels(2) for _ in range(batch_size) + ] + data['inputs'] = dict() + for key, shape in zip(input_keys, input_shapes): + data['inputs'][key] = [ + torch.randint(0, 255, shape) for _ in range(batch_size) + ] + + return data + + +def test_multimodal_data_preprocessor(): + with pytest.raises(AssertionError): + MultiModalDataPreprocessor( + preprocessors=dict(imgs=dict(format_shape='NCTHW'))) + + register_all_modules() + data_keys = ('imgs', 'heatmap_imgs') + data_shapes = ((1, 3, 8, 224, 224), (1, 17, 32, 64, 64)) + raw_data = generate_dummy_data(2, data_keys, data_shapes) + + psr = MultiModalDataPreprocessor( + preprocessors=dict( + imgs=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + heatmap_imgs=dict(type='ActionDataPreprocessor'))) + + data = psr(copy.deepcopy(raw_data)) + assert data['inputs']['imgs'].shape == (2, 1, 3, 8, 224, 224) + assert data['inputs']['heatmap_imgs'].shape == (2, 1, 17, 32, 64, 64) + psr_imgs = psr.preprocessors['imgs'] + assert_array_equal(data['inputs']['imgs'][0], + (raw_data['inputs']['imgs'][0] - psr_imgs.mean) / + psr_imgs.std) + assert_array_equal(data['inputs']['imgs'][1], + (raw_data['inputs']['imgs'][1] - psr_imgs.mean) / + psr_imgs.std) + assert_array_equal(data['inputs']['heatmap_imgs'][0], + raw_data['inputs']['heatmap_imgs'][0]) + assert_array_equal(data['inputs']['heatmap_imgs'][1], + raw_data['inputs']['heatmap_imgs'][1]) + + data_keys = ('imgs_2D', 'imgs_3D') + data_shapes = ((1, 3, 224, 224), (1, 3, 8, 224, 224)) + raw_data = generate_dummy_data(2, data_keys, data_shapes) + + psr = MultiModalDataPreprocessor( + preprocessors=dict( + imgs_2D=dict( + type='ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW'), + imgs_3D=dict( + type='ActionDataPreprocessor', + mean=[127.5, 127.5, 127.5], + std=[57.5, 57.5, 57.5], + format_shape='NCTHW'))) + + data = psr(copy.deepcopy(raw_data)) + assert data['inputs']['imgs_2D'].shape == (2, 1, 3, 224, 224) + assert data['inputs']['imgs_3D'].shape == (2, 1, 3, 8, 224, 224) + psr_imgs2d = psr.preprocessors['imgs_2D'] + psr_imgs3d = psr.preprocessors['imgs_3D'] + assert_array_equal(data['inputs']['imgs_2D'][0], + (raw_data['inputs']['imgs_2D'][0] - psr_imgs2d.mean) / + psr_imgs2d.std) + assert_array_equal(data['inputs']['imgs_2D'][1], + (raw_data['inputs']['imgs_2D'][1] - psr_imgs2d.mean) / + psr_imgs2d.std) + assert_array_equal(data['inputs']['imgs_3D'][0], + (raw_data['inputs']['imgs_3D'][0] - psr_imgs3d.mean) / + psr_imgs3d.std) + assert_array_equal(data['inputs']['imgs_3D'][1], + 
(raw_data['inputs']['imgs_3D'][1] - psr_imgs3d.mean) / + psr_imgs3d.std) diff --git a/tests/models/heads/test_rgbpose_head.py b/tests/models/heads/test_rgbpose_head.py new file mode 100644 index 0000000000..919e02a4bd --- /dev/null +++ b/tests/models/heads/test_rgbpose_head.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from mmaction.models import RGBPoseHead + + +def test_rgbpose_head(): + """Test RGBPoseHead.""" + rgbpose_head = RGBPoseHead( + num_classes=4, + in_channels=[2048, 512], + dropout=dict(rgb=0.51, pose=0.49)) + rgbpose_head.init_weights() + + assert rgbpose_head.num_classes == 4 + assert rgbpose_head.dropout == dict(rgb=0.51, pose=0.49) + assert rgbpose_head.in_channels == [2048, 512] + assert rgbpose_head.init_std == 0.01 + + assert isinstance(rgbpose_head.dropout_rgb, nn.Dropout) + assert isinstance(rgbpose_head.dropout_pose, nn.Dropout) + assert rgbpose_head.dropout_rgb.p == rgbpose_head.dropout['rgb'] + assert rgbpose_head.dropout_pose.p == rgbpose_head.dropout['pose'] + + assert isinstance(rgbpose_head.fc_rgb, nn.Linear) + assert isinstance(rgbpose_head.fc_pose, nn.Linear) + assert rgbpose_head.fc_rgb.in_features == rgbpose_head.in_channels[0] + assert rgbpose_head.fc_rgb.out_features == rgbpose_head.num_classes + assert rgbpose_head.fc_pose.in_features == rgbpose_head.in_channels[1] + assert rgbpose_head.fc_pose.out_features == rgbpose_head.num_classes + + assert isinstance(rgbpose_head.avg_pool, nn.AdaptiveAvgPool3d) + assert rgbpose_head.avg_pool.output_size == (1, 1, 1) + + feat_rgb = torch.rand((2, 2048, 8, 7, 7)) + feat_pose = torch.rand((2, 512, 32, 7, 7)) + + cls_scores = rgbpose_head((feat_rgb, feat_pose)) + assert cls_scores['rgb'].shape == torch.Size([2, 4]) + assert cls_scores['pose'].shape == torch.Size([2, 4]) diff --git a/tools/data/skeleton/compress_nturgbd.py b/tools/data/skeleton/compress_nturgbd.py new file mode 100644 index 0000000000..b8639257c9 --- /dev/null +++ b/tools/data/skeleton/compress_nturgbd.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import multiprocessing as mp +import os +import os.path as osp +import subprocess + + +def get_shape(vid): + cmd = 'ffprobe -v error -select_streams v:0 -show_entries ' \ + 'stream=width,height -of csv=s=x:p=0 \"{}\"'.format(vid) + w, h = subprocess.check_output(cmd, shell=True).decode('utf-8').split('x') + return int(w), int(h) + + +def compress(src, dest, shape=None, target_size=540, fps=-1): + if shape is None: + shape = get_shape(src) + w, h = shape + scale_str = f'-vf scale=-2:{target_size}' if w >= h else \ + f'-vf scale={target_size}:-2' + fps_str = f'-r {fps}' if fps > 0 else '' + quality_str = '-q:v 1' + vcodec_str = '-c:v libx264' + cmd = f'ffmpeg -y -loglevel error -i {src} -threads 1 ' \ + f'{quality_str} {scale_str} {fps_str} {vcodec_str} {dest}' + os.system(cmd) + + +def compress_nturgbd(name): + src = name + dest = src.replace('nturgbd_raw', + 'nturgbd_videos').replace('_rgb.avi', '.mp4') + shape = (1920, 1080) + compress(src, dest, shape) + + +src_dir = 'data/nturgbd_raw' +tgt_dir = 'data/nturgbd_videos' +os.makedirs(tgt_dir, exist_ok=True) +files = [osp.join(src_dir, x) for x in os.listdir(src_dir) if '.avi' in x] +pool = mp.Pool(32) +pool.map(compress_nturgbd, files) From acb79e41c1d9288806a359ded943c949224465c3 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Wed, 1 Mar 2023 20:30:01 +0800 Subject: [PATCH 09/36] [fix] specify map_location to cpu when use _load_checkpoint (#2254) --- mmaction/models/backbones/resnet.py | 3 ++- mmaction/models/backbones/resnet3d.py | 2 +- mmaction/models/backbones/resnet3d_slowfast.py | 2 +- mmaction/models/backbones/timesformer.py | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/mmaction/models/backbones/resnet.py b/mmaction/models/backbones/resnet.py index 0ebf6d61b0..c599bcc311 100644 --- a/mmaction/models/backbones/resnet.py +++ b/mmaction/models/backbones/resnet.py @@ -497,7 +497,8 @@ def _load_bn_params(bn: nn.Module, state_dict_tv: OrderedDict, def _load_torchvision_checkpoint(self, logger: mmengine.MMLogger = None) -> None: """Initiate the parameters from torchvision pretrained checkpoint.""" - state_dict_torchvision = _load_checkpoint(self.pretrained) + state_dict_torchvision = _load_checkpoint( + self.pretrained, map_location='cpu') if 'state_dict' in state_dict_torchvision: state_dict_torchvision = state_dict_torchvision['state_dict'] diff --git a/mmaction/models/backbones/resnet3d.py b/mmaction/models/backbones/resnet3d.py index cbaa4e18ca..63b32fc8cd 100644 --- a/mmaction/models/backbones/resnet3d.py +++ b/mmaction/models/backbones/resnet3d.py @@ -723,7 +723,7 @@ def _inflate_weights(self, logger: MMLogger) -> None: debugging information. """ - state_dict_r2d = _load_checkpoint(self.pretrained) + state_dict_r2d = _load_checkpoint(self.pretrained, map_location='cpu') if 'state_dict' in state_dict_r2d: state_dict_r2d = state_dict_r2d['state_dict'] diff --git a/mmaction/models/backbones/resnet3d_slowfast.py b/mmaction/models/backbones/resnet3d_slowfast.py index c4ca8b8032..3083239ff9 100644 --- a/mmaction/models/backbones/resnet3d_slowfast.py +++ b/mmaction/models/backbones/resnet3d_slowfast.py @@ -214,7 +214,7 @@ def inflate_weights(self, logger: MMLogger) -> None: debugging information. 
""" - state_dict_r2d = _load_checkpoint(self.pretrained) + state_dict_r2d = _load_checkpoint(self.pretrained, map_location='cpu') if 'state_dict' in state_dict_r2d: state_dict_r2d = state_dict_r2d['state_dict'] diff --git a/mmaction/models/backbones/timesformer.py b/mmaction/models/backbones/timesformer.py index 618b381295..af636b5198 100644 --- a/mmaction/models/backbones/timesformer.py +++ b/mmaction/models/backbones/timesformer.py @@ -235,7 +235,7 @@ def init_weights(self, pretrained=None): logger = MMLogger.get_current_instance() logger.info(f'load model from: {self.pretrained}') - state_dict = _load_checkpoint(self.pretrained) + state_dict = _load_checkpoint(self.pretrained, map_location='cpu') if 'state_dict' in state_dict: state_dict = state_dict['state_dict'] From 4e9b7ec3d2c241add5446d394b78e398ef407a76 Mon Sep 17 00:00:00 2001 From: Kai Hu Date: Mon, 6 Mar 2023 05:15:07 -0500 Subject: [PATCH 10/36] [Fix] fix command bugs in localization tasks' README (#2244) --- configs/localization/bmn/README.md | 4 ++-- configs/localization/bsn/README.md | 2 +- .../bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py | 2 ++ 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/configs/localization/bmn/README.md b/configs/localization/bmn/README.md index 834df03ad5..2f49330743 100644 --- a/configs/localization/bmn/README.md +++ b/configs/localization/bmn/README.md @@ -39,7 +39,7 @@ For more details on data preparation, you can refer to [ActivityNet Data Prepara Train BMN model on ActivityNet features dataset. ```shell -bash tools/dist_train.sh configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py 2 +bash tools/dist_train.sh configs/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.py 2 ``` For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). @@ -49,7 +49,7 @@ For more details, you can refer to the **Training** part in the [Training and Te Test BMN on ActivityNet feature dataset. ```shell -python3 tools/test.py configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py CHECKPOINT.PTH +python3 tools/test.py configs/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.py CHECKPOINT.PTH ``` For more details, you can refer to the **Testing** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). 
diff --git a/configs/localization/bsn/README.md b/configs/localization/bsn/README.md index 62c46f6782..efd2d2c0d0 100644 --- a/configs/localization/bsn/README.md +++ b/configs/localization/bsn/README.md @@ -42,7 +42,7 @@ python3 tools/train.py configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activi After training use the TEM module to generate the probabilities sequence (actionness, starting, and ending) for the training and validation dataset: ```shell -python tools/test.py configs/localization/bsn/bsn_tem_400x100_1xb16_20e_activitynet_feature.py \ +python tools/test.py configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py \ work_dirs/bsn_400x100_20e_1xb16_activitynet_feature/tem_epoch_20.pth ``` diff --git a/configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py b/configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py index 28595bb786..285306f976 100644 --- a/configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py +++ b/configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py @@ -89,3 +89,5 @@ metric_type='TEM', dump_config=dict(out=tem_results_dir, output_format='csv')) val_evaluator = test_evaluator + +default_hooks = dict(checkpoint=dict(filename_tmpl='tem_epoch_{}.pth')) From edd7dee3bcb4f604ffd0a38ac98f8c09176dde75 Mon Sep 17 00:00:00 2001 From: wxDai Date: Thu, 9 Mar 2023 02:50:25 +0800 Subject: [PATCH 11/36] [Project] Add Example project (#2265) --- .gitignore | 2 + docs/en/notes/contribution_guide.md | 7 +- projects/README.md | 17 +++ projects/example_project/README.md | 122 ++++++++++++++++++ ...1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py | 11 ++ projects/example_project/models/__init__.py | 3 + .../example_project/models/example_net.py | 21 +++ 7 files changed, 180 insertions(+), 3 deletions(-) create mode 100644 projects/README.md create mode 100644 projects/example_project/README.md create mode 100644 projects/example_project/configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py create mode 100644 projects/example_project/models/__init__.py create mode 100644 projects/example_project/models/example_net.py diff --git a/.gitignore b/.gitignore index b2c1be8fa6..3e40ace4d5 100644 --- a/.gitignore +++ b/.gitignore @@ -113,6 +113,8 @@ venv.bak/ *.log.json benchlist.txt work_dirs/ +/projects/*/work_dirs +/projects/*/data .DS_Store # Pytorch diff --git a/docs/en/notes/contribution_guide.md b/docs/en/notes/contribution_guide.md index 92548868d2..f9d96c75a5 100644 --- a/docs/en/notes/contribution_guide.md +++ b/docs/en/notes/contribution_guide.md @@ -33,10 +33,11 @@ We use the following tools for linting and formatting: - [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files. - [docformatter](https://github.com/myint/docformatter): A formatter to format docstring. -Style configurations of yapf and isort can be found in [setup.cfg](../../../setup.cfg). +Style configurations of yapf and isort can be found in [setup.cfg](https://github.com/open-mmlab/mmaction2/blob/1.x/setup.cfg). -We use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`, fixes `end-of-files`, sorts `requirments.txt` automatically on every commit. -The config for a pre-commit hook is stored in [.pre-commit-config](../../../.pre-commit-config.yaml). 
+We use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`, +fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, sorts `requirments.txt` automatically on every commit. +The config for a pre-commit hook is stored in [.pre-commit-config](https://github.com/open-mmlab/mmaction2/blob/1.x/.pre-commit-config.yaml). After you clone the repository, you will need to install initialize pre-commit hook. diff --git a/projects/README.md b/projects/README.md new file mode 100644 index 0000000000..7e12abee97 --- /dev/null +++ b/projects/README.md @@ -0,0 +1,17 @@ +# Welcome to Projects of MMAction2 + +In this folder, we welcome all contribution of deep-learning video understanding models from community. + +Here, these requirements, e.g., code standards, are not that strict as in the core package. Thus, developers from the community can implement their algorithms much more easily and efficiently in MMAction2. We appreciate all contributions from community to make MMAction2 greater. + +Here is an [example project](./example_project) about how to add your algorithms easily. + +We also provide some documentation listed below: + +- [Contribution Guide](https://mmaction2.readthedocs.io/en/dev-1.x/notes/contribution_guide.html) + + The guides for new contributors about how to add your projects to MMAction2. + +- [Discussions](https://github.com/open-mmlab/mmaction2/discussions) + + Welcome to start discussion! diff --git a/projects/example_project/README.md b/projects/example_project/README.md new file mode 100644 index 0000000000..ef74fe9cbe --- /dev/null +++ b/projects/example_project/README.md @@ -0,0 +1,122 @@ +# Example Project + +This is an example README for community `projects/`. You can write your README in your own project. Here are +some recommended parts of a README for others to understand and use your project, you can copy or modify them +according to your project. + +## Usage + +### Setup Environment + +Please refer to [Get Started](https://mmaction2.readthedocs.io/en/1.x/get_started.html) to install MMAction2. + +At first, add the current folder to `PYTHONPATH`, so that Python can find your code. Run command in the current directory to add it. + +> Please run it every time after you opened a new shell. + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Data Preparation + +Prepare the Kinetics400 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/1.x/tools/data/kinetics/README.md). 
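Before launching training, a quick registry check can confirm that the custom module is importable from the project root. This is a hypothetical sanity check, assuming the `models` package and the `ExampleNet` registration added later in this patch:

```shell
# With the current folder on PYTHONPATH (see Setup Environment above), the
# custom backbone should resolve by name in the MODELS registry.
python -c "import models; from mmaction.registry import MODELS; print(MODELS.get('ExampleNet'))"
```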
+ +### Training commands + +**To train with single GPU:** + +```bash +mim train mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py +``` + +**To train with multiple GPUs:** + +```bash +mim train mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --launcher pytorch --gpus 8 +``` + +**To train with multiple GPUs by slurm:** + +```bash +mim train mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +### Testing commands + +**To test with single GPU:** + +```bash +mim test mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT +``` + +**To test with multiple GPUs:** + +```bash +mim test mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8 +``` + +**To test with multiple GPUs by slurm:** + +```bash +mim test mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +## Results + +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | config | ckpt | log | +| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :----------------: | :-----------------------------------------------------------------------------: | ----------------: | --------------: | +| 1x1x3 | 224x224 | 8 | ResNet50 | ImageNet | 72.83 | 90.65 | 25 clips x 10 crop | [config](./configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](CKPT-LINK) | [log](LOG-LINK) | + +## Citation + + + +```bibtex +@misc{2020mmaction2, + title={OpenMMLab's Next Generation Video Understanding Toolbox and Benchmark}, + author={MMAction2 Contributors}, + howpublished = {\url{https://github.com/open-mmlab/mmaction2}}, + year={2020} +} +``` + +## Checklist + +Here is a checklist of this project's progress, and you can ignore this part if you don't plan to contribute to MMAction2 projects. + +- [ ] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [ ] Finish the code + + + + - [ ] Basic docstrings & proper citation + + + + - [ ] Converted checkpoint and results (Only for reproduction) + + + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training results + + + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Unit tests + + + + - [ ] Code style + + + + - [ ] `metafile.yml` and `README.md` + + diff --git a/projects/example_project/configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py b/projects/example_project/configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py new file mode 100644 index 0000000000..32ea002651 --- /dev/null +++ b/projects/example_project/configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py @@ -0,0 +1,11 @@ +# Directly inherit the entire recipe you want to use. +_base_ = 'mmaction::recognition/tsn/' \ + 'tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py' + +# This line is to import your own modules. +custom_imports = dict(imports='models') + +# Modify the backbone to use your own backbone. +_base_['model']['backbone'] = dict(type='ExampleNet', depth=50) +# Modify the in_channels of classifier head to fit your backbone. 
+_base_['model']['cls_head']['in_channels'] = 2048 diff --git a/projects/example_project/models/__init__.py b/projects/example_project/models/__init__.py new file mode 100644 index 0000000000..e2d4f2f571 --- /dev/null +++ b/projects/example_project/models/__init__.py @@ -0,0 +1,3 @@ +from .example_net import ExampleNet + +__all__ = ['ExampleNet'] diff --git a/projects/example_project/models/example_net.py b/projects/example_project/models/example_net.py new file mode 100644 index 0000000000..6a3b8bbb06 --- /dev/null +++ b/projects/example_project/models/example_net.py @@ -0,0 +1,21 @@ +from mmaction.models import ResNet +from mmaction.registry import MODELS + + +# Register your model to the `MODELS`. +@MODELS.register_module() +class ExampleNet(ResNet): + """Implements an example backbone. + + Implement the backbone network just like a normal pytorch network. + """ + + def __init__(self, **kwargs) -> None: + print('#############################\n' + '# Hello MMAction2! #\n' + '#############################') + super().__init__(**kwargs) + + def forward(self, x): + """Defines the computation performed at every call.""" + return super().forward(x) From e38a41200e7700c3e57a39f7d1fc00149aff36da Mon Sep 17 00:00:00 2001 From: wxDai Date: Tue, 14 Mar 2023 03:16:36 +0800 Subject: [PATCH 12/36] [Project] Add MSG3D project (#2291) --- projects/msg3d/README.md | 143 ++++++++ ...6-joint-u100-80e_ntu60-xsub-keypoint-2d.py | 104 ++++++ ...6-joint-u100-80e_ntu60-xsub-keypoint-3d.py | 104 ++++++ projects/msg3d/models/__init__.py | 3 + projects/msg3d/models/msg3d.py | 75 ++++ projects/msg3d/models/msg3d_utils.py | 342 ++++++++++++++++++ 6 files changed, 771 insertions(+) create mode 100644 projects/msg3d/README.md create mode 100644 projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py create mode 100644 projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py create mode 100644 projects/msg3d/models/__init__.py create mode 100644 projects/msg3d/models/msg3d.py create mode 100644 projects/msg3d/models/msg3d_utils.py diff --git a/projects/msg3d/README.md b/projects/msg3d/README.md new file mode 100644 index 0000000000..7c784f90aa --- /dev/null +++ b/projects/msg3d/README.md @@ -0,0 +1,143 @@ +# MSG3D Project + +[Disentangling and Unifying Graph Convolutions for Skeleton-Based Action Recognition](https://arxiv.org/abs/2003.14111) + + + +## Abstract + + + +Spatial-temporal graphs have been widely used by skeleton-based action recognition algorithms to model human action dynamics. To capture robust movement patterns from these graphs, long-range and multi-scale context aggregation and spatial-temporal dependency modeling are critical aspects of a powerful feature extractor. However, existing methods have limitations in achieving (1) unbiased long-range joint relationship modeling under multi-scale operators and (2) unobstructed cross-spacetime information flow for capturing complex spatial-temporal dependencies. In this work, we present (1) a simple method to disentangle multi-scale graph convolutions and (2) a unified spatial-temporal graph convolutional operator named G3D. The proposed multi-scale aggregation scheme disentangles the importance of nodes in different neighborhoods for effective long-range modeling. The proposed G3D module leverages dense cross-spacetime edges as skip connections for direct information propagation across the spatial-temporal graph. 
By coupling these proposals, we develop a powerful feature extractor named MS-G3D based on which our model outperforms previous state-of-the-art methods on three large-scale datasets: NTU RGB+D 60, NTU RGB+D 120, and Kinetics Skeleton 400.

[figure: MS-G3D model overview (image omitted)]
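The disentangled multi-scale aggregation described in the abstract builds one adjacency matrix per scale, keeping only joints at shortest-path distance exactly k before degree normalization, so distant neighborhoods are not dominated by closer ones. The code later in this patch does this with the `k_adjacency` and `normalize_digraph` helpers from `mmaction.models.utils.graph`; the following is only a minimal NumPy sketch of the idea, not the shipped implementation:

```python
import numpy as np


def disentangled_scales(A, num_scales):
    """Build one adjacency per scale, keeping joints at distance exactly k."""
    n = A.shape[0]
    eye = np.eye(n, dtype=A.dtype)
    scales = []
    for k in range(num_scales):
        if k == 0:
            A_k = eye.copy()
        else:
            reach_k = np.minimum(np.linalg.matrix_power(A + eye, k), 1)
            reach_km1 = np.minimum(np.linalg.matrix_power(A + eye, k - 1), 1)
            A_k = reach_k - reach_km1 + eye   # exactly-k-hop neighbours + self
        deg = A_k.sum(axis=0, keepdims=True)
        deg[deg == 0] = 1
        scales.append(A_k / deg)              # simple degree normalization
    return np.stack(scales)                   # (num_scales, V, V)


# Toy 4-joint chain 0-1-2-3
A = np.zeros((4, 4), dtype=np.float32)
for i, j in [(0, 1), (1, 2), (2, 3)]:
    A[i, j] = A[j, i] = 1
print(disentangled_scales(A, num_scales=3).shape)  # (3, 4, 4)
```

Stacking these per-scale adjacencies is what lets a single graph convolution aggregate several neighborhood ranges without the bias introduced by repeated powers of one dense adjacency.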
+ +## Usage + +### Setup Environment + +Please refer to [Get Started](https://mmaction2.readthedocs.io/en/1.x/get_started.html) to install MMAction2. + +At first, add the current folder to `PYTHONPATH`, so that Python can find your code. Run command in the current directory to add it. + +> Please run it every time after you opened a new shell. + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Data Preparation + +Prepare the NTU60 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/1.x/tools/data/skeleton/README.md). + +### Training commands + +**To train with single GPU:** + +```bash +mim train mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py +``` + +**To train with multiple GPUs:** + +```bash +mim train mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher pytorch --gpus 8 +``` + +**To train with multiple GPUs by slurm:** + +```bash +mim train mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +### Testing commands + +**To test with single GPU:** + +```bash +mim test mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT +``` + +**To test with multiple GPUs:** + +```bash +mim test mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8 +``` + +**To test with multiple GPUs by slurm:** + +```bash +mim test mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +## Results + +### NTU60_XSub_2D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | +| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: | +| uniform 100 | joint | 8 | MSG3D | 92.3 | 10 clips | [config](./configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20230309-73b97296.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.log) | + +### NTU60_XSub_3D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | +| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: | +| uniform 100 | joint | 8 | MSG3D | 89.6 | 10 clips | [config](./configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d_20230308-c325d222.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.log) | + +## Citation + + + +```bibtex +@inproceedings{liu2020disentangling, + title={Disentangling and unifying graph convolutions for 
skeleton-based action recognition}, + author={Liu, Ziyu and Zhang, Hongwen and Chen, Zhenghao and Wang, Zhiyong and Ouyang, Wanli}, + booktitle={CVPR}, + pages={143--152}, + year={2020} +} +``` + +## Checklist + +Here is a checklist of this project's progress, and you can ignore this part if you don't plan to contribute to MMAction2 projects. + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + + + + - [x] Basic docstrings & proper citation + + + + - [x] Converted checkpoint and results (Only for reproduction) + + + +- [x] Milestone 2: Indicates a successful model implementation. + + - [x] Training results + + + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Unit tests + + + + - [ ] Code style + + + + - [ ] `metafile.yml` and `README.md` + + diff --git a/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 0000000000..ece30dc019 --- /dev/null +++ b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,104 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='MSG3D', graph_cfg=dict(layout='coco', mode='binary_adj')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=384)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + 
convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 100644 index 0000000000..290fda984d --- /dev/null +++ b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,104 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='MSG3D', graph_cfg=dict(layout='nturgb+d', mode='binary_adj')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=384)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. 
+# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/projects/msg3d/models/__init__.py b/projects/msg3d/models/__init__.py new file mode 100644 index 0000000000..82b4a3085c --- /dev/null +++ b/projects/msg3d/models/__init__.py @@ -0,0 +1,3 @@ +from .msg3d import MSG3D + +__all__ = ['MSG3D'] diff --git a/projects/msg3d/models/msg3d.py b/projects/msg3d/models/msg3d.py new file mode 100644 index 0000000000..e4124a3435 --- /dev/null +++ b/projects/msg3d/models/msg3d.py @@ -0,0 +1,75 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModule, Sequential + +from mmaction.models.utils import Graph +from mmaction.registry import MODELS +from .msg3d_utils import MSGCN, MSTCN, MW_MSG3DBlock + + +@MODELS.register_module() +class MSG3D(BaseModule): + + def __init__(self, + graph_cfg, + in_channels=3, + base_channels=96, + num_gcn_scales=13, + num_g3d_scales=6, + num_person=2, + tcn_dropout=0): + super().__init__() + + self.graph = Graph(**graph_cfg) + # Note that A is a 2D tensor + A = torch.tensor( + self.graph.A[0], dtype=torch.float32, requires_grad=False) + self.register_buffer('A', A) + self.num_point = A.shape[-1] + self.in_channels = in_channels + self.base_channels = base_channels + + self.data_bn = nn.BatchNorm1d(self.num_point * in_channels * + num_person) + c1, c2, c3 = base_channels, base_channels * 2, base_channels * 4 + + # r=3 STGC blocks + self.gcn3d1 = MW_MSG3DBlock(3, c1, A, num_g3d_scales, window_stride=1) + self.sgcn1 = Sequential( + MSGCN(num_gcn_scales, 3, c1, A), MSTCN(c1, c1), MSTCN(c1, c1)) + self.sgcn1[-1].act = nn.Identity() + self.tcn1 = MSTCN(c1, c1, tcn_dropout=tcn_dropout) + + self.gcn3d2 = MW_MSG3DBlock(c1, c2, A, num_g3d_scales, window_stride=2) + self.sgcn2 = Sequential( + MSGCN(num_gcn_scales, c1, c1, A), MSTCN(c1, c2, stride=2), + MSTCN(c2, c2)) + self.sgcn2[-1].act = nn.Identity() + self.tcn2 = MSTCN(c2, c2, tcn_dropout=tcn_dropout) + + self.gcn3d3 = MW_MSG3DBlock(c2, c3, A, num_g3d_scales, window_stride=2) + self.sgcn3 = Sequential( + MSGCN(num_gcn_scales, c2, c2, A), MSTCN(c2, c3, stride=2), + MSTCN(c3, c3)) + self.sgcn3[-1].act = nn.Identity() + self.tcn3 = MSTCN(c3, c3, tcn_dropout=tcn_dropout) + + def forward(self, x): + N, M, T, V, C = x.size() + x = x.permute(0, 1, 3, 4, 2).contiguous().reshape(N, M * V * C, T) + x = self.data_bn(x) + x = x.reshape(N * M, V, C, T).permute(0, 2, 3, 1).contiguous() + + # Apply activation to the sum of the pathways + x = F.relu(self.sgcn1(x) + self.gcn3d1(x), inplace=True) + x = self.tcn1(x) + + x = F.relu(self.sgcn2(x) + self.gcn3d2(x), inplace=True) + x = self.tcn2(x) + + x = F.relu(self.sgcn3(x) + self.gcn3d3(x), inplace=True) + x = self.tcn3(x) + + # N * M, C, T, V + return x.reshape((N, M) + x.shape[1:]) diff --git a/projects/msg3d/models/msg3d_utils.py b/projects/msg3d/models/msg3d_utils.py new file mode 100644 index 0000000000..25b4f953b6 --- /dev/null +++ b/projects/msg3d/models/msg3d_utils.py @@ -0,0 +1,342 @@ +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import build_activation_layer +from mmengine.model import BaseModule, ModuleList, Sequential + +from mmaction.models.utils import unit_tcn +from mmaction.models.utils.graph import k_adjacency, normalize_digraph + + +class MLP(BaseModule): + + def __init__(self, + in_channels, + out_channels, + act_cfg=dict(type='ReLU'), + dropout=0): + super().__init__() + channels = [in_channels] + out_channels + self.layers = 
ModuleList() + for i in range(1, len(channels)): + if dropout > 1e-3: + self.layers.append(nn.Dropout(p=dropout)) + self.layers.append( + nn.Conv2d(channels[i - 1], channels[i], kernel_size=1)) + self.layers.append(nn.BatchNorm2d(channels[i])) + if act_cfg: + self.layers.append(build_activation_layer(act_cfg)) + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return x + + +class MSGCN(BaseModule): + + def __init__(self, + num_scales, + in_channels, + out_channels, + A, + dropout=0, + act_cfg=dict(type='ReLU')): + super().__init__() + self.num_scales = num_scales + + A_powers = [ + k_adjacency(A, k, with_self=True) for k in range(num_scales) + ] + A_powers = np.stack([normalize_digraph(g) for g in A_powers]) + + # K, V, V + self.register_buffer('A', torch.Tensor(A_powers)) + self.PA = nn.Parameter(self.A.clone()) + nn.init.uniform_(self.PA, -1e-6, 1e-6) + + self.mlp = MLP( + in_channels * num_scales, [out_channels], + dropout=dropout, + act_cfg=act_cfg) + + def forward(self, x): + N, C, T, V = x.shape + A = self.A + A = A + self.PA + + support = torch.einsum('kvu,nctv->nkctu', A, x) + support = support.reshape(N, self.num_scales * C, T, V) + out = self.mlp(support) + return out + + +# ! Notice: The implementation of MSTCN in +# MS-G3D is not the same as our implementation. +class MSTCN(BaseModule): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilations=[1, 2, 3, 4], + residual=True, + act_cfg=dict(type='ReLU'), + init_cfg=[ + dict(type='Constant', layer='BatchNorm2d', val=1), + dict(type='Kaiming', layer='Conv2d', mode='fan_out') + ], + tcn_dropout=0): + + super().__init__(init_cfg=init_cfg) + # Multiple branches of temporal convolution + self.num_branches = len(dilations) + 2 + branch_channels = out_channels // self.num_branches + branch_channels_rem = out_channels - branch_channels * ( + self.num_branches - 1) + + if type(kernel_size) == list: + assert len(kernel_size) == len(dilations) + else: + kernel_size = [kernel_size] * len(dilations) + + self.branches = ModuleList([ + Sequential( + nn.Conv2d( + in_channels, branch_channels, kernel_size=1, padding=0), + nn.BatchNorm2d(branch_channels), + build_activation_layer(act_cfg), + unit_tcn( + branch_channels, + branch_channels, + kernel_size=ks, + stride=stride, + dilation=dilation), + ) for ks, dilation in zip(kernel_size, dilations) + ]) + + # Additional Max & 1x1 branch + self.branches.append( + Sequential( + nn.Conv2d( + in_channels, branch_channels, kernel_size=1, padding=0), + nn.BatchNorm2d(branch_channels), + build_activation_layer(act_cfg), + nn.MaxPool2d( + kernel_size=(3, 1), stride=(stride, 1), padding=(1, 0)), + nn.BatchNorm2d(branch_channels))) + + self.branches.append( + Sequential( + nn.Conv2d( + in_channels, + branch_channels_rem, + kernel_size=1, + padding=0, + stride=(stride, 1)), nn.BatchNorm2d(branch_channels_rem))) + + # Residual connection + if not residual: + self.residual = lambda x: 0 + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + else: + self.residual = unit_tcn( + in_channels, out_channels, kernel_size=1, stride=stride) + + self.act = build_activation_layer(act_cfg) + self.drop = nn.Dropout(tcn_dropout) + + def forward(self, x): + # Input dim: (N,C,T,V) + res = self.residual(x) + branch_outs = [] + for tempconv in self.branches: + out = tempconv(x) + branch_outs.append(out) + + out = torch.cat(branch_outs, dim=1) + out += res + out = self.act(out) + out = self.drop(out) + return out + + +class 
UnfoldTemporalWindows(BaseModule): + + def __init__(self, window_size, window_stride, window_dilation=1): + super().__init__() + self.window_size = window_size + self.window_stride = window_stride + self.window_dilation = window_dilation + + self.padding = (window_size + (window_size - 1) * + (window_dilation - 1) - 1) // 2 + self.unfold = nn.Unfold( + kernel_size=(self.window_size, 1), + dilation=(self.window_dilation, 1), + stride=(self.window_stride, 1), + padding=(self.padding, 0)) + + def forward(self, x): + # Input shape: (N,C,T,V), out: (N,C,T,V*window_size) + N, C, T, V = x.shape + x = self.unfold(x) + # Permute extra channels from window size to the graph dimension; + # -1 for number of windows + x = x.reshape(N, C, self.window_size, -1, V).permute(0, 1, 3, 2, + 4).contiguous() + x = x.reshape(N, C, -1, self.window_size * V) + return x + + +class ST_MSGCN(BaseModule): + + def __init__(self, + in_channels, + out_channels, + A, + num_scales, + window_size, + residual=False, + dropout=0, + act_cfg=dict(type='ReLU')): + + super().__init__() + self.num_scales = num_scales + self.window_size = window_size + A = self.build_st_graph(A, window_size) + + A_scales = [ + k_adjacency(A, k, with_self=True) for k in range(num_scales) + ] + A_scales = np.stack([normalize_digraph(g) for g in A_scales]) + + self.register_buffer('A', torch.Tensor(A_scales)) + self.V = len(A) + + self.PA = nn.Parameter(self.A.clone()) + nn.init.uniform_(self.PA, -1e-6, 1e-6) + + self.mlp = MLP( + in_channels * num_scales, [out_channels], + dropout=dropout, + act_cfg=act_cfg) + + # Residual connection + if not residual: + self.residual = lambda x: 0 + elif (in_channels == out_channels): + self.residual = lambda x: x + else: + self.residual = MLP(in_channels, [out_channels], act_cfg=None) + + self.act = build_activation_layer(act_cfg) + + def build_st_graph(self, A, window_size): + if not isinstance(A, np.ndarray): + A = A.data.cpu().numpy() + + assert len(A.shape) == 2 and A.shape[0] == A.shape[1] + V = len(A) + A_with_I = A + np.eye(V, dtype=A.dtype) + + A_large = np.tile(A_with_I, (window_size, window_size)).copy() + return A_large + + def forward(self, x): + N, C, T, V = x.shape # T = number of windows, V = self.V * window_size + A = self.A + self.PA + + # Perform Graph Convolution + res = self.residual(x) + agg = torch.einsum('kvu,nctv->nkctu', A, x) + agg = agg.reshape(N, self.num_scales * C, T, V) + out = self.mlp(agg) + if res == 0: + return self.act(out) + else: + return self.act(out + res) + + +class MSG3DBlock(BaseModule): + + def __init__(self, + in_channels, + out_channels, + A, + num_scales, + window_size, + window_stride, + window_dilation, + embed_factor=1, + activation='relu'): + + super().__init__() + self.window_size = window_size + self.out_channels = out_channels + self.embed_channels_in = out_channels // embed_factor + self.embed_channels_out = out_channels // embed_factor + if embed_factor == 1: + self.in1x1 = nn.Identity() + self.embed_channels_in = self.embed_channels_out = in_channels + # The first STGC block changes channels right away; + # others change at collapse + if in_channels == 3: + self.embed_channels_out = out_channels + else: + self.in1x1 = MLP(in_channels, [self.embed_channels_in]) + + self.gcn3d = Sequential( + UnfoldTemporalWindows(window_size, window_stride, window_dilation), + ST_MSGCN( + in_channels=self.embed_channels_in, + out_channels=self.embed_channels_out, + A=A, + num_scales=num_scales, + window_size=window_size)) + + self.out_conv = nn.Conv3d( + 
self.embed_channels_out, + out_channels, + kernel_size=(1, self.window_size, 1)) + self.out_bn = nn.BatchNorm2d(out_channels) + + def forward(self, x): + N, _, T, V = x.shape + x = self.in1x1(x) + # Construct temporal windows and apply MS-GCN + x = self.gcn3d(x) + + # Collapse the window dimension + x = x.reshape(N, self.embed_channels_out, -1, self.window_size, V) + x = self.out_conv(x).squeeze(dim=3) + x = self.out_bn(x) + # no activation + return x + + +class MW_MSG3DBlock(BaseModule): + + def __init__(self, + in_channels, + out_channels, + A, + num_scales, + window_sizes=[3, 5], + window_stride=1, + window_dilations=[1, 1]): + + super().__init__() + self.gcn3d = ModuleList([ + MSG3DBlock(in_channels, out_channels, A, num_scales, window_size, + window_stride, window_dilation) for window_size, + window_dilation in zip(window_sizes, window_dilations) + ]) + + def forward(self, x): + out_sum = 0 + for gcn3d in self.gcn3d: + out_sum += gcn3d(x) + return out_sum From b292e0ddaa16648c1c394d2110968cb8b8d0d405 Mon Sep 17 00:00:00 2001 From: wxDai Date: Tue, 14 Mar 2023 10:40:23 +0800 Subject: [PATCH 13/36] Add CTRGCN project (#2269) --- projects/ctrgcn/README.md | 143 +++++++++++++ ...6-joint-u100-80e_ntu60-xsub-keypoint-2d.py | 104 ++++++++++ ...6-joint-u100-80e_ntu60-xsub-keypoint-3d.py | 104 ++++++++++ projects/ctrgcn/models/__init__.py | 3 + projects/ctrgcn/models/ctrgcn.py | 104 ++++++++++ projects/ctrgcn/models/ctrgcn_utils.py | 192 ++++++++++++++++++ 6 files changed, 650 insertions(+) create mode 100644 projects/ctrgcn/README.md create mode 100644 projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py create mode 100644 projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py create mode 100644 projects/ctrgcn/models/__init__.py create mode 100644 projects/ctrgcn/models/ctrgcn.py create mode 100644 projects/ctrgcn/models/ctrgcn_utils.py diff --git a/projects/ctrgcn/README.md b/projects/ctrgcn/README.md new file mode 100644 index 0000000000..809af449f5 --- /dev/null +++ b/projects/ctrgcn/README.md @@ -0,0 +1,143 @@ +# CTRGCN Project + +[Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition](https://arxiv.org/abs/2107.12213) + + + +## Abstract + + + +Graph convolutional networks (GCNs) have been widely used and achieved remarkable results in skeleton-based action recognition. In GCNs, graph topology dominates feature aggregation and therefore is the key to extracting representative features. In this work, we propose a novel Channel-wise Topology Refinement Graph Convolution (CTR-GC) to dynamically learn different topologies and effectively aggregate joint features in different channels for skeleton-based action recognition. The proposed CTR-GC models channel-wise topologies through learning a shared topology as a generic prior for all channels and refining it with channel-specific correlations for each channel. Our refinement method introduces few extra parameters and significantly reduces the difficulty of modeling channel-wise topologies. Furthermore, via reformulating graph convolutions into a unified form, we find that CTR-GC relaxes strict constraints of graph convolutions, leading to stronger representation capability. Combining CTR-GC with temporal modeling modules, we develop a powerful graph convolutional network named CTR-GCN which notably outperforms state-of-the-art methods on the NTU RGB+D, NTU RGB+D 120, and NW-UCLA datasets. + + + +
+ +
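The `CTRGC` module added by this patch (see `projects/ctrgcn/models/ctrgcn_utils.py` further down) implements this channel-wise topology refinement. As a reading aid, here is a minimal, self-contained sketch of the core idea; the name `TinyCTRGC`, the fixed `alpha`, and the identity topology prior are illustrative assumptions only — the real module learns `alpha`, derives the shared topology from the skeleton graph, and adds proper weight initialization.

```python
import torch
import torch.nn as nn


class TinyCTRGC(nn.Module):
    """Minimal channel-wise topology refinement (illustrative sketch)."""

    def __init__(self, in_channels, out_channels, rel_channels=8):
        super().__init__()
        # Compress features before measuring joint-to-joint correlations.
        self.theta = nn.Conv2d(in_channels, rel_channels, kernel_size=1)
        self.phi = nn.Conv2d(in_channels, rel_channels, kernel_size=1)
        # Value branch and the map from correlation space to output channels.
        self.value = nn.Conv2d(in_channels, out_channels, kernel_size=1)
        self.expand = nn.Conv2d(rel_channels, out_channels, kernel_size=1)
        self.tanh = nn.Tanh()

    def forward(self, x, A, alpha=1.0):
        # x: (N, C, T, V) skeleton features, A: shared (V, V) topology prior.
        q = self.theta(x).mean(dim=-2)  # (N, R, V), temporal average pooling
        k = self.phi(x).mean(dim=-2)    # (N, R, V)
        z = self.value(x)               # (N, C_out, T, V)
        # Channel-specific correlations from pairwise joint differences.
        corr = self.tanh(q.unsqueeze(-1) - k.unsqueeze(-2))  # (N, R, V, V)
        # Refined topology = shared prior + scaled channel-wise refinement.
        refined = A[None, None] + alpha * self.expand(corr)  # (N, C_out, V, V)
        # Aggregate joint features with the refined topologies.
        return torch.einsum('ncuv,nctu->nctv', refined, z)


x = torch.randn(2, 64, 16, 17)  # (batch, channels, frames, joints)
out = TinyCTRGC(64, 64)(x, A=torch.eye(17))
print(out.shape)  # torch.Size([2, 64, 16, 17])
```
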
+ +## Usage + +### Setup Environment + +Please refer to [Get Started](https://mmaction2.readthedocs.io/en/1.x/get_started.html) to install MMAction2. + +At first, add the current folder to `PYTHONPATH`, so that Python can find your code. Run command in the current directory to add it. + +> Please run it every time after you opened a new shell. + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Data Preparation + +Prepare the NTU60 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/1.x/tools/data/skeleton/README.md). + +### Training commands + +**To train with single GPU:** + +```bash +mim train mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py +``` + +**To train with multiple GPUs:** + +```bash +mim train mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher pytorch --gpus 8 +``` + +**To train with multiple GPUs by slurm:** + +```bash +mim train mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +### Testing commands + +**To test with single GPU:** + +```bash +mim test mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT +``` + +**To test with multiple GPUs:** + +```bash +mim test mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8 +``` + +**To test with multiple GPUs by slurm:** + +```bash +mim test mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher slurm \ + --gpus 8 --gpus-per-node 8 --partition $PARTITION +``` + +## Results + +### NTU60_XSub_2D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | +| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: | +| uniform 100 | joint | 8 | CTRGCN | 89.6 | 10 clips | [config](./configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20230308-7aba454e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.log) | + +### NTU60_XSub_3D + +| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log | +| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: | +| uniform 100 | joint | 8 | CTRGCN | 89.0 | 10 clips | [config](./configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d_20230308-950dca0a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.log) | + +## Citation + + + +```bibtex +@inproceedings{chen2021channel, + title={Channel-wise topology refinement 
graph convolution for skeleton-based action recognition}, + author={Chen, Yuxin and Zhang, Ziqi and Yuan, Chunfeng and Li, Bing and Deng, Ying and Hu, Weiming}, + booktitle={CVPR}, + pages={13359--13368}, + year={2021} +} +``` + +## Checklist + +Here is a checklist of this project's progress, and you can ignore this part if you don't plan to contribute to MMAction2 projects. + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + + + + - [x] Basic docstrings & proper citation + + + + - [x] Converted checkpoint and results (Only for reproduction) + + + +- [x] Milestone 2: Indicates a successful model implementation. + + - [x] Training results + + + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Unit tests + + + + - [ ] Code style + + + + - [ ] `metafile.yml` and `README.md` + + diff --git a/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py new file mode 100644 index 0000000000..4dd8629837 --- /dev/null +++ b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py @@ -0,0 +1,104 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='CTRGCN', graph_cfg=dict(layout='coco', mode='spatial')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=256)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_2d.pkl' +train_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize2D'), + dict(type='GenSkeFeat', dataset='coco', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + 
by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py new file mode 100644 index 0000000000..7ae499b4ce --- /dev/null +++ b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py @@ -0,0 +1,104 @@ +_base_ = 'mmaction::_base_/default_runtime.py' + +custom_imports = dict(imports='models') + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='CTRGCN', graph_cfg=dict(layout='nturgb+d', mode='spatial')), + cls_head=dict(type='GCNHead', num_classes=60, in_channels=256)) + +dataset_type = 'PoseDataset' +ann_file = 'data/skeleton/ntu60_3d.pkl' +train_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict(type='UniformSampleFrames', clip_len=100), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='PreNormalize3D'), + dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']), + dict( + type='UniformSampleFrames', clip_len=100, num_clips=10, + test_mode=True), + dict(type='PoseDecode'), + dict(type='FormatGCNInput', num_person=2), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=train_pipeline, + split='xsub_train'))) +val_dataloader = dict( + batch_size=16, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=val_pipeline, + split='xsub_val', + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file, + pipeline=test_pipeline, + split='xsub_val', + test_mode=True)) + +val_evaluator = [dict(type='AccMetric')] +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='CosineAnnealingLR', + eta_min=0, + T_max=16, + by_epoch=True, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict( + type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True)) + +default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by 
default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/projects/ctrgcn/models/__init__.py b/projects/ctrgcn/models/__init__.py new file mode 100644 index 0000000000..71958fdd44 --- /dev/null +++ b/projects/ctrgcn/models/__init__.py @@ -0,0 +1,3 @@ +from .ctrgcn import CTRGCN + +__all__ = ['CTRGCN'] diff --git a/projects/ctrgcn/models/ctrgcn.py b/projects/ctrgcn/models/ctrgcn.py new file mode 100644 index 0000000000..c6056071ea --- /dev/null +++ b/projects/ctrgcn/models/ctrgcn.py @@ -0,0 +1,104 @@ +import torch +import torch.nn as nn +from mmengine.model import BaseModule, ModuleList + +from mmaction.models.utils import Graph, unit_tcn +from mmaction.registry import MODELS +from .ctrgcn_utils import MSTCN, unit_ctrgcn + + +class CTRGCNBlock(BaseModule): + + def __init__(self, + in_channels, + out_channels, + A, + stride=1, + residual=True, + kernel_size=5, + dilations=[1, 2], + tcn_dropout=0): + super(CTRGCNBlock, self).__init__() + self.gcn1 = unit_ctrgcn(in_channels, out_channels, A) + self.tcn1 = MSTCN( + out_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + dilations=dilations, + residual=False, + tcn_dropout=tcn_dropout) + self.relu = nn.ReLU(inplace=True) + if not residual: + self.residual = lambda x: 0 + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + else: + self.residual = unit_tcn( + in_channels, out_channels, kernel_size=1, stride=stride) + + def forward(self, x): + y = self.relu(self.tcn1(self.gcn1(x)) + self.residual(x)) + return y + + +@MODELS.register_module() +class CTRGCN(BaseModule): + + def __init__(self, + graph_cfg, + in_channels=3, + base_channels=64, + num_stages=10, + inflate_stages=[5, 8], + down_stages=[5, 8], + pretrained=None, + num_person=2, + **kwargs): + super(CTRGCN, self).__init__() + + self.graph = Graph(**graph_cfg) + A = torch.tensor( + self.graph.A, dtype=torch.float32, requires_grad=False) + self.register_buffer('A', A) + + self.num_person = num_person + self.base_channels = base_channels + + self.data_bn = nn.BatchNorm1d(num_person * in_channels * A.size(1)) + + kwargs0 = {k: v for k, v in kwargs.items() if k != 'tcn_dropout'} + modules = [ + CTRGCNBlock( + in_channels, + base_channels, + A.clone(), + residual=False, + **kwargs0) + ] + for i in range(2, num_stages + 1): + in_channels = base_channels + out_channels = base_channels * (1 + (i in inflate_stages)) + stride = 1 + (i in down_stages) + modules.append( + CTRGCNBlock( + base_channels, + out_channels, + A.clone(), + stride=stride, + **kwargs)) + base_channels = out_channels + self.net = ModuleList(modules) + + def forward(self, x): + N, M, T, V, C = x.size() + x = x.permute(0, 1, 3, 4, 2).contiguous() + x = self.data_bn(x.view(N, M * V * C, T)) + x = x.view(N, M, V, C, T).permute(0, 1, 3, 4, + 2).contiguous().view(N * M, C, T, V) + + for gcn in self.net: + x = gcn(x) + + x = x.reshape((N, M) + x.shape[1:]) + return x diff --git a/projects/ctrgcn/models/ctrgcn_utils.py b/projects/ctrgcn/models/ctrgcn_utils.py new file mode 100644 index 0000000000..52665e8567 --- /dev/null +++ b/projects/ctrgcn/models/ctrgcn_utils.py @@ -0,0 +1,192 @@ +import torch +import torch.nn as nn +from mmcv.cnn import build_activation_layer +from mmengine.model import BaseModule, ModuleList, Sequential + +from mmaction.models.utils import unit_tcn + + +# ! Notice: The implementation of MSTCN in +# MS-G3D is not the same as our implementation. 
+class MSTCN(BaseModule): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + dilations=[1, 2, 3, 4], + residual=True, + act_cfg=dict(type='ReLU'), + init_cfg=[ + dict(type='Constant', layer='BatchNorm2d', val=1), + dict(type='Kaiming', layer='Conv2d', mode='fan_out') + ], + tcn_dropout=0): + + super().__init__(init_cfg=init_cfg) + # Multiple branches of temporal convolution + self.num_branches = len(dilations) + 2 + branch_channels = out_channels // self.num_branches + branch_channels_rem = out_channels - branch_channels * ( + self.num_branches - 1) + + if type(kernel_size) == list: + assert len(kernel_size) == len(dilations) + else: + kernel_size = [kernel_size] * len(dilations) + + self.branches = ModuleList([ + Sequential( + nn.Conv2d( + in_channels, branch_channels, kernel_size=1, padding=0), + nn.BatchNorm2d(branch_channels), + build_activation_layer(act_cfg), + unit_tcn( + branch_channels, + branch_channels, + kernel_size=ks, + stride=stride, + dilation=dilation), + ) for ks, dilation in zip(kernel_size, dilations) + ]) + + # Additional Max & 1x1 branch + self.branches.append( + Sequential( + nn.Conv2d( + in_channels, branch_channels, kernel_size=1, padding=0), + nn.BatchNorm2d(branch_channels), + build_activation_layer(act_cfg), + nn.MaxPool2d( + kernel_size=(3, 1), stride=(stride, 1), padding=(1, 0)), + nn.BatchNorm2d(branch_channels))) + + self.branches.append( + Sequential( + nn.Conv2d( + in_channels, + branch_channels_rem, + kernel_size=1, + padding=0, + stride=(stride, 1)), nn.BatchNorm2d(branch_channels_rem))) + + # Residual connection + if not residual: + self.residual = lambda x: 0 + elif (in_channels == out_channels) and (stride == 1): + self.residual = lambda x: x + else: + self.residual = unit_tcn( + in_channels, out_channels, kernel_size=1, stride=stride) + + self.act = build_activation_layer(act_cfg) + self.drop = nn.Dropout(tcn_dropout) + + def forward(self, x): + # Input dim: (N,C,T,V) + res = self.residual(x) + branch_outs = [] + for tempconv in self.branches: + out = tempconv(x) + branch_outs.append(out) + + out = torch.cat(branch_outs, dim=1) + out += res + out = self.act(out) + out = self.drop(out) + return out + + +class CTRGC(BaseModule): + + def __init__(self, + in_channels, + out_channels, + rel_reduction=8, + init_cfg=[ + dict(type='Constant', layer='BatchNorm2d', val=1), + dict(type='Kaiming', layer='Conv2d', mode='fan_out') + ]): + super(CTRGC, self).__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + if in_channels <= 16: + self.rel_channels = 8 + else: + self.rel_channels = in_channels // rel_reduction + self.conv1 = nn.Conv2d( + self.in_channels, self.rel_channels, kernel_size=1) + self.conv2 = nn.Conv2d( + self.in_channels, self.rel_channels, kernel_size=1) + self.conv3 = nn.Conv2d( + self.in_channels, self.out_channels, kernel_size=1) + self.conv4 = nn.Conv2d( + self.rel_channels, self.out_channels, kernel_size=1) + self.tanh = nn.Tanh() + + def forward(self, x, A=None, alpha=1): + # Input: N, C, T, V + x1, x2, x3 = self.conv1(x).mean(-2), self.conv2(x).mean( + -2), self.conv3(x) + # X1, X2: N, R, V + # N, R, V, 1 - N, R, 1, V + x1 = self.tanh(x1.unsqueeze(-1) - x2.unsqueeze(-2)) + # N, R, V, V + x1 = self.conv4(x1) * alpha + (A[None, None] if A is not None else 0 + ) # N,C,V,V + x1 = torch.einsum('ncuv,nctu->nctv', x1, x3) + return x1 + + +class unit_ctrgcn(BaseModule): + + def __init__(self, + in_channels, + out_channels, + A, + init_cfg=[ + dict( + type='Constant', + 
layer='BatchNorm2d', + val=1, + override=dict(type='Constant', name='bn', val=1e-6)), + dict(type='Kaiming', layer='Conv2d', mode='fan_out') + ]): + + super(unit_ctrgcn, self).__init__(init_cfg=init_cfg) + inter_channels = out_channels // 4 + self.inter_c = inter_channels + self.out_c = out_channels + self.in_c = in_channels + + self.num_subset = A.shape[0] + self.convs = ModuleList() + + for i in range(self.num_subset): + self.convs.append(CTRGC(in_channels, out_channels)) + + if in_channels != out_channels: + self.down = Sequential( + nn.Conv2d(in_channels, out_channels, 1), + nn.BatchNorm2d(out_channels)) + else: + self.down = lambda x: x + + self.A = nn.Parameter(A.clone()) + + self.alpha = nn.Parameter(torch.zeros(1)) + self.bn = nn.BatchNorm2d(out_channels) + self.soft = nn.Softmax(-2) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + y = None + + for i in range(self.num_subset): + z = self.convs[i](x, self.A[i], self.alpha) + y = z + y if y is not None else z + + y = self.bn(y) + y += self.down(x) + return self.relu(y) From 8c76fbd6eb275df4595561e288ba491a09a2806d Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Tue, 14 Mar 2023 10:57:41 +0800 Subject: [PATCH 14/36] [fix] fix ntu_pose_extraction (#2246) --- ...er-rcnn_r50-caffe_fpn_ms-1x_coco-person.py | 140 ++++++++++++++++++ mmaction/apis/inference.py | 17 ++- mmaction/utils/misc.py | 2 +- tools/data/skeleton/README.md | 12 +- tools/data/skeleton/README_zh-CN.md | 23 ++- tools/data/skeleton/ntu_pose_extraction.py | 121 ++++----------- 6 files changed, 207 insertions(+), 108 deletions(-) create mode 100644 demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py diff --git a/demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py b/demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py new file mode 100644 index 0000000000..934a3a5bc4 --- /dev/null +++ b/demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py @@ -0,0 +1,140 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+model = dict( + type='FasterRCNN', + _scope_='mmdet', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[103.53, 116.28, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type='Pretrained', + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=1, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0.0, 0.0, 0.0, 0.0], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +file_client_args = dict(backend='disk') + +test_pipeline = [ + dict(type='mmdet.LoadImageFromFile', file_client_args=file_client_args), + dict(type='mmdet.Resize', scale=(1333, 800), keep_ratio=True), + dict( + type='mmdet.PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +test_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='CocoDataset', + data_root='data/coco/', + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + metainfo=dict(classes=('person', ), palette=[(220, 20, 60)]))) diff --git a/mmaction/apis/inference.py b/mmaction/apis/inference.py 
index ac014d0350..d0a4c01501 100644 --- a/mmaction/apis/inference.py +++ b/mmaction/apis/inference.py @@ -104,7 +104,8 @@ def detection_inference(det_config: Union[str, Path, mmengine.Config], frame_paths: List[str], det_score_thr: float = 0.9, det_cat_id: int = 0, - device: Union[str, torch.device] = 'cuda:0') -> tuple: + device: Union[str, torch.device] = 'cuda:0', + with_score: bool = False) -> tuple: """Detect human boxes given frame paths. Args: @@ -117,6 +118,8 @@ def detection_inference(det_config: Union[str, Path, mmengine.Config], det_cat_id (int): The category id for human detection. Defaults to 0. device (Union[str, torch.device]): The desired device of returned tensor. Defaults to ``'cuda:0'``. + with_score (bool): Whether to append detection score after box. + Defaults to None. Returns: List[np.ndarray]: List of detected human boxes. @@ -141,10 +144,16 @@ def detection_inference(det_config: Union[str, Path, mmengine.Config], det_data_sample: DetDataSample = inference_detector(model, frame_path) pred_instance = det_data_sample.pred_instances.cpu().numpy() bboxes = pred_instance.bboxes + scores = pred_instance.scores # We only keep human detection bboxs with score larger # than `det_score_thr` and category id equal to `det_cat_id`. - bboxes = bboxes[np.logical_and(pred_instance.labels == det_cat_id, - pred_instance.scores > det_score_thr)] + valid_idx = np.logical_and(pred_instance.labels == det_cat_id, + pred_instance.scores > det_score_thr) + bboxes = bboxes[valid_idx] + scores = scores[valid_idx] + + if with_score: + bboxes = np.concatenate((bboxes, scores[:, None]), axis=-1) results.append(bboxes) data_samples.append(det_data_sample) @@ -187,7 +196,7 @@ def pose_inference(pose_config: Union[str, Path, mmengine.Config], print('Performing Human Pose Estimation for each frame') for f, d in track_iter_progress(list(zip(frame_paths, det_results))): pose_data_samples: List[PoseDataSample] \ - = inference_topdown(model, f, d, bbox_format='xyxy') + = inference_topdown(model, f, d[..., :4], bbox_format='xyxy') pose_data_sample = merge_data_samples(pose_data_samples) pose_data_sample.dataset_meta = model.dataset_meta poses = pose_data_sample.pred_instances.to_dict() diff --git a/mmaction/utils/misc.py b/mmaction/utils/misc.py index f14b8a51c2..bf4358a2f4 100644 --- a/mmaction/utils/misc.py +++ b/mmaction/utils/misc.py @@ -42,7 +42,7 @@ def frame_extract(video_path: str, Args: video_path (str): The video path. short_side (int): Target short-side of the output image. - Defaults to None, means keep original shape. + Defaults to None, means keeping original shape. out_dir (str): The output directory. Defaults to ``'./tmp'``. """ # Load the video, extract frames into OUT_DIR/video_name diff --git a/tools/data/skeleton/README.md b/tools/data/skeleton/README.md index 3ada42e8ef..10244d23a1 100644 --- a/tools/data/skeleton/README.md +++ b/tools/data/skeleton/README.md @@ -26,13 +26,21 @@ We provide links to the pre-processed skeleton annotations, you can directly dow - NTURGB+D 120 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_2d.pkl - NTURGB+D 120 \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_3d.pkl - GYM \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/gym_2d.pkl - - GYM 2D skeletons are extracted with ground-truth human bounding boxes, which can be downloaded with link: https://download.openmmlab.com/mmaction/pyskl/data/gym/gym_gt_bboxes.pkl. 
Please cite [PoseConv3D](https://arxiv.org/abs/2104.13586) if you use it in your project.
+  - GYM 2D skeletons are extracted with ground-truth human bounding boxes, which can be downloaded from this [link](https://download.openmmlab.com/mmaction/pyskl/data/gym/gym_gt_bboxes.pkl). Please cite [PoseConv3D](https://arxiv.org/abs/2104.13586) if you use it in your project.
 - UCF101 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ucf101_2d.pkl
 - HMDB51 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/hmdb51_2d.pkl
 - Diving48 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/diving48_2d.pkl
 - Kinetics400 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/k400_2d.pkl (Table of contents only, no skeleton annotations)

-For Kinetics400, since the skeleton annotations are large, we do not provide the direct download links on aliyun. Please use the following link to download the `kpfiles` and extract it under `$MMACTION2/data/k400` for Kinetics-400 training & testing: https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EeyDCVskqLtClMVVwqD53acBF2FEwkctp3vtRbkLfnKSTw?e=B3SZlM
+For Kinetics400, since the skeleton annotations are large, we do not provide direct download links on Aliyun. Please use the following link to download the `kpfiles` and extract it under `$MMACTION2/data/k400` for Kinetics400 training & testing: https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EeyDCVskqLtClMVVwqD53acBF2FEwkctp3vtRbkLfnKSTw?e=B3SZlM
+
+If you want to generate 2D skeleton annotations for a specific video, please install mmdetection and mmpose first, then use the following script to extract skeleton annotations from an NTURGB+D video:
+
+```shell
+python ntu_pose_extraction.py S001C001P001R001A001_rgb.avi S001C001P001R001A001.pkl
+```
+
+Please note that, due to the upgrade of mmpose, the inference results may differ slightly from the provided skeleton annotations.
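For a quick sanity check of the output, the dumped pickle can be inspected as below — a sketch that assumes the annotation keys written by the updated `ntu_pose_extraction.py` later in this patch, with the file name taken from the command above.

```python
import mmengine

# Inspect the annotation produced by the extraction command (illustrative path).
anno = mmengine.load('S001C001P001R001A001.pkl')
print(anno['frame_dir'], anno['label'], anno['total_frames'])
print(anno['keypoint'].shape)        # expected (num_person, num_frames, 17, 2)
print(anno['keypoint_score'].shape)  # expected (num_person, num_frames, 17)
```
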
## The Format of Annotations diff --git a/tools/data/skeleton/README_zh-CN.md b/tools/data/skeleton/README_zh-CN.md index fb6de5925a..3754175908 100644 --- a/tools/data/skeleton/README_zh-CN.md +++ b/tools/data/skeleton/README_zh-CN.md @@ -33,20 +33,27 @@ bash download_annotations.sh ${DATASET} 对于无法进行姿态提取的用户,这里提供了上述流程的输出结果,分别对应 NTURGB-D 数据集的 4 个部分: -- ntu60_xsub_train: https://download.openmmlab.com/mmaction/posec3d/ntu60_xsub_train.pkl -- ntu60_xsub_val: https://download.openmmlab.com/mmaction/posec3d/ntu60_xsub_val.pkl -- ntu120_xsub_train: https://download.openmmlab.com/mmaction/posec3d/ntu120_xsub_train.pkl -- ntu120_xsub_val: https://download.openmmlab.com/mmaction/posec3d/ntu120_xsub_val.pkl -- hmdb51: https://download.openmmlab.com/mmaction/posec3d/hmdb51.pkl -- ucf101: https://download.openmmlab.com/mmaction/posec3d/ucf101.pkl +- NTURGB+D \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_2d.pkl +- NTURGB+D \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_3d.pkl +- NTURGB+D 120 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_2d.pkl +- NTURGB+D 120 \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_3d.pkl +- GYM \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/gym_2d.pkl + - GYM 2D 姿态标注文件是基于运动员的真实标注框生成的,用户可以从这个[链接](https://download.openmmlab.com/mmaction/pyskl/data/gym/gym_gt_bboxes.pkl)下载真实标注框。如果你在项目中使用了该数据,请引用 [PoseConv3D](https://arxiv.org/abs/2104.13586) +- UCF101 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ucf101_2d.pkl +- HMDB51 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/hmdb51_2d.pkl +- Diving48 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/diving48_2d.pkl +- Kinetics400 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/k400_2d.pkl (只包含数据列表,没有姿态标注文件) -若想生成单个视频的 2D 姿态标注文件,首先,用户需要由源码安装 mmdetection 和 mmpose。之后,用户需要在 `ntu_pose_extraction.py` 中指定 `mmdet_root` 和 `mmpose_root` 变量。 -最后,用户可使用以下脚本进行 NTURGB+D 视频的姿态提取: +由于 Kinetics400 数据集姿态标注文件过大,我们不提供阿里云的下载链接,请使用此[链接](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EeyDCVskqLtClMVVwqD53acBF2FEwkctp3vtRbkLfnKSTw?e=B3SZlM)下载 `kpfiles`,解压到 `$MMACTION2/data/k400` 目录下,用于 Kinetics400 的训练和测试。 + +若想生成单个视频的 2D 姿态标注文件,用户在安装 mmdetection 和 mmpose 之后,可使用以下脚本进行 NTURGB+D 视频的姿态提取: ```python python ntu_pose_extraction.py S001C001P001R001A001_rgb.avi S001C001P001R001A001.pkl ``` +请注意,由于 mmpose 算法库升级,此脚本的推理结果与提供的姿态点数据集可能略有差异。 + 在用户获得数据集某部分所有视频的姿态标注文件(如 `ntu60_xsub_val`)后,可以将其集合成一个 list 数据并保存为 `ntu60_xsub_val.pkl`。用户可用这些大型 pickle 文件进行训练和测试。 ## PoseC3D 的标注文件格式 diff --git a/tools/data/skeleton/ntu_pose_extraction.py b/tools/data/skeleton/ntu_pose_extraction.py index 17af16e749..d60fefdd97 100644 --- a/tools/data/skeleton/ntu_pose_extraction.py +++ b/tools/data/skeleton/ntu_pose_extraction.py @@ -1,82 +1,24 @@ # Copyright (c) OpenMMLab. All rights reserved. import abc import argparse -import os import os.path as osp -import random as rd -import shutil -import string from collections import defaultdict +from tempfile import TemporaryDirectory -import cv2 -import mmcv +import mmengine import numpy as np -try: - from mmdet.apis import inference_detector, init_detector -except (ImportError, ModuleNotFoundError): - raise ImportError('Failed to import `inference_detector` and ' - '`init_detector` form `mmdet.apis`. These apis are ' - 'required in this script! 
') - -try: - from mmpose.apis import inference_top_down_pose_model, init_pose_model -except (ImportError, ModuleNotFoundError): - raise ImportError('Failed to import `inference_top_down_pose_model` and ' - '`init_pose_model` form `mmpose.apis`. These apis are ' - 'required in this script! ') - -mmdet_root = '' -mmpose_root = '' +from mmaction.apis import detection_inference, pose_inference +from mmaction.utils import frame_extract args = abc.abstractproperty() -args.det_config = f'{mmdet_root}/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py' # noqa: E501 +args.det_config = 'demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py' # noqa: E501 args.det_checkpoint = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501 args.det_score_thr = 0.5 -args.pose_config = f'{mmpose_root}/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrnet_w32_coco_256x192.py' # noqa: E501 +args.pose_config = 'demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py' # noqa: E501 args.pose_checkpoint = 'https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth' # noqa: E501 -def gen_id(size=8): - chars = string.ascii_uppercase + string.digits - return ''.join(rd.choice(chars) for _ in range(size)) - - -def extract_frame(video_path): - dname = gen_id() - os.makedirs(dname, exist_ok=True) - frame_tmpl = osp.join(dname, 'img_{:05d}.jpg') - vid = cv2.VideoCapture(video_path) - frame_paths = [] - flag, frame = vid.read() - cnt = 0 - while flag: - frame_path = frame_tmpl.format(cnt + 1) - frame_paths.append(frame_path) - - cv2.imwrite(frame_path, frame) - cnt += 1 - flag, frame = vid.read() - - return frame_paths - - -def detection_inference(args, frame_paths): - model = init_detector(args.det_config, args.det_checkpoint, args.device) - assert model.CLASSES[0] == 'person', ('We require you to use a detector ' - 'trained on COCO') - results = [] - print('Performing Human Detection for each frame') - prog_bar = mmcv.ProgressBar(len(frame_paths)) - for frame_path in frame_paths: - result = inference_detector(model, frame_path) - # We only keep human detections with score larger than det_score_thr - result = result[0][result[0][:, 4] >= args.det_score_thr] - results.append(result) - prog_bar.update() - return results - - def intersection(b0, b1): l, r = max(b0[0], b1[0]), min(b0[2], b1[2]) u, d = max(b0[1], b1[1]), min(b0[3], b1[3]) @@ -227,7 +169,7 @@ def tracklets2bbox(tracklet, num_frame): mind = np.abs(k - idx) mink = k bbox[idx] = bboxd[mink] - return bad, bbox + return bad, bbox[:, None, :] def bboxes2bbox(bbox, num_frame): @@ -287,41 +229,34 @@ def ntu_det_postproc(vid, det_results): return bboxes2bbox(det_results, len(det_results)) -def pose_inference(args, frame_paths, det_results): - model = init_pose_model(args.pose_config, args.pose_checkpoint, - args.device) - print('Performing Human Pose Estimation for each frame') - prog_bar = mmcv.ProgressBar(len(frame_paths)) - - num_frame = len(det_results) - num_person = max([len(x) for x in det_results]) - kp = np.zeros((num_person, num_frame, 17, 3), dtype=np.float32) - - for i, (f, d) in enumerate(zip(frame_paths, det_results)): - # Align input format - d = [dict(bbox=x) for x in list(d) if x[-1] > 0.5] - pose = inference_top_down_pose_model(model, f, d, format='xyxy')[0] - for j, item in enumerate(pose): - kp[j, i] = item['keypoints'] - prog_bar.update() - return 
kp - - def ntu_pose_extraction(vid, skip_postproc=False): - frame_paths = extract_frame(vid) - det_results = detection_inference(args, frame_paths) + tmp_dir = TemporaryDirectory() + frame_paths, _ = frame_extract(vid, out_dir=tmp_dir.name) + det_results, _ = detection_inference( + args.det_config, + args.det_checkpoint, + frame_paths, + args.det_score_thr, + device=args.device, + with_score=True) + if not skip_postproc: det_results = ntu_det_postproc(vid, det_results) - pose_results = pose_inference(args, frame_paths, det_results) + pose_results, _ = pose_inference(args.pose_config, args.pose_checkpoint, + frame_paths, det_results, args.device) + anno = dict() - anno['keypoint'] = pose_results[..., :2] - anno['keypoint_score'] = pose_results[..., 2] + anno['keypoint'] = np.stack( + [pose['keypoints'].astype(np.float32) for pose in pose_results], + axis=1) + anno['keypoint_score'] = np.stack( + [pose['keypoint_scores'] for pose in pose_results], axis=1) anno['frame_dir'] = osp.splitext(osp.basename(vid))[0] anno['img_shape'] = (1080, 1920) anno['original_shape'] = (1080, 1920) - anno['total_frames'] = pose_results.shape[1] + anno['total_frames'] = len(pose_results) anno['label'] = int(osp.basename(vid).split('A')[1][:3]) - 1 - shutil.rmtree(osp.dirname(frame_paths[0])) + tmp_dir.cleanup() return anno @@ -344,4 +279,4 @@ def parse_args(): args.output = global_args.output args.skip_postproc = global_args.skip_postproc anno = ntu_pose_extraction(args.video, args.skip_postproc) - mmcv.dump(anno, args.output) + mmengine.dump(anno, args.output) From d6dd49137d9f6b284ff45929a397294a789fc9f3 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 16 Mar 2023 20:56:18 +0800 Subject: [PATCH 15/36] [doc] cancel compile pdf docs (#2302) --- .readthedocs.yml | 3 ++- tests/models/recognizers/test_recognizer2d.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index 6cfbf5d310..070c61832b 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,6 +1,7 @@ version: 2 -formats: all +formats: + - epub python: version: 3.7 diff --git a/tests/models/recognizers/test_recognizer2d.py b/tests/models/recognizers/test_recognizer2d.py index 300e63b460..a1c8ef4b1f 100644 --- a/tests/models/recognizers/test_recognizer2d.py +++ b/tests/models/recognizers/test_recognizer2d.py @@ -190,7 +190,7 @@ def test_tpn(): recognizer = MODELS.build(config.model) - input_shape = (1, 8, 3, 224, 224) + input_shape = (1, 8, 3, 32, 32) demo_inputs = generate_recognizer_demo_inputs(input_shape) imgs = demo_inputs['imgs'] From dabe21abadce5435c85c45454083375ffa848082 Mon Sep 17 00:00:00 2001 From: LinXiaoZheng <90811472+Zheng-LinXiao@users.noreply.github.com> Date: Wed, 22 Mar 2023 14:05:57 +0800 Subject: [PATCH 16/36] [Docs] Add the docs about readme_zh-CN.md (#2252) --- README.md | 23 +++- README_zh-CN.md | 317 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 337 insertions(+), 3 deletions(-) create mode 100644 README_zh-CN.md diff --git a/README.md b/README.md index ab41e0f96e..f3a575f4ce 100644 --- a/README.md +++ b/README.md @@ -48,10 +48,12 @@ +English | [简体中文](/README_zh-CN.md) + ## Introduction MMAction2 is an open-source toolbox for video understanding based on PyTorch. -It is a part of the [OpenMMLab](http://openmmlab.org/) project. +It is a part of the [OpenMMLab](http://openmmlab.com/) project. The 1.x branch works with **PyTorch 1.6+**. @@ -84,7 +86,7 @@ The 1.x branch works with **PyTorch 1.6+**. 
## What's New -**Release (2022.02.10)**: v1.0.0rc3 with the following new features: +**Release (2023.02.10)**: v1.0.0rc3 with the following new features: - Support Action Recognition model UniFormer V1(ICLR'2022), UniFormer V2(Arxiv'2022). - Support training MViT V2(CVPR'2022), and MaskFeat(CVPR'2022) fine-tuning. @@ -94,6 +96,20 @@ The 1.x branch works with **PyTorch 1.6+**. Please refer to [install.md](https://mmaction2.readthedocs.io/en/1.x/get_started.html) for more detailed instructions. +```shell +conda create --name openmmlab python=3.8 -y +conda activate open-mmlab +conda install pytorch torchvision -c pytorch # This command will automatically install the latest version PyTorch and cudatoolkit, please check whether they match your environment. +pip install -U openmim +mim install mmengine 'mmcv>=2.0.0rc1' +mim install "mmdet>=3.0.0rc5" # optional +mim install "mmpose>=1.0.0rc0" # optional +git clone https://github.com/open-mmlab/mmaction2.git +cd mmaction2 +git checkout 1.x +pip3 install -e . +``` + ## Supported Methods @@ -271,7 +287,7 @@ If you find this project useful in your research, please consider cite: ## Contributing -We appreciate all contributions to improve MMAction2. Please refer to [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/1.x/CONTRIBUTING.md) in MMCV for more details about the contributing guideline. +We appreciate all contributions to improve MMAction2. Please refer to [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/2.x/CONTRIBUTING.md) in MMCV for more details about the contributing guideline. ## Acknowledgement @@ -287,6 +303,7 @@ We wish that the toolbox and benchmark could serve the growing research communit - [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark. - [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. - [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. +- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark. - [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark. - [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. - [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox. diff --git a/README_zh-CN.md b/README_zh-CN.md new file mode 100644 index 0000000000..5d0d091cd1 --- /dev/null +++ b/README_zh-CN.md @@ -0,0 +1,317 @@ +
+ +
 
+
+ OpenMMLab 官网 + + + HOT + + +      + OpenMMLab 开放平台 + + + TRY IT OUT + + +
+ +[![Documentation](https://readthedocs.org/projects/mmaction2/badge/?version=latest)](https://mmaction2.readthedocs.io/en/1.x/) +[![actions](https://github.com/open-mmlab/mmaction2/workflows/build/badge.svg)](https://github.com/open-mmlab/mmaction2/actions) +[![codecov](https://codecov.io/gh/open-mmlab/mmaction2/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmaction2) +[![PyPI](https://img.shields.io/pypi/v/mmaction2)](https://pypi.org/project/mmaction2/) +[![LICENSE](https://img.shields.io/github/license/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/blob/master/LICENSE) +[![Average time to resolve an issue](https://isitmaintained.com/badge/resolution/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues) +[![Percentage of issues still open](https://isitmaintained.com/badge/open/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues) + +[📘文档](https://mmaction2.readthedocs.io/zh_CN//1.x/) | +[🛠️安装指南](https://mmaction2.readthedocs.io/zh_CN//1.x/get_started.html) | +[👀模型库](https://mmaction2.readthedocs.io/zh_CN//1.x/modelzoo.html) | +[🆕更新](https://mmaction2.readthedocs.io/zh_CN/1.x/notes/changelog.html) | +[🚀进行中项目](https://github.com/open-mmlab/mmaction2/projects) | +[🤔问题反馈](https://github.com/open-mmlab/mmaction2/issues/new/choose) + +
+ +
+ + + + + + + + + + + +
+ +[English](/README.md) | 简体中文 + +## 简介 + +MMAction2 是一款基于 PyTorch 的视频理解开源工具箱,是 [OpenMMLab](https://openmmlab.com/) 项目的成员之一 + +1.x 分支代码目前支持 **PyTorch 1.6以上** 的版本 + +
+
+
+

Kinetics-400 上的动作识别

+
+
+
+

NTURGB+D-120 上的基于人体姿态的动作识别

+
+
+
+
+

Kinetics-400 上的基于 skeleton 的时空动作检测和动作识别

+
+
+
+

AVA-2.1 上的时空动作检测

+
+ +## 主要特性 + +- **模块设计**:MMAction2 将统一的视频理解框架解耦成不同的模块组件,通过组合不同的模块组件,用户可以便捷地构建自定义的视频理解模型 + +- **支持多种任务和数据集**:MMAction2 支持多种视频理解任务,包括动作识别,时序动作检测,时空动作检测以及基于人体姿态的动作识别 + +- **详尽的单元测试和文档**:MMAction2 提供了详尽的说明文档,API 接口说明,全面的单元测试,以供社区参考 + +## 更新记录 + +**v1.0.0rc3 版本 (2023.02.10)**: + +- 支持动作识别模型 UniFormer V1(ICLR'2022),UniFormer V2(Arxiv'2022) +- 支持训练 MViT V2(CVPR'2022)和 MaskFeat(CVPR'2022)微调 +- 为 MMAction2 模型提供统一的推理接口实现视频分析任务的快速预测 ([demo](https://github.com/open-mmlab/mmaction2/blob/dev-1.x/demo/README.md#inferencer)) + +## 安装 + +MMAction2 依赖 [PyTorch](https://pytorch.org/), [MMCV](https://github.com/open-mmlab/mmcv), [MMEngine](https://github.com/open-mmlab/mmengine), [MMDetection](https://github.com/open-mmlab/mmdetection) (可选), [MMPose](https://github.com/open-mmlab/mmpose) (可选),以下是安装的简要步骤。 +更详细的安装指南请参考 [install.md](https://mmaction2.readthedocs.io/zh_CN/1.x/get_started.html) 。 + +```shell +conda create --name openmmlab python=3.8 -y +conda activate open-mmlab +conda install pytorch torchvision -c pytorch # 以上命令将自动安装最新版本的 PyTorch 和 cudatoolkit,请检查它们是否和你的环境匹配 +pip install -U openmim +mim install mmengine 'mmcv>=2.0.0rc1' +mim install "mmdet>=3.0.0rc5" # 可选 +mim install "mmpose>=1.0.0rc0" # 可选 +git clone https://github.com/open-mmlab/mmaction2.git +cd mmaction2 +git checkout 1.x +pip3 install -e . +``` + +## 模型库 + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
行为识别方法
C3D (CVPR'2014)TSN (ECCV'2016)I3D (CVPR'2017)C2D (CVPR'2018)I3D Non-Local (CVPR'2018)
R(2+1)D (CVPR'2018)TRN (ECCV'2018)TSM (ICCV'2019)TSM Non-Local (ICCV'2019)SlowOnly (ICCV'2019)
SlowFast (ICCV'2019)CSN (ICCV'2019)TIN (AAAI'2020)TPN (CVPR'2020)X3D (CVPR'2020)
MultiModality: Audio (ArXiv'2020)TANet (ArXiv'2020)TimeSformer (ICML'2021)VideoSwin (CVPR'2022)VideoMAE (NeurIPS'2022)
MViT V2 (CVPR'2022)UniFormer V1 (ICLR'2022)UniFormer V2 (Arxiv'2022)
时序动作检测方法
SSN (ICCV'2017)BSN (ECCV'2018)BMN (ICCV'2019)
时空动作检测方法
ACRN (ECCV'2018)SlowOnly+Fast R-CNN (ICCV'2019)SlowFast+Fast R-CNN (ICCV'2019)LFB (CVPR'2019)
基于骨骼点的动作识别方法
ST-GCN (AAAI'2018)2s-AGCN (CVPR'2019)PoseC3D (CVPR'2022)STGCN++ (ArXiv'2022)
+ +各个模型的结果和设置都可以在对应的 config 目录下的 *README_zh-CN.md* 中查看。整体的概况也可也在 [**模型库**](https://mmaction2.readthedocs.io/zh_CN/1.x/modelzoo.html) 页面中查看。 + +MMAction2 将跟进学界的最新进展,并支持更多算法和框架。如果您对 MMAction2 有任何功能需求,请随时在 [问题](https://github.com/open-mmlab/mmaction2/issues/19) 中留言。 + +## 数据集 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
动作识别数据集
HMDB51 (Homepage) (ICCV'2011)UCF101 (Homepage) (CRCV-IR-12-01)ActivityNet (Homepage) (CVPR'2015)Kinetics-[400/600/700] (Homepage) (CVPR'2017)
SthV1 (ICCV'2017)SthV2 (Homepage) (ICCV'2017)Diving48 (Homepage) (ECCV'2018)Jester (Homepage) (ICCV'2019)
Moments in Time (Homepage) (TPAMI'2019)Multi-Moments in Time (Homepage) (ArXiv'2019)HVU (Homepage) (ECCV'2020)OmniSource (Homepage) (ECCV'2020)
FineGYM (Homepage) (CVPR'2020)
时序动作检测数据集
THUMOS14 (Homepage) (THUMOS Challenge 2014)ActivityNet (Homepage) (CVPR'2015)
时空动作检测数据集
UCF101-24* (Homepage) (CRCV-IR-12-01)JHMDB* (Homepage) (ICCV'2015)AVA (Homepage) (CVPR'2018)AVA-Kinetics (Homepage) (Arxiv'2020)
基于骨骼点的动作识别数据集
PoseC3D-FineGYM (Homepage) (ArXiv'2021)PoseC3D-NTURGB+D (Homepage) (ArXiv'2021)PoseC3D-UCF101 (Homepage) (ArXiv'2021)PoseC3D-HMDB51 (Homepage) (ArXiv'2021)
+ +标记 * 代表对应数据集并未被完全支持,但提供相应的数据准备步骤。整体的概况也可也在 [**数据集**](https://mmaction2.readthedocs.io/zh_CN/latest/supported_datasets.html) 页面中查看。 + +## 数据集准备 + +请参考 [数据准备](https://mmaction2.readthedocs.io/en/1.x/user_guides/2_data_prepare.html) 了解数据集准备概况。所有支持的数据集都列于 [数据集清单](https://mmaction2.readthedocs.io/zh_CN/latest/supported_datasets.html) 中。 + +## FAQ + +请参考 [FAQ](docs/zh_cn/notes/faq.md) 了解其他用户的常见问题。 + +## 相关工作 + +目前有许多研究工作或工程项目基于 MMAction2 搭建,例如: + +- Video Swin Transformer. [\[论文\]](https://arxiv.org/abs/2106.13230)[\[代码\]](https://github.com/SwinTransformer/Video-Swin-Transformer) +- Evidential Deep Learning for Open Set Action Recognition, ICCV 2021 **Oral**. [\[论文\]](https://arxiv.org/abs/2107.10161)[\[代码\]](https://github.com/Cogito2012/DEAR) +- Rethinking Self-supervised Correspondence Learning: A Video Frame-level Similarity Perspective, ICCV 2021 **Oral**. [\[论文\]](https://arxiv.org/abs/2103.17263)[\[代码\]](https://github.com/xvjiarui/VFS) + +更多详情可见 [相关工作](docs/en/notes/projects.md) 。 + +## 许可 + +该项目开源自 [Apache 2.0 license](LICENSE). + +## 引用 + +如果你觉得 MMAction2 对你的研究有所帮助,可以考虑引用它: + +```BibTeX +@misc{2020mmaction2, + title={OpenMMLab's Next Generation Video Understanding Toolbox and Benchmark}, + author={MMAction2 Contributors}, + howpublished = {\url{https://github.com/open-mmlab/mmaction2}}, + year={2020} +} +``` + +## 参与贡献 + +我们非常欢迎用户对于 MMAction2 做出的任何贡献,可以参考 [贡献指南](https://github.com/open-mmlab/mmcv/blob/2.x/CONTRIBUTING_zh-CN.md) 文件了解更多细节。 + +## 致谢 + +MMAction2 是一款由不同学校和公司共同贡献的开源项目。我们感谢所有为项目提供算法复现和新功能支持的贡献者,以及提供宝贵反馈的用户。 +我们希望该工具箱和基准测试可以为社区提供灵活的代码工具,供用户复现现有算法并开发自己的新模型,从而不断为开源社区提供贡献。 + +## OpenMMLab 的其他项目 + +- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models. +- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision. +- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages. +- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark. +- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. +- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. +- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO 系列工具箱和基准测试 +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark. +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox. +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. +- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark. +- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark. +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark. +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark. +- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. 
+- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark. +- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. +- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework. From 56e237df21e38aa0174267dfdf213e9e14b123ac Mon Sep 17 00:00:00 2001 From: wxDai Date: Wed, 22 Mar 2023 15:40:56 +0800 Subject: [PATCH 17/36] [Update] update detection related folders (#2262) --- .../detection/_base_/models/slowonly_r50.py | 54 ----- .../_base_/models/slowonly_r50_nl.py | 51 ---- configs/detection/acrn/README.md | 42 +--- configs/detection/acrn/metafile.yml | 18 +- ...ned-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py | 69 ++++-- ...ned-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py | 109 ++++++++- configs/detection/ava/README.md | 125 ---------- configs/detection/ava/metafile.yml | 227 ------------------ ...pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py | 50 ---- ...etrained-r101_8xb16-8x8x1-20e_ava21-rgb.py | 72 ------ ...ained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py | 16 -- ...rained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py | 74 ------ ...etrained-r50_8xb16-4x16x1-20e_ava21-rgb.py | 9 - configs/detection/ava_kinetics/README.md | 103 -------- configs/detection/slowfast/README.md | 96 ++++++++ configs/detection/slowfast/metafile.yml | 121 ++++++++++ ...-r50-context_8xb16-4x16x1-20e_ava21-rgb.py | 0 ...ral-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py | 0 ...etrained-r50_8xb16-4x16x1-20e_ava21-rgb.py | 90 ++++++- ...ned-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py | 86 ++++++- ...pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py | 171 +++++++++++++ ...-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py | 0 configs/detection/slowonly/README.md | 126 ++++++++++ configs/detection/slowonly/metafile.yml | 102 ++++++++ ...re-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py | 65 ++++- ...pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py | 60 ++++- ...nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py | 0 ...d_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py | 0 ...ral-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py | 0 ...context_8xb8-8x8x1-10e_ava-kinetics-rgb.py | 0 ...8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py | 17 +- ...re-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py | 0 ...pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py | 0 ...etrained-r101_8xb16-8x8x1-20e_ava21-rgb.py | 151 ++++++++++++ ...ained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py | 160 ++++++++++++ ...rained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py | 159 ++++++++++++ ...etrained-r50_8xb16-4x16x1-20e_ava21-rgb.py | 69 +++++- ...etrained-r50_8xb16-4x16x1-20e_ava21-rgb.py | 153 ++++++++++++ .../models/backbones/resnet3d_slowonly.py | 10 - model-index.yml | 3 +- 40 files changed, 1746 insertions(+), 912 deletions(-) delete mode 100644 configs/detection/_base_/models/slowonly_r50.py delete mode 100644 configs/detection/_base_/models/slowonly_r50_nl.py delete mode 100644 configs/detection/ava/README.md delete mode 100644 configs/detection/ava/metafile.yml delete mode 100644 configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py delete mode 100644 configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py delete mode 100644 configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py delete mode 100644 configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py delete mode 100644 
configs/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py delete mode 100644 configs/detection/ava_kinetics/README.md create mode 100644 configs/detection/slowfast/README.md create mode 100644 configs/detection/slowfast/metafile.yml rename configs/detection/{ava => slowfast}/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py (100%) rename configs/detection/{ava => slowfast}/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py (100%) rename configs/detection/{ava => slowfast}/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py (52%) rename configs/detection/{ava => slowfast}/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py (52%) create mode 100644 configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py rename configs/detection/{ava => slowfast}/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py (100%) create mode 100644 configs/detection/slowonly/README.md create mode 100644 configs/detection/slowonly/metafile.yml rename configs/detection/{ava_kinetics => slowonly}/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py (65%) rename configs/detection/{ava_kinetics => slowonly}/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py (70%) rename configs/detection/{ava_kinetics => slowonly}/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py (100%) rename configs/detection/{ava_kinetics => slowonly}/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py (100%) rename configs/detection/{ava_kinetics => slowonly}/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py (100%) rename configs/detection/{ava_kinetics => slowonly}/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py (100%) rename configs/detection/{ava_kinetics => slowonly}/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py (85%) rename configs/detection/{ava_kinetics => slowonly}/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py (100%) rename configs/detection/{ava_kinetics => slowonly}/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py (100%) create mode 100644 configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py create mode 100644 configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py create mode 100644 configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py rename configs/detection/{ava => slowonly}/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py (56%) create mode 100644 configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py diff --git a/configs/detection/_base_/models/slowonly_r50.py b/configs/detection/_base_/models/slowonly_r50.py deleted file mode 100644 index 4a06a4ab53..0000000000 --- a/configs/detection/_base_/models/slowonly_r50.py +++ /dev/null @@ -1,54 +0,0 @@ -url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' - 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-' - 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' - 'kinetics400-rgb_20220901-e7b65fad.pth') - -model = dict( - type='FastRCNN', - _scope_='mmdet', - init_cfg=dict(type='Pretrained', checkpoint=url), - backbone=dict( - type='ResNet3dSlowOnly', - depth=50, - pretrained=None, - pretrained2d=False, - 
lateral=False, - num_stages=4, - conv1_kernel=(1, 7, 7), - conv1_stride_t=1, - pool1_stride_t=1, - spatial_strides=(1, 2, 2, 1)), - roi_head=dict( - type='AVARoIHead', - bbox_roi_extractor=dict( - type='SingleRoIExtractor3D', - roi_layer_type='RoIAlign', - output_size=8, - with_temporal_pool=True), - bbox_head=dict( - type='BBoxHeadAVA', - in_channels=2048, - num_classes=81, - multilabel=True, - dropout_ratio=0.5)), - data_preprocessor=dict( - type='ActionDataPreprocessor', - _scope_='mmaction', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - format_shape='NCTHW'), - train_cfg=dict( - rcnn=dict( - assigner=dict( - type='MaxIoUAssignerAVA', - pos_iou_thr=0.9, - neg_iou_thr=0.9, - min_pos_iou=0.9), - sampler=dict( - type='RandomSampler', - num=32, - pos_fraction=1, - neg_pos_ub=-1, - add_gt_as_proposals=True), - pos_weight=1.0)), - test_cfg=dict(rcnn=None)) diff --git a/configs/detection/_base_/models/slowonly_r50_nl.py b/configs/detection/_base_/models/slowonly_r50_nl.py deleted file mode 100644 index 6dcdc30bfc..0000000000 --- a/configs/detection/_base_/models/slowonly_r50_nl.py +++ /dev/null @@ -1,51 +0,0 @@ -# model setting -model = dict( - type='mmdet.FastRCNN', - backbone=dict( - type='ResNet3dSlowOnly', - depth=50, - pretrained=None, - pretrained2d=False, - lateral=False, - num_stages=4, - conv1_kernel=(1, 7, 7), - conv1_stride_t=1, - pool1_stride_t=1, - spatial_strides=(1, 2, 2, 1), - norm_cfg=dict(type='BN3d', requires_grad=True), - non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)), - non_local_cfg=dict( - sub_sample=True, - use_scale=True, - norm_cfg=dict(type='BN3d', requires_grad=True), - mode='embedded_gaussian')), - roi_head=dict( - type='AVARoIHead', - bbox_roi_extractor=dict( - type='SingleRoIExtractor3D', - roi_layer_type='RoIAlign', - output_size=8, - with_temporal_pool=True), - bbox_head=dict( - type='BBoxHeadAVA', - in_channels=2048, - num_classes=81, - multilabel=True, - dropout_ratio=0.5)), - train_cfg=dict( - rcnn=dict( - assigner=dict( - type='MaxIoUAssignerAVA', - pos_iou_thr=0.9, - neg_iou_thr=0.9, - min_pos_iou=0.9, - iou_calculator=dict(type='mmdet.BboxOverlaps2D')), - sampler=dict( - type='mmdet.RandomSampler', - num=32, - pos_fraction=1, - neg_pos_ub=-1, - add_gt_as_proposals=True), - pos_weight=1.0, - debug=False)), - test_cfg=dict(rcnn=None)) diff --git a/configs/detection/acrn/README.md b/configs/detection/acrn/README.md index a9af00da0c..d08efb6d2d 100644 --- a/configs/detection/acrn/README.md +++ b/configs/detection/acrn/README.md @@ -20,23 +20,19 @@ Current state-of-the-art approaches for spatio-temporal action localization rely ### AVA2.1 -| frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | gpu_mem(M) | config | ckpt | log | -| :---------------------: | :--------: | :--: | :---------------: | :----------: | :---: | :--------: | :---------------------------------------: | :-------------------------------------: | :-------------------------------------: | -| 8x8x1 | raw | 8 | SlowFast ResNet50 | Kinetics-400 | 27.58 | 15263 | [config](/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb_20220906-0dae1a90.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.log) |
+| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log |
+| :---------------------: | :--: | :---------------: | :----------: | :---: | :----------------------------------------------: | :--------------------------------------------: | :-------------------------------------------: |
+| 8x8x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 27.65 | [config](/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb_20220906-0dae1a90.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.log) |

### AVA2.2

-| frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | gpu_mem(M) | config | ckpt | log |
-| :---------------------: | :--------: | :--: | :---------------: | :----------: | :---: | :--------: | :---------------------------------------: | :-------------------------------------: | :-------------------------------------: |
-| 8x8x1 | raw | 8 | SlowFast ResNet50 | Kinetics-400 | 27.63 | 15263 | [config](/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb_20220906-66ec24a2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.log) |
+| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log |
+| :---------------------: | :--: | :---------------: | :----------: | :---: | :----------------------------------------------: | :--------------------------------------------: | :-------------------------------------------: |
+| 8x8x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 27.71 | [config](/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb_20220906-66ec24a2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.log) |

-Note:
+1. The **gpus** indicates the number of GPUs we used to get the checkpoint. If you want to use a different number of GPUs or videos per GPU, the best way is to set `--auto-scale-lr` when calling `tools/train.py`; this parameter will auto-scale the learning rate according to the actual batch size and the original batch size.

-1. The **gpus** indicates the number of gpu we used to get the checkpoint. 
- According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU,
-  e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
-
-For more details on data preparation, you can refer to to [AVA Data Preparation](/tools/data/ava/README.md).
+For more details on data preparation, you can refer to [AVA](/tools/data/ava/README.md).

## Train

@@ -46,14 +42,14 @@ You can use the following command to train a model.
python tools/train.py ${CONFIG_FILE} [optional arguments]
```

-Example: train ACRN with SlowFast backbone on AVA in a deterministic option.
+Example: train ACRN with the SlowFast backbone on AVA2.1, using deterministic training and periodic validation.

```shell
python tools/train.py configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py \
-    --cfg-options randomness.seed=0 randomness.deterministic=True
+    --seed 0 --deterministic
```

-For more details and optional arguments infos, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).

## Test

@@ -63,29 +59,17 @@ You can use the following command to test a model.
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```

-Example: test ACRN with SlowFast backbone on AVA and dump the result to a pkl file.
+Example: test ACRN with the SlowFast backbone on AVA2.1 and dump the result to a pkl file.

```shell
python tools/test.py configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py \
    checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```

-For more details and optional arguments infos, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
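
For reference, a minimal sketch of the multi-GPU counterparts of the train and test commands above, assuming the standard `tools/dist_train.sh` and `tools/dist_test.sh` launchers shipped with OpenMMLab projects; the GPU count below is a placeholder, and `--auto-scale-lr` is the option mentioned in the note above:

```shell
# Distributed training on 4 GPUs; --auto-scale-lr rescales the base learning
# rate (tuned for 8 GPUs x 8 videos per GPU, per the 8xb8 config name) to the
# actual total batch size.
bash tools/dist_train.sh configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py 4 \
    --auto-scale-lr

# Distributed testing on 4 GPUs, dumping predictions to a pkl file.
bash tools/dist_test.sh configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py \
    checkpoints/SOME_CHECKPOINT.pth 4 --dump result.pkl
```
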
## Citation - - -```BibTeX -@inproceedings{gu2018ava, - title={Ava: A video dataset of spatio-temporally localized atomic visual actions}, - author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others}, - booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, - pages={6047--6056}, - year={2018} -} -``` - ```BibTeX @inproceedings{sun2018actor, title={Actor-centric relation network}, diff --git a/configs/detection/acrn/metafile.yml b/configs/detection/acrn/metafile.yml index 3212cb7dc8..9db11da474 100644 --- a/configs/detection/acrn/metafile.yml +++ b/configs/detection/acrn/metafile.yml @@ -1,9 +1,9 @@ Collections: -- Name: ACRN - README: configs/detection/acrn/README.md - Paper: - URL: https://arxiv.org/abs/1807.10982 - Title: "Actor-Centric Relation Network" + - Name: ACRN + README: configs/detection/acrn/README.md + Paper: + URL: https://arxiv.org/abs/1807.10982 + Title: "Actor-Centric Relation Network" Models: - Name: slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb @@ -14,7 +14,6 @@ Models: Batch Size: 8 Epochs: 10 Pretrained: Kinetics-400 - Resolution: short-side 320 Training Data: AVA v2.1 Training Resources: 8 GPUs Modality: RGB @@ -22,7 +21,7 @@ Models: - Dataset: AVA v2.1 Task: Action Detection Metrics: - mAP: 27.58 + mAP: 27.65 Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb_20220906-0dae1a90.pth @@ -34,14 +33,13 @@ Models: Batch Size: 8 Epochs: 10 Pretrained: Kinetics-400 - Resolution: short-side 320 Training Data: AVA v2.2 Training Resources: 8 GPUs Modality: RGB Results: - - Dataset: AVA v2.1 + - Dataset: AVA v2.2 Task: Action Detection Metrics: - mAP: 27.63 + mAP: 27.71 Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb_20220906-66ec24a2.pth diff --git a/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py b/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py index 641364bcce..10928a96ee 100644 --- a/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py +++ b/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py @@ -1,16 +1,16 @@ -_base_ = [ - '../../_base_/default_runtime.py', '../_base_/models/slowonly_r50.py' -] +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth') model = dict( + type='FastRCNN', + _scope_='mmdet', + 
init_cfg=dict(type='Pretrained', checkpoint=url), backbone=dict( - _delete_=True, - type='ResNet3dSlowFast', - _scope_='mmaction', - pretrained=( - 'https://download.openmmlab.com/mmaction/recognition/slowfast/' - 'slowfast_r50_8x8x1_256e_kinetics400_rgb/' - 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth'), + type='mmaction.ResNet3dSlowFast', + pretrained=None, resample_rate=4, speed_ratio=4, channel_ratio=8, @@ -37,17 +37,44 @@ pool1_stride_t=1, spatial_strides=(1, 2, 2, 1))), roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), shared_head=dict(type='ACRNHead', in_channels=4608, out_channels=2304), - bbox_head=dict(in_channels=2304))) + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2304, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) dataset_type = 'AVADataset' data_root = 'data/ava/rawframes' anno_root = 'data/ava/annotations' -proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' - 'recall_93.9.pkl') -proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' - ann_file_train = f'{anno_root}/ava_train_v2.1.csv' ann_file_val = f'{anno_root}/ava_val_v2.1.csv' @@ -56,9 +83,17 @@ label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' 
+ 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +file_client_args = dict( + io_backend='petrel', + path_mapping=dict({'data/ava': 's254:s3://openmmlab/datasets/action/ava'})) train_pipeline = [ dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='RandomRescale', scale_range=(256, 320)), dict(type='RandomCrop', size=256), dict(type='Flip', flip_ratio=0.5), @@ -69,7 +104,7 @@ val_pipeline = [ dict( type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='Resize', scale=(-1, 256)), dict(type='FormatShape', input_format='NCTHW', collapse=True), dict(type='PackActionInputs') diff --git a/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py b/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py index 02992c654a..4537d25cc7 100644 --- a/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py +++ b/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py @@ -1,5 +1,75 @@ -_base_ = [('slowfast-acrn_kinetics400-pretrained-r50' - '_8xb8-8x8x1-cosine-10e_ava21-rgb.py')] +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowFast', + pretrained=None, + resample_rate=4, + speed_ratio=4, + channel_ratio=8, + slow_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=True, + fusion_kernel=7, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + spatial_strides=(1, 2, 2, 1)), + fast_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1))), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + shared_head=dict(type='ACRNHead', in_channels=4608, out_channels=2304), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2304, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) dataset_type = 'AVADataset' data_root = 'data/ava/rawframes' @@ -17,9 +87,13 @@ 'recall_93.9.pkl') proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' +file_client_args = dict(io_backend='disk') +file_client_args = dict( + io_backend='petrel', + path_mapping=dict({'data/ava': 's254:s3://openmmlab/datasets/action/ava'})) 
train_pipeline = [ dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='RandomRescale', scale_range=(256, 320)), dict(type='RandomCrop', size=256), dict(type='Flip', flip_ratio=0.5), @@ -30,7 +104,7 @@ val_pipeline = [ dict( type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='Resize', scale=(-1, 256)), dict(type='FormatShape', input_format='NCTHW', collapse=True), dict(type='PackActionInputs') @@ -71,3 +145,30 @@ label_file=label_file, exclude_file=exclude_file_val) test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=10, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=2, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=8, + eta_min=0, + by_epoch=True, + begin=2, + end=10, + convert_to_iter_based=True) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) diff --git a/configs/detection/ava/README.md b/configs/detection/ava/README.md deleted file mode 100644 index 1f6354641b..0000000000 --- a/configs/detection/ava/README.md +++ /dev/null @@ -1,125 +0,0 @@ -# AVA - -[Ava: A video dataset of spatio-temporally localized atomic visual actions](https://openaccess.thecvf.com/content_cvpr_2018/html/Gu_AVA_A_Video_CVPR_2018_paper.html) - - - -
- -
- -## Abstract - - - -This paper introduces a video dataset of spatio-temporally localized Atomic Visual Actions (AVA). The AVA dataset densely annotates 80 atomic visual actions in 430 15-minute video clips, where actions are localized in space and time, resulting in 1.58M action labels with multiple labels per person occurring frequently. The key characteristics of our dataset are: (1) the definition of atomic visual actions, rather than composite actions; (2) precise spatio-temporal annotations with possibly multiple annotations for each person; (3) exhaustive annotation of these atomic actions over 15-minute video clips; (4) people temporally linked across consecutive segments; and (5) using movies to gather a varied set of action representations. This departs from existing datasets for spatio-temporal action recognition, which typically provide sparse annotations for composite actions in short video clips. We will release the dataset publicly. -AVA, with its realistic scene and action complexity, exposes the intrinsic difficulty of action recognition. To benchmark this, we present a novel approach for action localization that builds upon the current state-of-the-art methods, and demonstrates better performance on JHMDB and UCF101-24 categories. While setting a new state of the art on existing datasets, the overall results on AVA are low at 15.6% mAP, underscoring the need for developing new approaches for video understanding. - - - -
- -
- - - -```BibTeX -@inproceedings{feichtenhofer2019slowfast, - title={Slowfast networks for video recognition}, - author={Feichtenhofer, Christoph and Fan, Haoqi and Malik, Jitendra and He, Kaiming}, - booktitle={Proceedings of the IEEE international conference on computer vision}, - pages={6202--6211}, - year={2019} -} -``` - -## Results and Models - -### AVA2.1 - -| frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | gpu_mem(M) | config | ckpt | log | -| :---------------------: | :--------: | :--: | :----------------------------------: | :----------: | :---: | :--------: | :---------------------------------: | :-------------------------------: | :------------------------------: | -| 4x16x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-400 | 20.76 | 8503 | [config](/configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-953ef5fe.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) | -| 4x16x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 22.77 | 8503 | [config](/configs/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-b3b6d44e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) | -| 4x16x1 | raw | 8 | SlowOnly ResNet50 (NonLocalEmbedGauss) | Kinetics-400 | 21.49 | 11870 | [config](/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb_20220906-5ae3f91b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.log) | -| 8x8x1 | raw | 8 | SlowOnly ResNet50 (NonLocalEmbedGauss) | Kinetics-400 | 23.74 | 25375 | [config](/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb_20220906-9760eadb.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.log) | -| 8x8x1 | raw | 8 | SlowOnly ResNet101 | Kinetics-400 | 24.82 | 23477 | [config](/configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb_20220906-43f16877.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.log) | -| 4x16x1 | raw | 8 | SlowFast ResNet50 | Kinetics-400 | 24.27 | 18616 | [config](/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-5180ea3c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) | -| 4x16x1 | raw | 8 | SlowFast ResNet50 (with context) | Kinetics-400 | 25.25 | 18616 | [config](/configs/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb_20220906-5bb4f6f2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.log) | -| 8x8x1 | raw | 8 | SlowFast ResNet50 | Kinetics-400 | 25.73 | 13802 | [config](/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb_20220906-39133ec7.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.log) | - -### AVA2.2 - -| frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | gpu_mem(M) | config | ckpt | log | -| :---------------------: | :--------: | :--: | :----------------------------------: | :----------: | :---: | :--------: | :---------------------------------: | :-------------------------------: | :------------------------------: | -| 8x8x1 | raw | 8 | SlowFast ResNet50 | Kinetics-400 | 25.82 | 10484 | [config](/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-d934a48f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.log) | -| 8x8x1 | raw | 8 | SlowFast ResNet50 (temporal-max) | Kinetics-400 | 26.32 | 10484 | [config](/configs/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-13a9078e.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.log) | -| 8x8x1 | raw | 8 | SlowFast ResNet50 (temporal-max, focal loss) | Kinetics-400 | 26.58 | 10484 | [config](/configs/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-dd59e26f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.log) | - -Note: - -1. The **gpus** indicates the number of gpu we used to get the checkpoint. - According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU, - e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu. -2. **With context** indicates that using both RoI feature and global pooled feature for classification, which leads to around 1% mAP improvement in general. - -::: - -For more details on data preparation, you can refer to [AVA Data Preparation](/tools/data/ava/README.md). - -## Train - -You can use the following command to train a model. - -```shell -python tools/train.py ${CONFIG_FILE} [optional arguments] -``` - -Example: train the SlowOnly model on AVA in a deterministic option. - -```shell -python tools/train.py configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \ - --cfg-options randomness.seed=0 randomness.deterministic=True -``` - -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). - -## Test - -You can use the following command to test a model. - -```shell -python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] -``` - -Example: test the SlowOnly model on AVA and dump the result to a pkl file. - -```shell -python tools/test.py configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \ - checkpoints/SOME_CHECKPOINT.pth --dump result.pkl -``` - -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). 
- -## Citation - - - -```BibTeX -@inproceedings{gu2018ava, - title={Ava: A video dataset of spatio-temporally localized atomic visual actions}, - author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others}, - booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition}, - pages={6047--6056}, - year={2018} -} -``` - -```BibTeX -@article{duan2020omni, - title={Omni-sourced Webly-supervised Learning for Video Recognition}, - author={Duan, Haodong and Zhao, Yue and Xiong, Yuanjun and Liu, Wentao and Lin, Dahua}, - journal={arXiv preprint arXiv:2003.13042}, - year={2020} -} -``` diff --git a/configs/detection/ava/metafile.yml b/configs/detection/ava/metafile.yml deleted file mode 100644 index ec745ad5c4..0000000000 --- a/configs/detection/ava/metafile.yml +++ /dev/null @@ -1,227 +0,0 @@ -Collections: -- Name: AVA - README: configs/detection/ava/README.md - Paper: - URL: https://arxiv.org/abs/1705.08421 - Title: "AVA: A Video Dataset of Spatio-temporally Localized Atomic Visual Actions" - -Models: - - Name: slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb - Config: configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet50 - Batch Size: 16 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.1 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.1 - Task: Action Detection - Metrics: - mAP: 20.76 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-953ef5fe.pth - - - Name: slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb - Config: configs/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet50 - Batch Size: 16 - Epochs: 20 - Pretrained: Kinetics-700 - Resolution: short-side 320 - Training Data: AVA v2.1 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.1 - Task: Action Detection - Metrics: - mAP: 22.77 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-b3b6d44e.pth - - - Name: slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb - Config: configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet50 - Batch Size: 16 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.1 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.1 - Task: Action Detection - Metrics: - mAP: 21.49 - Training Log: 
https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb_20220906-5ae3f91b.pth - - - Name: slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb - Config: configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet50 - Batch Size: 16 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.1 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.1 - Task: Action Detection - Metrics: - mAP: 23.47 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb_20220906-9760eadb.pth - - - Name: slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb - Config: configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet101 - Batch Size: 16 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.1 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.1 - Task: Action Detection - Metrics: - mAP: 24.82 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb_20220906-43f16877.pth - - - Name: slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb - Config: configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet50 - Batch Size: 16 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.1 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.1 - Task: Action Detection - Metrics: - mAP: 24.27 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-5180ea3c.pth - - - Name: slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb - Config: configs/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet50 - Batch Size: 16 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.1 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.1 - 
Task: Action Detection - Metrics: - mAP: 25.25 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb_20220906-5bb4f6f2.pth - - - Name: slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb - Config: configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet50 - Batch Size: 8 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.1 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.1 - Task: Action Detection - Metrics: - mAP: 25.73 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb_20220906-39133ec7.pth - - - Name: slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb - Config: configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet50 - Batch Size: 6 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.2 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.2 - Task: Action Detection - Metrics: - mAP: 25.98 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-d934a48f.pth - - - Name: slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb - Config: configs/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py - In Collection: AVA - Metadata: - Architecture: ResNet50 - Batch Size: 6 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.2 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.2 - Task: Action Detection - Metrics: - mAP: 26.38 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-13a9078e.pth - - - Name: slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb - Config: configs/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py - In Collection: AVA - 
Metadata: - Architecture: ResNet50 - Batch Size: 6 - Epochs: 20 - Pretrained: Kinetics-400 - Resolution: short-side 320 - Training Data: AVA v2.2 - Training Resources: 8 GPUs - Modality: RGB - Results: - - Dataset: AVA v2.2 - Task: Action Detection - Metrics: - mAP: 26.59 - Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-dd59e26f.pth diff --git a/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py b/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py deleted file mode 100644 index 97e0197a6e..0000000000 --- a/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py +++ /dev/null @@ -1,50 +0,0 @@ -_base_ = ['slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py'] - -model = dict( - backbone=dict( - resample_rate=4, - speed_ratio=4, - slow_pathway=dict(fusion_kernel=7), - pretrained=( - 'https://download.openmmlab.com/mmaction/recognition/slowfast/' - 'slowfast_r50_8x8x1_256e_kinetics400_rgb/' - 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth'))) - -dataset_type = 'AVADataset' -data_root = 'data/ava/rawframes' -anno_root = 'data/ava/annotations' - -ann_file_train = f'{anno_root}/ava_train_v2.1.csv' -exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' -label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' - -proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' 
- 'recall_93.9.pkl') - -train_pipeline = [ - dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), - dict(type='RawFrameDecode'), - dict(type='RandomRescale', scale_range=(256, 320)), - dict(type='RandomCrop', size=256), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW', collapse=True), - dict(type='PackActionInputs') -] - -train_dataloader = dict( - batch_size=8, - num_workers=8, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type=dataset_type, - ann_file=ann_file_train, - exclude_file=exclude_file_train, - pipeline=train_pipeline, - label_file=label_file, - proposal_file=proposal_file_train, - data_prefix=dict(img=data_root))) - -optim_wrapper = dict( - optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00001), - clip_grad=dict(max_norm=40, norm_type=2)) diff --git a/configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py b/configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py deleted file mode 100644 index 815e61c2fc..0000000000 --- a/configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py +++ /dev/null @@ -1,72 +0,0 @@ -_base_ = ['slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py'] - -model = dict( - backbone=dict( - depth=101, - pretrained=( - 'https://download.openmmlab.com/mmaction/recognition/slowonly/' - 'omni/slowonly_r101_without_omni_8x8x1_kinetics400_rgb_' - '20200926-0c730aef.pth'))) - -dataset_type = 'AVADataset' -data_root = 'data/ava/rawframes' -anno_root = 'data/ava/annotations' - -ann_file_train = f'{anno_root}/ava_train_v2.1.csv' -ann_file_val = f'{anno_root}/ava_val_v2.1.csv' - -exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' -exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' - -label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' - -proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' - 'recall_93.9.pkl') -proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' - -train_pipeline = [ - dict(type='SampleAVAFrames', clip_len=8, frame_interval=8), - dict(type='RawFrameDecode'), - dict(type='RandomRescale', scale_range=(256, 320)), - dict(type='RandomCrop', size=256), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW', collapse=True), - dict(type='PackActionInputs') -] -# The testing is w/o. 
any cropping / flipping -val_pipeline = [ - dict(type='SampleAVAFrames', clip_len=8, frame_interval=8, test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='FormatShape', input_format='NCTHW', collapse=True), - dict(type='PackActionInputs') -] - -train_dataloader = dict( - batch_size=16, - num_workers=8, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type=dataset_type, - ann_file=ann_file_train, - exclude_file=exclude_file_train, - pipeline=train_pipeline, - label_file=label_file, - proposal_file=proposal_file_train, - data_prefix=dict(img=data_root))) -val_dataloader = dict( - batch_size=1, - num_workers=8, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - ann_file=ann_file_val, - exclude_file=exclude_file_val, - pipeline=val_pipeline, - label_file=label_file, - proposal_file=proposal_file_val, - data_prefix=dict(img=data_root), - test_mode=True)) -test_dataloader = val_dataloader diff --git a/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py deleted file mode 100644 index 43b0fa1a28..0000000000 --- a/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py +++ /dev/null @@ -1,16 +0,0 @@ -_base_ = ['slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py'] - -model = dict( - backbone=dict( - pretrained=( - 'https://download.openmmlab.com/mmaction/recognition/slowonly/' - 'slowonly_nl_embedded_gaussian_r50_4x16x1_150e_kinetics400_rgb/' - 'slowonly_nl_embedded_gaussian_r50_4x16x1_150e_kinetics400_rgb_' - '20210308-0d6e5a69.pth'), - norm_cfg=dict(type='BN3d', requires_grad=True), - non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)), - non_local_cfg=dict( - sub_sample=True, - use_scale=True, - norm_cfg=dict(type='BN3d', requires_grad=True), - mode='embedded_gaussian'))) diff --git a/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py b/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py deleted file mode 100644 index a962f10c11..0000000000 --- a/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py +++ /dev/null @@ -1,74 +0,0 @@ -_base_ = [ - 'slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py' -] - -model = dict( - backbone=dict( - pretrained=( - 'https://download.openmmlab.com/mmaction/recognition/slowonly/' - 'slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb/' - 'slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb_' - '20210308-e8dd9e82.pth'))) - -dataset_type = 'AVADataset' -data_root = 'data/ava/rawframes' -anno_root = 'data/ava/annotations' - -ann_file_train = f'{anno_root}/ava_train_v2.1.csv' -ann_file_val = f'{anno_root}/ava_val_v2.1.csv' - -exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' -exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' - -label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' - -proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' 
- 'recall_93.9.pkl') -proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' - -train_pipeline = [ - dict(type='SampleAVAFrames', clip_len=8, frame_interval=8), - dict(type='RawFrameDecode'), - dict(type='RandomRescale', scale_range=(256, 320)), - dict(type='RandomCrop', size=256), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCTHW', collapse=True), - dict(type='PackActionInputs') -] -# The testing is w/o. any cropping / flipping -val_pipeline = [ - dict(type='SampleAVAFrames', clip_len=8, frame_interval=8, test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='FormatShape', input_format='NCTHW', collapse=True), - dict(type='PackActionInputs') -] - -train_dataloader = dict( - batch_size=16, - num_workers=8, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), - dataset=dict( - type=dataset_type, - ann_file=ann_file_train, - exclude_file=exclude_file_train, - pipeline=train_pipeline, - label_file=label_file, - proposal_file=proposal_file_train, - data_prefix=dict(img=data_root))) -val_dataloader = dict( - batch_size=1, - num_workers=8, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - ann_file=ann_file_val, - exclude_file=exclude_file_val, - pipeline=val_pipeline, - label_file=label_file, - proposal_file=proposal_file_val, - data_prefix=dict(img=data_root), - test_mode=True)) -test_dataloader = val_dataloader diff --git a/configs/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py deleted file mode 100644 index c9e10def96..0000000000 --- a/configs/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = ['slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py'] - -model = dict( - backbone=dict( - pretrained=( - 'https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly' - '/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' - 'kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-' - 'steplr-150e_kinetics700-rgb_20220901-f73b3e89.pth'))) diff --git a/configs/detection/ava_kinetics/README.md b/configs/detection/ava_kinetics/README.md deleted file mode 100644 index 59ec345c43..0000000000 --- a/configs/detection/ava_kinetics/README.md +++ /dev/null @@ -1,103 +0,0 @@ -# AVA - -[The AVA-Kinetics Localized Human Actions Video Dataset](https://arxiv.org/abs/2005.00214) - - - -
- -
- -## Abstract - - - -This paper describes the AVA-Kinetics localized human actions video dataset. The dataset is collected by annotating videos from the Kinetics-700 dataset using the AVA annotation protocol, and extending the original AVA dataset with these new AVA annotated Kinetics clips. The dataset contains over 230k clips annotated with the 80 AVA action classes for each of the humans in key-frames. We describe the annotation process and provide statistics about the new dataset. We also include a baseline evaluation using the Video Action Transformer Network on the AVA-Kinetics dataset, demonstrating improved performance for action classification on the AVA test set. - -```BibTeX -@article{li2020ava, - title={The ava-kinetics localized human actions video dataset}, - author={Li, Ang and Thotakuri, Meghana and Ross, David A and Carreira, Jo{\~a}o and Vostrikov, Alexander and Zisserman, Andrew}, - journal={arXiv preprint arXiv:2005.00214}, - year={2020} -} -``` - -## Results and Models - -### AVA2.2 - -Currently, we only use the training set of AVA-Kinetics and evaluate on the AVA2.2 validation dataset. The AVA-Kinetics validation dataset will be supported soon. - -| frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | config | ckpt | log | -| :---------------------: | :--------: | :--: | :---------------: | :----------: | :---: | :------------------------------------------: | :-----------------------------------------: | :----------------------------------------: | -| 4x16x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-400 | 24.53 | [config](/configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb_20221205-33e3ca7c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.log) | -| 4x16x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 25.87 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb_20221205-a07e8c15.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.log) | -| 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-400 | 26.10 | [config](/configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-8f8dff3b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | -| 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 27.82 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-16a01c37.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | - -### Training with tricks - -We conduct ablation studies to show the improvements of training tricks using SlowOnly8x8 pretrained on the Kinetics700 dataset. The baseline is the last raw in [AVA2.2](https://github.com/hukkai/mmaction2/tree/ava-kinetics-exp/configs/detection/ava_kinetics#ava22). - -| method | frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | config | ckpt | log | -| :--------------------: | :---------------------: | :--------: | :--: | :---------------: | :----------: | :---: | :-----------------------------------: | :---------------------------------: | :---------------------------------: | -| baseline | 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 27.82 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-16a01c37.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | -| + context | 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 28.31 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-5d514f8c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | -| + temporal max pooling | 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 28.48 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-5b5e71eb.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | -| + nonlinear head | 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 29.83 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-87624265.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | -| + focal loss | 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 30.33 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb_20221205-37aa8395.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.log) | -| + more frames | 16x4x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 31.29 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb_20221205-dd652f81.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.log) | - -Note: - -The **gpus** indicates the number of gpu we used to get the checkpoint; **+ context** indicates that using both RoI feature and global pooled feature for classification; **+ temporal max pooling** indicates that using max pooling in the temporal dimension for the feature; **nonlinear head** indicates that using a 2-layer mlp instead of a linear classifier. - -For more details on data preparation, you can refer to [AVA-Kinetics Data Preparation](/tools/data/ava_kinetics/README.md). - -## Train - -You can use the following command to train a model. - -```shell -python tools/train.py ${CONFIG_FILE} [optional arguments] -``` - -Example: train the SlowOnly model on AVA-Kinetics in a deterministic option. - -```shell -python tools/train.py configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py \ - --cfg-options randomness.seed=0 randomness.deterministic=True -``` - -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). - -## Test - -You can use the following command to test a model. - -```shell -python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] -``` - -Example: test the SlowOnly model on AVA-Kinetics and dump the result to a pkl file. - -```shell -python tools/test.py configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py \ - checkpoints/SOME_CHECKPOINT.pth --dump result.pkl -``` - -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). 
- -## Citation - - - -```BibTeX -@article{li2020ava, - title={The ava-kinetics localized human actions video dataset}, - author={Li, Ang and Thotakuri, Meghana and Ross, David A and Carreira, Jo{\~a}o and Vostrikov, Alexander and Zisserman, Andrew}, - journal={arXiv preprint arXiv:2005.00214}, - year={2020} -} -``` diff --git a/configs/detection/slowfast/README.md b/configs/detection/slowfast/README.md new file mode 100644 index 0000000000..bae71fd040 --- /dev/null +++ b/configs/detection/slowfast/README.md @@ -0,0 +1,96 @@ +# SlowFast + +[Slowfast networks for video recognition](https://openaccess.thecvf.com/content_ICCV_2019/html/Feichtenhofer_SlowFast_Networks_for_Video_Recognition_ICCV_2019_paper.html) + + + +## Abstract + + + +We present SlowFast networks for video recognition. Our model involves (i) a Slow pathway, operating at low frame rate, to capture spatial semantics, and (ii) a Fast pathway, operating at high frame rate, to capture motion at fine temporal resolution. The Fast pathway can be made very lightweight by reducing its channel capacity, yet can learn useful temporal information for video recognition. Our models achieve strong performance for both action classification and detection in video, and large improvements are pin-pointed as contributions by our SlowFast concept. We report state-of-the-art accuracy on major video recognition benchmarks, Kinetics, Charades and AVA. + + + +
+ +
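As a rough aside (not part of this patch), the slow/fast split described in the abstract is what the `resample_rate`, `speed_ratio` and `channel_ratio` fields in the SlowFast configs below encode. A minimal sketch of the arithmetic, assuming the usual interpretation of these fields in `mmaction.ResNet3dSlowFast` and using the values from the 8x8x1 AVA config in this patch:

```python
# Sketch only: how the SlowFast sampling parameters are commonly interpreted.
# Values mirror the 8x8x1 AVA config in this patch; the authoritative frame
# striding lives inside mmaction.ResNet3dSlowFast, so treat this as an
# illustration rather than the library's exact behaviour.
clip_len = 32        # frames sampled by SampleAVAFrames (clip_len=32)
resample_rate = 4    # slow pathway keeps 1 of every 4 frames
speed_ratio = 4      # fast pathway runs 4x faster than the slow pathway
channel_ratio = 8    # fast pathway uses 1/8 of the slow pathway's channels

slow_frames = clip_len // resample_rate                   # 8 frames
fast_frames = clip_len // (resample_rate // speed_ratio)  # 32 frames
print(slow_frames, fast_frames)  # -> 8 32
```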
+ +## Results and Models + +### AVA2.1 + +| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log | +| :---------------------: | :--: | :------------------------------: | :----------: | :---: | :-----------------------------------------: | :---------------------------------------: | :--------------------------------------: | +| 4x16x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 24.32 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-5180ea3c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) | +| 4x16x1 | 8 | SlowFast ResNet50 (with context) | Kinetics-400 | 25.34 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb_20220906-5bb4f6f2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.log) | +| 8x8x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 25.80 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb_20220906-39133ec7.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.log) | + +### AVA2.2 + +| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log | +| :---------------------: | :--: | :---------------------------------------: | :----------: | :---: | :--------------------------------------: | :------------------------------------: | :-----------------------------------: | +| 8x8x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 25.90 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-d934a48f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.log) | +| 8x8x1 | 8 | SlowFast ResNet50 (temporal-max) | Kinetics-400 | 26.41 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-13a9078e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.log) | +| 8x8x1 | 8 | SlowFast ResNet50 (temporal-max, focal loss) | Kinetics-400 | 26.65 | [config](/configs/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-dd59e26f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.log) | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. +2. **with context** indicates that using both RoI feature and global pooled feature for classification; **temporal-max** indicates that using max pooling in the temporal dimension for the feature. + +For more details on data preparation, you can refer to [AVA](/tools/data/ava/README.md). + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train the SlowFast model on AVA2.1 in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \ + --seed 0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test the SlowFast model on AVA2.1 and dump the result to a pkl file. + +```shell +python tools/test.py configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). 
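If you want to sanity-check the dumped predictions from the test command above, the pickle can be loaded directly with MMEngine. A minimal sketch, assuming the dump is a list with one prediction entry per sample (the exact item structure depends on the MMAction2 version):

```python
# Sketch: peek at the file produced by `--dump result.pkl` above.
# Assumes a list-like dump with one entry per test sample; the fields of each
# entry are version-dependent, so inspect before relying on them.
import mmengine

results = mmengine.load('result.pkl')
print(type(results), len(results))  # e.g. <class 'list'> and the number of samples
print(results[0])                   # inspect the first prediction entry
```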
+ +## Citation + +```BibTeX +@inproceedings{feichtenhofer2019slowfast, + title={Slowfast networks for video recognition}, + author={Feichtenhofer, Christoph and Fan, Haoqi and Malik, Jitendra and He, Kaiming}, + booktitle={ICCV}, + pages={6202--6211}, + year={2019} +} +``` + +```BibTeX +@inproceedings{gu2018ava, + title={Ava: A video dataset of spatio-temporally localized atomic visual actions}, + author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others}, + booktitle={CVPR}, + pages={6047--6056}, + year={2018} +} +``` diff --git a/configs/detection/slowfast/metafile.yml b/configs/detection/slowfast/metafile.yml new file mode 100644 index 0000000000..2ab6c44a45 --- /dev/null +++ b/configs/detection/slowfast/metafile.yml @@ -0,0 +1,121 @@ +Collections: + - Name: SlowFast + README: configs/detection/slowfast/README.md + Paper: + URL: https://arxiv.org/abs/1812.03982 + Title: 'SlowFast Networks for Video Recognition' + +Models: + - Name: slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb + Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 24.32 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-5180ea3c.pth + + - Name: slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb + Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 25.34 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb_20220906-5bb4f6f2.pth + + - Name: slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb + Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 8 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 25.80 + Training Log: 
https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb_20220906-39133ec7.pth + + - Name: slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb + Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 6 + Epochs: 10 + Pretrained: Kinetics-400 + Training Data: AVA v2.2 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.2 + Task: Action Detection + Metrics: + mAP: 25.90 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-d934a48f.pth + + - Name: slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb + Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 6 + Epochs: 10 + Pretrained: Kinetics-400 + Training Data: AVA v2.2 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.2 + Task: Action Detection + Metrics: + mAP: 26.41 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-13a9078e.pth + + - Name: slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb + Config: configs/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py + In Collection: SlowFast + Metadata: + Architecture: ResNet50 + Batch Size: 6 + Epochs: 10 + Pretrained: Kinetics-400 + Training Data: AVA v2.2 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.2 + Task: Action Detection + Metrics: + mAP: 26.65 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-dd59e26f.pth diff --git a/configs/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py 
similarity index 100% rename from configs/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py rename to configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py diff --git a/configs/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py similarity index 100% rename from configs/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py rename to configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py diff --git a/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py similarity index 52% rename from configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py rename to configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py index 8b5550aec0..0eb0e501e3 100644 --- a/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py +++ b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py @@ -1,14 +1,16 @@ -_base_ = ['slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py'] +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/' + 'slowfast_r50_4x16x1_256e_kinetics400_rgb/' + 'slowfast_r50_4x16x1_256e_kinetics400_rgb_20200704-bcde7ed7.pth') model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), backbone=dict( - _delete_=True, - type='ResNet3dSlowFast', - _scope_='mmaction', - pretrained=( - 'https://download.openmmlab.com/mmaction/recognition/slowfast/' - 'slowfast_r50_4x16x1_256e_kinetics400_rgb/' - 'slowfast_r50_4x16x1_256e_kinetics400_rgb_20200704-bcde7ed7.pth'), + type='mmaction.ResNet3dSlowFast', + pretrained=None, resample_rate=8, speed_ratio=8, channel_ratio=8, @@ -33,7 +35,39 @@ conv1_stride_t=1, pool1_stride_t=1, spatial_strides=(1, 2, 2, 1))), - roi_head=dict(bbox_head=dict(in_channels=2304))) + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2304, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) dataset_type = 'AVADataset' data_root = 'data/ava/rawframes' @@ -51,9 +85,10 @@ 'recall_93.9.pkl') proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' +file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='RandomRescale', scale_range=(256, 320)), dict(type='RandomCrop', size=256), 
dict(type='Flip', flip_ratio=0.5), @@ -65,7 +100,7 @@ val_pipeline = [ dict( type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='Resize', scale=(-1, 256)), dict(type='FormatShape', input_format='NCTHW', collapse=True), dict(type='PackActionInputs') @@ -99,3 +134,36 @@ data_prefix=dict(img=data_root), test_mode=True)) test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py similarity index 52% rename from configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py rename to configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py index a7f4c09ed1..debeb5c7fd 100644 --- a/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py +++ b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py @@ -1,11 +1,74 @@ -_base_ = ['slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py'] +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth') model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), backbone=dict( - pretrained=( - 'https://download.openmmlab.com/mmaction/recognition/slowfast/' - 'slowfast_r50_8x8x1_256e_kinetics400_rgb/' - 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth'))) + type='mmaction.ResNet3dSlowFast', + resample_rate=4, + speed_ratio=4, + channel_ratio=8, + pretrained=None, + slow_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=True, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + spatial_strides=(1, 2, 2, 1), + fusion_kernel=7), + fast_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1))), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2304, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + 
data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) dataset_type = 'AVADataset' data_root = 'data/ava/rawframes' @@ -23,9 +86,10 @@ 'recall_93.9.pkl') proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' +file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='RandomRescale', scale_range=(256, 320)), dict(type='RandomCrop', size=256), dict(type='Flip', flip_ratio=0.5), @@ -36,7 +100,7 @@ val_pipeline = [ dict( type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='Resize', scale=(-1, 256)), dict(type='FormatShape', input_format='NCTHW', collapse=True), dict(type='PackActionInputs') @@ -80,6 +144,8 @@ train_cfg = dict( type='EpochBasedTrainLoop', max_epochs=10, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') param_scheduler = [ dict( @@ -102,3 +168,9 @@ optim_wrapper = dict( optimizer=dict(type='SGD', lr=0.075, momentum=0.9, weight_decay=0.00001), clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (6 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=48) diff --git a/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py new file mode 100644 index 0000000000..1e94a10960 --- /dev/null +++ b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py @@ -0,0 +1,171 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb/' + 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowFast', + resample_rate=4, + speed_ratio=4, + channel_ratio=8, + pretrained=None, + slow_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=True, + conv1_kernel=(1, 7, 7), + dilations=(1, 1, 1, 1), + conv1_stride_t=1, + pool1_stride_t=1, + inflate=(0, 0, 1, 1), + spatial_strides=(1, 2, 2, 1), + fusion_kernel=7), + fast_pathway=dict( + type='resnet3d', + depth=50, + pretrained=None, + lateral=False, + base_channels=8, + conv1_kernel=(5, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1))), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2304, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.1.csv' +ann_file_val = f'{anno_root}/ava_val_v2.1.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=32, frame_interval=2), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) + +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=64) diff --git a/configs/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py b/configs/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py similarity index 100% rename from configs/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py rename to configs/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py diff --git a/configs/detection/slowonly/README.md b/configs/detection/slowonly/README.md new file mode 100644 index 0000000000..e8af3d84ea --- /dev/null +++ b/configs/detection/slowonly/README.md @@ -0,0 +1,126 @@ +# SlowOnly + +[Slowfast networks for video recognition](https://openaccess.thecvf.com/content_ICCV_2019/html/Feichtenhofer_SlowFast_Networks_for_Video_Recognition_ICCV_2019_paper.html) + + + +## Abstract + + + +We present SlowFast networks for video recognition. Our model involves (i) a Slow pathway, operating at low frame rate, to capture spatial semantics, and (ii) a Fast pathway, operating at high frame rate, to capture motion at fine temporal resolution. The Fast pathway can be made very lightweight by reducing its channel capacity, yet can learn useful temporal information for video recognition. Our models achieve strong performance for both action classification and detection in video, and large improvements are pin-pointed as contributions by our SlowFast concept. 
We report state-of-the-art accuracy on major video recognition benchmarks, Kinetics, Charades and AVA. + + + +
+ +
+ +## Results and Models + +### AVA2.1 + +| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log | +| :---------------------: | :--: | :------------------------------------: | :----------: | :---: | :---------------------------------------: | :-------------------------------------: | :------------------------------------: | +| 4x16x1 | 8 | SlowOnly ResNet50 | Kinetics-400 | 20.72 | [config](/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-953ef5fe.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) | +| 4x16x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 22.77 | [config](/configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-b3b6d44e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) | +| 4x16x1 | 8 | SlowOnly ResNet50 (NonLocalEmbedGauss) | Kinetics-400 | 21.55 | [config](/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb_20220906-5ae3f91b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.log) | +| 8x8x1 | 8 | SlowOnly ResNet50 (NonLocalEmbedGauss) | Kinetics-400 | 23.77 | [config](/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb_20220906-9760eadb.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.log) | +| 8x8x1 | 8 | SlowOnly ResNet101 | Kinetics-400 | 24.83 | [config](/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb_20220906-43f16877.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.log) | + +### AVA2.2 (Trained on AVA-Kinetics) + +Currently, we only use the training set of AVA-Kinetics and evaluate on the AVA2.2 validation dataset. 
The AVA-Kinetics validation dataset will be supported soon. + +| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log | +| :---------------------: | :--: | :---------------: | :----------: | :---: | :----------------------------------------------: | :--------------------------------------------: | :-------------------------------------------: | +| 4x16x1 | 8 | SlowOnly ResNet50 | Kinetics-400 | 24.53 | [config](/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb_20221205-33e3ca7c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.log) | +| 4x16x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 25.87 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb_20221205-a07e8c15.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.log) | +| 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-400 | 26.10 | [config](/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-8f8dff3b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | +| 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 27.82 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-16a01c37.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | + +### AVA2.2 (Trained on AVA-Kinetics with tricks) + +We conduct ablation studies to show the improvements of training tricks using SlowOnly8x8 pretrained on the Kinetics700 dataset. The baseline is the last row in **AVA2.2 (Trained on AVA-Kinetics)**. 
+ +| method | frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log | +| :--------------------: | :---------------------: | :--: | :---------------: | :----------: | :---: | :--------------------------------------: | :-------------------------------------: | :------------------------------------: | +| baseline | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 27.82 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-16a01c37.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | +| + context | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 28.31 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-5d514f8c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | +| + temporal max pooling | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 28.48 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-5b5e71eb.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | +| + nonlinear head | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 29.83 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-87624265.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.log) | +| + focal loss | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 30.33 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb_20221205-37aa8395.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.log) | +| + more frames | 16x4x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 31.29 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb_20221205-dd652f81.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.log) | + +1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. +2. **+ context** indicates that using both RoI feature and global pooled feature for classification; **+ temporal max pooling** indicates that using max pooling in the temporal dimension for the feature; **nonlinear head** indicates that using a 2-layer mlp instead of a linear classifier. + +For more details on data preparation, you can refer to + +- [AVA](/tools/data/ava/README.md) +- [AVA-Kinetics](/tools/data/ava_kinetics/README.md) + +## Train + +You can use the following command to train a model. + +```shell +python tools/train.py ${CONFIG_FILE} [optional arguments] +``` + +Example: train the SlowOnly model on AVA2.1 in a deterministic option with periodic validation. + +```shell +python tools/train.py configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \ + --seed 0 --deterministic +``` + +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). + +## Test + +You can use the following command to test a model. + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments] +``` + +Example: test the SlowOnly model on AVA2.1 and dump the result to a pkl file. + +```shell +python tools/test.py configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \ + checkpoints/SOME_CHECKPOINT.pth --dump result.pkl +``` + +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). 
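As a side note on the tricks listed above, the corresponding config-level switches appear verbatim in the `slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks` override further down in this patch. The fragment below is only an illustrative mapping of trick to field, not a standalone config:

```python
# Illustrative mapping (sketch, not a usable config on its own) between the
# training tricks in the table above and the model overrides used by the
# tricks config later in this patch.
model = dict(
    roi_head=dict(
        bbox_roi_extractor=dict(
            with_global=True,           # "+ context": also use the global pooled feature
            temporal_pool_mode='max'),  # "+ temporal max pooling"
        bbox_head=dict(
            in_channels=4096,           # 2048 RoI + 2048 global features (assumed split)
            mlp_head=True,              # "+ nonlinear head": 2-layer MLP classifier
            focal_gamma=1.0)))          # "+ focal loss"
```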
+ +## Citation + +```BibTeX +@inproceedings{feichtenhofer2019slowfast, + title={Slowfast networks for video recognition}, + author={Feichtenhofer, Christoph and Fan, Haoqi and Malik, Jitendra and He, Kaiming}, + booktitle={ICCV}, + pages={6202--6211}, + year={2019} +} +``` + +```BibTeX +@inproceedings{gu2018ava, + title={Ava: A video dataset of spatio-temporally localized atomic visual actions}, + author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others}, + booktitle={CVPR}, + pages={6047--6056}, + year={2018} +} +``` + +```BibTeX +@article{li2020ava, + title={The ava-kinetics localized human actions video dataset}, + author={Li, Ang and Thotakuri, Meghana and Ross, David A and Carreira, Jo{\~a}o and Vostrikov, Alexander and Zisserman, Andrew}, + journal={arXiv preprint arXiv:2005.00214}, + year={2020} +} +``` diff --git a/configs/detection/slowonly/metafile.yml b/configs/detection/slowonly/metafile.yml new file mode 100644 index 0000000000..11ca749351 --- /dev/null +++ b/configs/detection/slowonly/metafile.yml @@ -0,0 +1,102 @@ +Collections: + - Name: SlowOnly + README: configs/detection/slowonly/README.md + Paper: + URL: https://arxiv.org/abs/1812.03982 + Title: 'SlowFast Networks for Video Recognition' + +Models: + - Name: slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb + Config: configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 20.72 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-953ef5fe.pth + + - Name: slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb + Config: configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-700 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 22.77 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-b3b6d44e.pth + + - Name: slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb + Config: configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - 
Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 21.55 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb_20220906-5ae3f91b.pth + + - Name: slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb + Config: configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet50 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 23.77 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb_20220906-9760eadb.pth + + - Name: slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb + Config: configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py + In Collection: SlowOnly + Metadata: + Architecture: ResNet101 + Batch Size: 16 + Epochs: 20 + Pretrained: Kinetics-400 + Training Data: AVA v2.1 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: AVA v2.1 + Task: Action Detection + Metrics: + mAP: 24.83 + Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb_20220906-43f16877.pth diff --git a/configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py similarity index 65% rename from configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py rename to configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py index 7407ec6978..fd44f336ac 100644 --- a/configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py +++ b/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py @@ -1,6 +1,58 @@ -_base_ = [ - '../../_base_/default_runtime.py', '../_base_/models/slowonly_r50.py' -] +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-' + 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' + 'kinetics400-rgb_20220901-e7b65fad.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + 
lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) dataset_type = 'AVAKineticsDataset' data_root = 'data/ava_kinetics/rawframes' @@ -18,14 +70,7 @@ 'recall_93.9.pkl') proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict({ -# 'data/ava_kinetics/rawframes/': -# 's3://openmmlab/datasets/action/ava/rawframes/' -# })) file_client_args = dict(io_backend='disk') - train_pipeline = [ dict(type='SampleAVAFrames', clip_len=4, frame_interval=16), dict(type='RawFrameDecode', **file_client_args), diff --git a/configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py similarity index 70% rename from configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py rename to configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py index eb393d3a8c..4af750e8ad 100644 --- a/configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py +++ b/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py @@ -1,13 +1,58 @@ -_base_ = [ - '../../_base_/default_runtime.py', '../_base_/models/slowonly_r50.py' -] +_base_ = '../../_base_/default_runtime.py' url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' 'slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics400-' 'rgb/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_' 'kinetics400-rgb_20220901-df42dc84.pth') -model = dict(init_cfg=dict(type='Pretrained', checkpoint=url)) +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + 
pos_weight=1.0)), + test_cfg=dict(rcnn=None)) dataset_type = 'AVAKineticsDataset' data_root = 'data/ava_kinetics/rawframes' @@ -25,14 +70,7 @@ 'recall_93.9.pkl') proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict({ -# 'data/ava_kinetics/rawframes/': -# 's3://openmmlab/datasets/action/ava/rawframes/' -# })) file_client_args = dict(io_backend='disk') - train_pipeline = [ dict(type='SampleAVAFrames', clip_len=8, frame_interval=8), dict(type='RawFrameDecode', **file_client_args), diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py similarity index 100% rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py rename to configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py similarity index 100% rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py rename to configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py similarity index 100% rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py rename to configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py similarity index 100% rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py rename to configs/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py similarity index 85% rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py rename to configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py index 4d4a3dea6b..a757f731a4 100644 --- a/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py +++ b/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py @@ -1,14 +1,6 @@ -_base_ = [ - '../../_base_/default_runtime.py', '../_base_/models/slowonly_r50.py' -] - -url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' - 'slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-' - 
'rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_' - 'kinetics700-rgb_20221013-15b93b10.pth') +_base_ = ['slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py'] model = dict( - init_cfg=dict(type='Pretrained', checkpoint=url), roi_head=dict( bbox_roi_extractor=dict(with_global=True, temporal_pool_mode='max'), bbox_head=dict(in_channels=4096, mlp_head=True, focal_gamma=1.0))) @@ -29,14 +21,7 @@ 'recall_93.9.pkl') proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' -# file_client_args = dict( -# io_backend='petrel', -# path_mapping=dict({ -# 'data/ava_kinetics/rawframes/': -# 's3://openmmlab/datasets/action/ava/rawframes/' -# })) file_client_args = dict(io_backend='disk') - train_pipeline = [ dict(type='SampleAVAFrames', clip_len=16, frame_interval=4), dict(type='RawFrameDecode', **file_client_args), diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py similarity index 100% rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py rename to configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py similarity index 100% rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py rename to configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py diff --git a/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py new file mode 100644 index 0000000000..9bee13a25c --- /dev/null +++ b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py @@ -0,0 +1,151 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowonly/' + 'omni/slowonly_r101_without_omni_8x8x1_kinetics400_rgb_' + '20200926-0c730aef.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=101, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.1.csv' +ann_file_val = f'{anno_root}/ava_val_v2.1.csv' + 
+exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=8, frame_interval=8), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. any cropping / flipping +val_pipeline = [ + dict(type='SampleAVAFrames', clip_len=8, frame_interval=8, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py new file mode 100644 index 0000000000..cdc8ea8d98 --- /dev/null +++ b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py @@ -0,0 +1,160 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowonly/' + 'slowonly_nl_embedded_gaussian_r50_4x16x1_150e_kinetics400_rgb/' + 'slowonly_nl_embedded_gaussian_r50_4x16x1_150e_kinetics400_rgb_' + '20210308-0d6e5a69.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1), + norm_cfg=dict(type='BN3d', requires_grad=True), + non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)), + non_local_cfg=dict( + sub_sample=True, + use_scale=True, + norm_cfg=dict(type='BN3d', requires_grad=True), + mode='embedded_gaussian')), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.1.csv' +ann_file_val = f'{anno_root}/ava_val_v2.1.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=4, frame_interval=16), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py new file mode 100644 index 0000000000..9b6dd00fdb --- /dev/null +++ b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py @@ -0,0 +1,159 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/recognition/slowonly/' + 'slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb/' + 'slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb_' + '20210308-e8dd9e82.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1), + norm_cfg=dict(type='BN3d', requires_grad=True), + non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)), + non_local_cfg=dict( + sub_sample=True, + use_scale=True, + norm_cfg=dict(type='BN3d', requires_grad=True), + mode='embedded_gaussian')), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.1.csv' +ann_file_val = f'{anno_root}/ava_val_v2.1.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=8, frame_interval=8), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict(type='SampleAVAFrames', clip_len=8, frame_interval=8, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py similarity index 56% rename from configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py rename to configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py index ec107941b3..a83408c84a 100644 --- a/configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py +++ b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py @@ -1,6 +1,58 @@ -_base_ = [ - '../../_base_/default_runtime.py', '../_base_/models/slowonly_r50.py' -] +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-' + 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' + 'kinetics400-rgb_20220901-e7b65fad.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) dataset_type = 'AVADataset' data_root = 'data/ava/rawframes' @@ -18,9 +70,10 @@ 'recall_93.9.pkl') proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' +file_client_args = dict(io_backend='disk') train_pipeline = [ dict(type='SampleAVAFrames', clip_len=4, frame_interval=16), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='RandomRescale', scale_range=(256, 320)), dict(type='RandomCrop', size=256), dict(type='Flip', flip_ratio=0.5), @@ -31,7 +84,7 @@ val_pipeline = [ dict( type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True), - dict(type='RawFrameDecode'), + dict(type='RawFrameDecode', **file_client_args), dict(type='Resize', scale=(-1, 256)), dict(type='FormatShape', input_format='NCTHW', collapse=True), dict(type='PackActionInputs') @@ -92,3 +145,9 @@ optim_wrapper = dict( optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001), clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py new file mode 100644 index 0000000000..a68893a015 --- /dev/null +++ b/configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py @@ -0,0 +1,153 @@ +_base_ = '../../_base_/default_runtime.py' + +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly' + '/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' + 'kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-' + 'steplr-150e_kinetics700-rgb_20220901-f73b3e89.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5)), + data_preprocessor=dict( + type='mmaction.ActionDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_train = f'{anno_root}/ava_train_v2.1.csv' +ann_file_val = f'{anno_root}/ava_val_v2.1.csv' + +exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv' +exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv' + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.' + 'recall_93.9.pkl') +proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='SampleAVAFrames', clip_len=4, frame_interval=16), + dict(type='RawFrameDecode', **file_client_args), + dict(type='RandomRescale', scale_range=(256, 320)), + dict(type='RandomCrop', size=256), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] +# The testing is w/o. 
any cropping / flipping +val_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True), + dict(type='RawFrameDecode', **file_client_args), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + exclude_file=exclude_file_train, + pipeline=train_pipeline, + label_file=label_file, + proposal_file=proposal_file_train, + data_prefix=dict(img=data_root))) +val_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + exclude_file=exclude_file_val, + pipeline=val_pipeline, + label_file=label_file, + proposal_file=proposal_file_val, + data_prefix=dict(img=data_root), + test_mode=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_val, + label_file=label_file, + exclude_file=exclude_file_val) +test_evaluator = val_evaluator + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=20, + by_epoch=True, + milestones=[10, 15], + gamma=0.1) +] + +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001), + clip_grad=dict(max_norm=40, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/mmaction/models/backbones/resnet3d_slowonly.py b/mmaction/models/backbones/resnet3d_slowonly.py index 3a2a3a3ac0..5c1c71c4c2 100644 --- a/mmaction/models/backbones/resnet3d_slowonly.py +++ b/mmaction/models/backbones/resnet3d_slowonly.py @@ -4,12 +4,6 @@ from mmaction.registry import MODELS from .resnet3d_slowfast import ResNet3dPathway -try: - from mmdet.registry import MODELS as MMDET_MODELS - mmdet_imported = True -except (ImportError, ModuleNotFoundError): - mmdet_imported = False - @MODELS.register_module() class ResNet3dSlowOnly(ResNet3dPathway): @@ -43,7 +37,3 @@ def __init__(self, **kwargs) assert not self.lateral - - -if mmdet_imported: - MMDET_MODELS.register_module()(ResNet3dSlowOnly) diff --git a/model-index.yml b/model-index.yml index a41addf98d..ebf462e3f9 100644 --- a/model-index.yml +++ b/model-index.yml @@ -15,7 +15,8 @@ Import: - configs/recognition/trn/metafile.yml - configs/recognition/swin/metafile.yml - configs/recognition/c2d/metafile.yml -- configs/detection/ava/metafile.yml +- configs/detection/slowfast/metafile.yml +- configs/detection/slowonly/metafile.yml - configs/detection/acrn/metafile.yml - configs/skeleton/stgcn/metafile.yml - configs/skeleton/2s-agcn/metafile.yml From 8b9313a06f727672cd1c5e3d67de90a277b5019e Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 30 Mar 2023 10:53:37 +0800 Subject: [PATCH 18/36] [Doc]: Add more social networking links (#2321) --- README.md | 8 +++++++- README_zh-CN.md | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f3a575f4ce..d08d49d2c3 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@
- + @@ -46,6 +46,12 @@ + + + + + +
English | [简体中文](/README_zh-CN.md) diff --git a/README_zh-CN.md b/README_zh-CN.md index 5d0d091cd1..c2ffb09702 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -36,7 +36,7 @@
- + @@ -46,6 +46,12 @@ + + + + + +
[English](/README.md) | 简体中文 From f30e8a45c3819037d617722742e6b14145372e70 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 30 Mar 2023 10:56:38 +0800 Subject: [PATCH 19/36] [Fix] Fix accepting an unexpected argument local-rank in PyTorch 2.0 (#2320) --- tools/misc/clip_feature_extraction.py | 5 ++++- tools/test.py | 2 +- tools/train.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/misc/clip_feature_extraction.py b/tools/misc/clip_feature_extraction.py index 1829bf9b5c..a7a3e67635 100644 --- a/tools/misc/clip_feature_extraction.py +++ b/tools/misc/clip_feature_extraction.py @@ -59,7 +59,10 @@ def parse_args(): choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') - parser.add_argument('--local_rank', type=int, default=0) + # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` + # will pass the `--local-rank` parameter to `tools/train.py` instead + # of `--local_rank`. + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) diff --git a/tools/test.py b/tools/test.py index 0d0d4bd20f..4f310fa9e0 100644 --- a/tools/test.py +++ b/tools/test.py @@ -51,7 +51,7 @@ def parse_args(): choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') - parser.add_argument('--local_rank', type=int, default=0) + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) diff --git a/tools/train.py b/tools/train.py index 2c51c50709..e43078ddb8 100644 --- a/tools/train.py +++ b/tools/train.py @@ -56,7 +56,7 @@ def parse_args(): choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') - parser.add_argument('--local_rank', type=int, default=0) + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) From 264836c8bcfcc0010aa98044062a6ffae9645449 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Fri, 31 Mar 2023 10:55:42 +0800 Subject: [PATCH 20/36] [doc] add opendatalab kinetics link (#2292) --- tools/data/kinetics/README.md | 13 ++++++++++++- tools/data/kinetics/README_zh-CN.md | 16 ++++++++++++++-- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/tools/data/kinetics/README.md b/tools/data/kinetics/README.md index 4fc7b6bb1e..0df8f8634f 100644 --- a/tools/data/kinetics/README.md +++ b/tools/data/kinetics/README.md @@ -24,6 +24,8 @@ Because of the expirations of some YouTube links, the sizes of kinetics dataset | Dataset | training videos | validation videos | | :---------: | :-------------: | :---------------: | | kinetics400 | 240436 | 19796 | +| Kinetics600 | 383393 | 27910 | +| Kinetics700 | 542357 | 34824 | ::: @@ -46,7 +48,16 @@ bash download_backup_annotations.sh ${DATASET} ## Step 2. Prepare Videos -Then, you can run the following script to prepare videos. +### Option 1: Download from OpenDataLab + +**Recommend**: [OpenDataLab](https://opendatalab.com/) provides the Kinetics dataset ([Kinetics400](https://opendatalab.com/Kinetics-400), [Kinetics600](https://opendatalab.com/Kinetics600), [Kinetics700](https://opendatalab.com/Kinetics_700)), users can download Kinetics dataset with short edge 320 pixels from here. 
+ +:::{note} +All experiments on Kinetics in MMAction2 are based on this version, we recommend users to try this version. + +### Option 2: Download from Other Source + +you can run the following script to prepare videos. The codes are adapted from the [official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). Note that this might take a long time. ```shell diff --git a/tools/data/kinetics/README_zh-CN.md b/tools/data/kinetics/README_zh-CN.md index e307b9e7f5..86cb65239e 100644 --- a/tools/data/kinetics/README_zh-CN.md +++ b/tools/data/kinetics/README_zh-CN.md @@ -18,11 +18,14 @@ 请参照 [官方网站](https://deepmind.com/research/open-source/open-source-datasets/kinetics/) 以获取数据集基本信息。此脚本用于准备数据集 kinetics400,kinetics600,kinetics700。为准备 kinetics 数据集的不同版本,用户需将脚本中的 `${DATASET}` 赋值为数据集对应版本名称,可选项为 `kinetics400`,`kinetics600`, `kinetics700`。 在开始之前,用户需确保当前目录为 `$MMACTION2/tools/data/${DATASET}/`。 -**注**:由于部分 YouTube 链接失效,爬取的 Kinetics 数据集大小可能与原版不同。以下是我们所使用 Kinetics 数据集的大小: +:::{note} +由于部分 YouTube 链接失效,爬取的 Kinetics 数据集大小可能与原版不同。以下是我们所使用 Kinetics 数据集的大小: | 数据集 | 训练视频 | 验证集视频 | | :---------: | :------: | :--------: | -| kinetics400 | 240436 | 19796 | +| Kinetics400 | 240436 | 19796 | +| Kinetics600 | 383393 | 27910 | +| Kinetics700 | 542357 | 34824 | ## 1. 准备标注文件 @@ -42,6 +45,15 @@ bash download_backup_annotations.sh ${DATASET} ## 2. 准备视频 +### 选项 1: 从 OpenDataLab 下载 + +**推荐**:[OpenDataLab](https://opendatalab.com/) 提供了 Kinetics 数据集 ([Kinetics400](https://opendatalab.com/Kinetics-400), [Kinetics600](https://opendatalab.com/Kinetics600), [Kinetics700](https://opendatalab.com/Kinetics_700)), 用户可以从这里下载短边长度为 320 的 Kinetics 数据集。 + +:::{note} +MMAction2 代码仓库中提供的 Kinetics 实验性能,都是基于这个版本的数据得到的。我们建议用户使用这个版本的 Kinetics 数据集进行实验。 + +### 选项 2:从其他数据源下载 + 用户可以使用以下脚本准备视频,视频准备代码修改自 [官方爬虫](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics)。注意这一步骤将花费较长时间。 ```shell From ebf4d012be4fb413af03b90400827ba839cd1c0d Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Tue, 4 Apr 2023 14:48:52 +0800 Subject: [PATCH 21/36] [Fix] fix mobilenetv2_tsm (#2332) --- .circleci/test.yml | 1 - ...lenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py | 125 ++++++++++++++++++ mmaction/models/backbones/mobilenet_v2_tsm.py | 8 +- mmaction/models/backbones/resnet_tsm.py | 1 + tests/models/recognizers/test_recognizer2d.py | 9 ++ 5 files changed, 141 insertions(+), 3 deletions(-) create mode 100644 configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py diff --git a/.circleci/test.yml b/.circleci/test.yml index 3984767a12..aafba494dd 100644 --- a/.circleci/test.yml +++ b/.circleci/test.yml @@ -44,7 +44,6 @@ jobs: - run: name: Install Libraries command: | - sudo add-apt-repository ppa:savoury1/ffmpeg4 sudo apt-get update sudo apt-get upgrade sudo apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libturbojpeg pkg-config diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py new file mode 100644 index 0000000000..32c276647f --- /dev/null +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py @@ -0,0 +1,125 @@ +_base_ = [ + '../../_base_/models/tsm_mobilenet_v2.py', + '../../_base_/default_runtime.py' +] + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' 
+data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') + +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='MultiScaleCrop', + input_size=224, + scales=(1, 0.875, 0.75, 0.66), + random_crop=False, + max_wh_scale_gap=1, + num_fixed_crops=13), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=8, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='TenCrop', crop_size=224), + dict(type='FormatShape', input_format='NCHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = val_evaluator + +default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3)) + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5), + dict( + type='MultiStepLR', + begin=0, + end=50, + by_epoch=True, + milestones=[25, 45], + gamma=0.1) +] + +optim_wrapper = dict( + constructor='TSMOptimWrapperConstructor', + paramwise_cfg=dict(fc_lr5=True), + optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001), + clip_grad=dict(max_norm=20, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (16 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=128) diff --git a/mmaction/models/backbones/mobilenet_v2_tsm.py b/mmaction/models/backbones/mobilenet_v2_tsm.py index db2999a8b3..2df95ab47c 100644 --- a/mmaction/models/backbones/mobilenet_v2_tsm.py +++ b/mmaction/models/backbones/mobilenet_v2_tsm.py @@ -21,6 +21,8 @@ def __init__(self, num_segments=8, is_shift=True, shift_div=8, **kwargs): self.num_segments = num_segments self.is_shift = is_shift self.shift_div = shift_div + super().init_weights() + self.init_structure() def make_temporal_shift(self): """Make temporal shift for some layers.""" @@ -33,9 +35,11 @@ def make_temporal_shift(self): shift_div=self.shift_div, ) - def init_weights(self): + def init_structure(self): """Initiate the parameters either from existing checkpoint or from scratch.""" - super().init_weights() if self.is_shift: self.make_temporal_shift() + + def init_weights(self): + pass diff --git a/mmaction/models/backbones/resnet_tsm.py b/mmaction/models/backbones/resnet_tsm.py index 1397384a97..c639e1eae6 100644 --- a/mmaction/models/backbones/resnet_tsm.py +++ b/mmaction/models/backbones/resnet_tsm.py @@ -165,6 +165,7 @@ def __init__(self, self.non_local = non_local self.non_local_stages = _ntuple(self.num_stages)(non_local) self.non_local_cfg = non_local_cfg + # TODO use convert key to load weights super().init_weights() self.init_structure() diff --git a/tests/models/recognizers/test_recognizer2d.py b/tests/models/recognizers/test_recognizer2d.py index a1c8ef4b1f..1acde7fc9c 100644 --- a/tests/models/recognizers/test_recognizer2d.py +++ b/tests/models/recognizers/test_recognizer2d.py @@ -104,11 +104,20 @@ def test_tsn(): def test_tsm(): register_all_modules() + config = get_recognizer_cfg( + 'tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py' # noqa: E501 + ) + config.model['backbone']['pretrained'] = None + + recognizer = MODELS.build(config.model) + recognizer.init_weights() + config = get_recognizer_cfg( 'tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py') config.model['backbone']['pretrained'] = None recognizer = MODELS.build(config.model) + recognizer.init_weights() input_shape = (1, 8, 3, 32, 32) demo_inputs = generate_recognizer_demo_inputs(input_shape) From e07c5e3c99a68c80cc807d83a5480f3694410fd0 Mon Sep 17 00:00:00 2001 From: LinXiaoZheng <90811472+Zheng-LinXiao@users.noreply.github.com> Date: Tue, 4 Apr 2023 14:49:50 +0800 Subject: [PATCH 22/36] [Improve] use mmengine to calculate FLOPs (#2300) --- tools/analysis_tools/get_flops.py | 49 ++++++++++--------------------- 1 file changed, 15 insertions(+), 34 deletions(-) diff --git a/tools/analysis_tools/get_flops.py b/tools/analysis_tools/get_flops.py index b89f5db5ad..fbec21887f 100644 --- a/tools/analysis_tools/get_flops.py +++ b/tools/analysis_tools/get_flops.py @@ -1,21 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import argparse -import torch - -try: - from fvcore.nn import (ActivationCountAnalysis, FlopCountAnalysis, - flop_count_str, flop_count_table, parameter_count) -except ImportError: - print('You may need to install fvcore for flops computation, ' - 'and you can use `pip install -r requirements/optional.txt` ' - 'to set up the environment') -from fvcore.nn.print_model_statistics import _format_size from mmengine import Config from mmengine.registry import init_default_scope from mmaction.registry import MODELS +try: + from mmengine.analysis import get_model_complexity_info +except ImportError: + raise ImportError('Please upgrade mmcv to >0.6.2') + def parse_args(): parser = argparse.ArgumentParser(description='Get model flops and params') @@ -39,17 +34,17 @@ def main(): elif len(args.shape) == 2: input_shape = (1, 3) + tuple(args.shape) elif len(args.shape) == 4: - # n, c, h, w = args.shape + # n, c, h, w = args.shape for 2D recognizer input_shape = tuple(args.shape) elif len(args.shape) == 5: - # n, c, t, h, w = args.shape + # n, c, t, h, w = args.shape for 3D recognizer or + # n, m, t, v, c = args.shape for GCN-based recognizer input_shape = tuple(args.shape) else: raise ValueError('invalid input shape') cfg = Config.fromfile(args.config) init_default_scope(cfg.get('default_scope', 'mmaction')) - model = MODELS.build(cfg.model) model.eval() @@ -60,28 +55,14 @@ def main(): 'FLOPs counter is currently not currently supported with {}'. format(model.__class__.__name__)) - inputs = (torch.randn((1, *input_shape)), ) - flops_ = FlopCountAnalysis(model, inputs) - activations_ = ActivationCountAnalysis(model, inputs) - - flops = _format_size(flops_.total()) - activations = _format_size(activations_.total()) - params = _format_size(parameter_count(model)['']) - - flop_table = flop_count_table( - flops=flops_, - activations=activations_, - show_param_shapes=True, - ) - flop_str = flop_count_str(flops=flops_, activations=activations_) - - print('\n' + flop_str) - print('\n' + flop_table) - + analysis_results = get_model_complexity_info(model, input_shape) + flops = analysis_results['flops_str'] + params = analysis_results['params_str'] + table = analysis_results['out_table'] + print(table) split_line = '=' * 30 - print(f'{split_line}\nInput shape: {input_shape}\n' - f'Flops: {flops}\nParams: {params}\n' - f'Activation: {activations}\n{split_line}') + print(f'\n{split_line}\nInput shape: {input_shape}\n' + f'Flops: {flops}\nParams: {params}\n{split_line}') print('!!!Please be cautious if you use the results in papers. 
' 'You may need to check if all ops are supported and verify that the ' 'flops computation is correct.') From b046879db24b7091cd95440d26d02ffac867f0d4 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Tue, 4 Apr 2023 15:17:07 +0800 Subject: [PATCH 23/36] [Fix] update aciton docker image to ubuntu-22.04 (#2334) --- .github/workflows/merge_stage_test.yml | 30 ++++++++++---------- .github/workflows/pr_stage_test.yml | 39 ++++++++++++++------------ 2 files changed, 36 insertions(+), 33 deletions(-) diff --git a/.github/workflows/merge_stage_test.yml b/.github/workflows/merge_stage_test.yml index 8c9862d049..0d1daed059 100644 --- a/.github/workflows/merge_stage_test.yml +++ b/.github/workflows/merge_stage_test.yml @@ -18,7 +18,7 @@ concurrency: jobs: build_cpu_py: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 strategy: matrix: python-version: [3.8, 3.9] @@ -27,9 +27,9 @@ jobs: - torch: 1.8.1 torchvision: 0.9.1 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Upgrade pip @@ -50,7 +50,7 @@ jobs: - name: Install unittest dependencies run: pip install -r requirements.txt - name: Install PyTorch - run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html - name: Install MMEngine run: pip install git+https://github.com/open-mmlab/mmengine.git@main - name: Install MMCV @@ -75,7 +75,7 @@ jobs: coverage report -m build_cpu_pt: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 strategy: matrix: python-version: [3.7] @@ -96,9 +96,9 @@ jobs: - torch: 1.12.1 torchvision: 0.13.1 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Upgrade pip @@ -118,7 +118,7 @@ jobs: - name: Install TurboJpeg lib run: sudo apt-get install -y libturbojpeg - name: Install PyTorch - run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html - name: Install MMEngine run: pip install git+https://github.com/open-mmlab/mmengine.git@main - name: Install MMCV @@ -153,7 +153,7 @@ jobs: fail_ci_if_error: false build_cu102: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 container: image: pytorch/pytorch:1.8.1-cuda10.2-cudnn7-devel strategy: @@ -163,9 +163,9 @@ jobs: - torch: 1.8.1 cuda: 10.2 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Upgrade pip @@ -201,16 +201,16 @@ jobs: TORCH_CUDA_ARCH_LIST=7.0 pip install -e . 
build_windows: - runs-on: ${{ matrix.os }} + runs-on: windows-2022 strategy: matrix: os: [windows-2022] python: [3.7] platform: [cpu, cu111] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python }} - name: Upgrade pip @@ -220,7 +220,7 @@ jobs: - name: Install lmdb run: pip install lmdb - name: Install PyTorch - run: pip install torch==1.8.1+${{matrix.platform}} torchvision==0.9.1+${{matrix.platform}} -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html + run: pip install torch==${{matrix.torch}}+${{matrix.platform}} torchvision==${{matrix.torchvision}}+${{matrix.platform}} -f https://download.pytorch.org/whl/${{matrix.platform}}/torch_stable.html - name: Install mmaction dependencies run: | pip install git+https://github.com/open-mmlab/mmengine.git@main diff --git a/.github/workflows/pr_stage_test.yml b/.github/workflows/pr_stage_test.yml index 74c1145b5c..a0eb9d5d00 100644 --- a/.github/workflows/pr_stage_test.yml +++ b/.github/workflows/pr_stage_test.yml @@ -16,7 +16,7 @@ concurrency: jobs: build_cpu: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 strategy: matrix: python-version: [3.7] @@ -24,9 +24,9 @@ jobs: - torch: 1.8.1 torchvision: 0.9.1 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Upgrade pip @@ -40,7 +40,7 @@ jobs: - name: Install TurboJpeg lib run: sudo apt-get install -y libturbojpeg - name: Install PyTorch - run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html - name: Install MMEngine run: pip install git+https://github.com/open-mmlab/mmengine.git@main - name: Install MMCV @@ -77,9 +77,11 @@ jobs: fail_ci_if_error: false build_cu102: - runs-on: ubuntu-18.04 + runs-on: ubuntu-22.04 container: image: pytorch/pytorch:1.8.1-cuda10.2-cudnn7-devel + env: + MKL_THREADING_LAYER: GNU strategy: matrix: python-version: [3.7] @@ -87,9 +89,9 @@ jobs: - torch: 1.8.1 cuda: 10.2 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Upgrade pip @@ -105,9 +107,9 @@ jobs: run: | apt-get update && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libturbojpeg libsndfile1 libsm6 libxrender-dev libxext6 - name: Install librosa and soundfile - run: python -m pip install librosa soundfile + run: pip install librosa soundfile - name: Install lmdb - run: python -m pip install lmdb + run: pip install lmdb - name: Install mmaction dependencies run: | pip install git+https://github.com/open-mmlab/mmengine.git@main @@ -117,12 +119,11 @@ jobs: pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x pip install -r requirements.txt - name: Install PytorchVideo - run: python -m pip install pytorchvideo + run: pip install pytorchvideo if: ${{matrix.cuda == '10.2'}} - name: Build and install run: | - python setup.py check -m -s - TORCH_CUDA_ARCH_LIST=7.0 pip install -e . + pip install -e . 
-v - name: Run unittests and generate coverage report run: | coverage run --branch --source mmaction -m pytest tests/ -k 'not timm' @@ -130,16 +131,18 @@ jobs: coverage report -m build_windows: - runs-on: ${{ matrix.os }} + runs-on: windows-2022 strategy: matrix: os: [windows-2022] - python: [3.7] + python-version: [3.7] + torch: [1.8.1] + torchvision: [0.9.1] platform: [cpu, cu111] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python }} - name: Upgrade pip @@ -151,7 +154,7 @@ jobs: - name: Install lmdb run: pip install lmdb - name: Install PyTorch - run: pip install torch==1.8.1+${{matrix.platform}} torchvision==0.9.1+${{matrix.platform}} -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html + run: pip install torch==${{matrix.torch}}+${{matrix.platform}} torchvision==${{matrix.torchvision}}+${{matrix.platform}} -f https://download.pytorch.org/whl/${{matrix.platform}}/torch_stable.html - name: Install timm run: python -m pip install timm - name: Install mmaction dependencies @@ -166,7 +169,7 @@ jobs: run: python -m pip install pytorchvideo - name: Build and install run: | - pip install -e . + pip install -e . -v - name: Run unittests and generate coverage report run: | pytest tests/ From 97f0e637b8c0e3be00db1d7b3be241d087b6f511 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Tue, 4 Apr 2023 16:36:39 +0800 Subject: [PATCH 24/36] [Fix] fix merge stage test (#2336) --- .github/workflows/merge_stage_test.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/merge_stage_test.yml b/.github/workflows/merge_stage_test.yml index 0d1daed059..cf1f2ed10c 100644 --- a/.github/workflows/merge_stage_test.yml +++ b/.github/workflows/merge_stage_test.yml @@ -36,7 +36,6 @@ jobs: run: pip install pip --upgrade - name: Install Libraries run: | - sudo add-apt-repository ppa:savoury1/ffmpeg4 sudo apt-get update sudo apt-get upgrade sudo apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev libturbojpeg pkg-config @@ -65,7 +64,7 @@ jobs: run: pip install pytorchvideo if: ${{matrix.torchvision == '0.10.0'}} - name: Install timm - run: python -m pip install timm + run: pip install timm - name: Build and install run: rm -rf .eggs && pip install -e . 
- name: Run unittests and generate coverage report @@ -110,10 +109,10 @@ jobs: - name: Install lmdb run: pip install lmdb - name: Install timm - run: python -m pip install timm==0.6.7 + run: pip install timm==0.6.7 if: ${{matrix.torch == '1.6.0'}} - name: Install timm - run: python -m pip install timm + run: pip install timm if: ${{matrix.torch != '1.6.0'}} - name: Install TurboJpeg lib run: sudo apt-get install -y libturbojpeg @@ -181,9 +180,9 @@ jobs: run: | apt-get update && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libturbojpeg libsndfile1 libsm6 libxrender-dev libxext6 - name: Install librosa and soundfile - run: python -m pip install librosa soundfile + run: pip install librosa soundfile - name: Install lmdb - run: python -m pip install lmdb + run: pip install lmdb - name: Install mmaction dependencies run: | pip install git+https://github.com/open-mmlab/mmengine.git@main @@ -193,12 +192,11 @@ jobs: pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x pip install -r requirements.txt - name: Install PytorchVideo - run: python -m pip install pytorchvideo + run: pip install pytorchvideo if: ${{matrix.cuda == '10.2'}} - name: Build and install run: | - python setup.py check -m -s - TORCH_CUDA_ARCH_LIST=7.0 pip install -e . + pip install -e . build_windows: runs-on: windows-2022 @@ -207,6 +205,8 @@ jobs: os: [windows-2022] python: [3.7] platform: [cpu, cu111] + torch: [1.8.1] + torchvision: [0.9.1] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python }} From 9c37c22361773ff90ffc8f4a6486cfb4526fd2e6 Mon Sep 17 00:00:00 2001 From: wxDai Date: Thu, 6 Apr 2023 11:23:45 +0800 Subject: [PATCH 25/36] [Docs] Add 20 Minutes Guide (#2325) --- docs/en/guide_to_framework.md | 760 ++++++++++++++++++++++++++++++++++ docs/en/index.rst | 1 + 2 files changed, 761 insertions(+) create mode 100644 docs/en/guide_to_framework.md diff --git a/docs/en/guide_to_framework.md b/docs/en/guide_to_framework.md new file mode 100644 index 0000000000..68f8bdfd41 --- /dev/null +++ b/docs/en/guide_to_framework.md @@ -0,0 +1,760 @@ +# A 20-Minute Guide to MMAction2 FrameWork + +In this tutorial, we will demonstrate the overall architecture of our `MMACTION2 1.0` through a step-by-step example of video action recognition. + +The structure of this tutorial is as follows: + +- [A 20-Minute Guide to MMAction2 FrameWork](#a-20-minute-guide-to-mmaction2-framework) + - [Step0: Prepare Data](#step0-prepare-data) + - [Step1: Build a Pipeline](#step1-build-a-pipeline) + - [Step2: Build a Dataset and DataLoader](#step2-build-a-dataset-and-dataloader) + - [Step3: Build a Recognizer](#step3-build-a-recognizer) + - [Step4: Build a Evaluation Metric](#step4-build-a-evaluation-metric) + - [Step5: Train and Test with Native PyTorch](#step5-train-and-test-with-native-pytorch) + - [Step6: Train and Test with MMEngine (Recommended)](#step6-train-and-test-with-mmengine-recommended) + +First, we need to initialize the `scope` for registry, to ensure that each module is registered under the scope of `mmaction`. For more detailed information about registry, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/registry.html). 
+ +```python +from mmaction.utils import register_all_modules + +register_all_modules(init_default_scope=True) +``` + +## Step0: Prepare Data + +Please download our self-made [kinetics400_tiny](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) dataset and extract it to the `$MMACTION2/data` directory. +The directory structure after extraction should be as follows: + +``` +mmaction2 +├── data +│ ├── kinetics400_tiny +│ │ ├── kinetics_tiny_train_video.txt +│ │ ├── kinetics_tiny_val_video.txt +│ │ ├── train +│ │ │ ├── 27_CSXByd3s.mp4 +│ │ │ ├── 34XczvTaRiI.mp4 +│ │ │ ├── A-wiliK50Zw.mp4 +│ │ │ ├── ... +│ │ └── val +│ │ ├── 0pVGiAU6XEA.mp4 +│ │ ├── AQrbRSnRt8M.mp4 +│ │ ├── ... +``` + +Here are some examples from the annotation file `kinetics_tiny_train_video.txt`: + +``` +D32_1gwq35E.mp4 0 +iRuyZSKhHRg.mp4 1 +oXy-e_P_cAI.mp4 0 +34XczvTaRiI.mp4 1 +h2YqqUhnR34.mp4 0 +``` + +Each line in the file represents the annotation of a video, where the first item denotes the video filename (e.g., `D32_1gwq35E.mp4`), and the second item represents the corresponding label (e.g., label `0` for `D32_1gwq35E.mp4`). In this dataset, there are only `two` categories. + +## Step1: Build a Pipeline + +In order to `decode`, `sample`, `resize`, `crop`, `format`, and `pack` the input video and corresponding annotation, we need to design a pipeline to handle these processes. Specifically, we design seven `Transform` classes to build this video processing pipeline. Note that all `Transform` classes in OpenMMLab must inherit from the `BaseTransform` class in `mmcv`, implement the abstract method `transform`, and be registered to the `TRANSFORMS` registry. For more detailed information about data transform, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/data_transform.html). 
+ +```python +import mmcv +import decord +import numpy as np +from mmcv.transforms import TRANSFORMS, BaseTransform, to_tensor +from mmaction.structures import ActionDataSample + + +@TRANSFORMS.register_module() +class VideoInit(BaseTransform): + def transform(self, results): + container = decord.VideoReader(results['filename']) + results['total_frames'] = len(container) + results['video_reader'] = container + return results + + +@TRANSFORMS.register_module() +class VideoSample(BaseTransform): + def __init__(self, clip_len, num_clips, test_mode=False): + self.clip_len = clip_len + self.num_clips = num_clips + self.test_mode = test_mode + + def transform(self, results): + total_frames = results['total_frames'] + interval = total_frames // self.clip_len + + if self.test_mode: + # Make the sampling during testing deterministic + np.random.seed(42) + + inds_of_all_clips = [] + for i in range(self.num_clips): + bids = np.arange(self.clip_len) * interval + offset = np.random.randint(interval, size=bids.shape) + inds = bids + offset + inds_of_all_clips.append(inds) + + results['frame_inds'] = np.concatenate(inds_of_all_clips) + results['clip_len'] = self.clip_len + results['num_clips'] = self.num_clips + return results + + +@TRANSFORMS.register_module() +class VideoDecode(BaseTransform): + def transform(self, results): + frame_inds = results['frame_inds'] + container = results['video_reader'] + + imgs = container.get_batch(frame_inds).asnumpy() + imgs = list(imgs) + + results['video_reader'] = None + del container + + results['imgs'] = imgs + results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class VideoResize(BaseTransform): + def __init__(self, r_size): + self.r_size = (np.inf, r_size) + + def transform(self, results): + img_h, img_w = results['img_shape'] + new_w, new_h = mmcv.rescale_size((img_w, img_h), self.r_size) + + imgs = [mmcv.imresize(img, (new_w, new_h)) + for img in results['imgs']] + results['imgs'] = imgs + results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class VideoCrop(BaseTransform): + def __init__(self, c_size): + self.c_size = c_size + + def transform(self, results): + img_h, img_w = results['img_shape'] + center_x, center_y = img_w // 2, img_h // 2 + x1, x2 = center_x - self.c_size // 2, center_x + self.c_size // 2 + y1, y2 = center_y - self.c_size // 2, center_y + self.c_size // 2 + imgs = [img[y1:y2, x1:x2] for img in results['imgs']] + results['imgs'] = imgs + results['img_shape'] = imgs[0].shape[:2] + return results + + +@TRANSFORMS.register_module() +class VideoFormat(BaseTransform): + def transform(self, results): + num_clips = results['num_clips'] + clip_len = results['clip_len'] + imgs = results['imgs'] + + # [num_clips*clip_len, H, W, C] + imgs = np.array(imgs) + # [num_clips, clip_len, H, W, C] + imgs = imgs.reshape((num_clips, clip_len) + imgs.shape[1:]) + # [num_clips, C, clip_len, H, W] + imgs = imgs.transpose(0, 4, 1, 2, 3) + + results['imgs'] = imgs + return results + + +@TRANSFORMS.register_module() +class VideoPack(BaseTransform): + def __init__(self, meta_keys=('img_shape', 'num_clips', 'clip_len')): + self.meta_keys = meta_keys + + def transform(self, results): + packed_results = dict() + inputs = to_tensor(results['imgs']) + data_sample = ActionDataSample().set_gt_labels(results['label']) + metainfo = {k: results[k] for k in self.meta_keys if k in results} + data_sample.set_metainfo(metainfo) + packed_results['inputs'] = inputs + packed_results['data_samples'] = 
data_sample + return packed_results +``` + +Below, we provide a code snippet (using `D32_1gwq35E.mp4 0` from the annotation file) to demonstrate how to use the pipeline. + +```python +import os.path as osp +from mmengine.dataset import Compose + +pipeline_cfg = [ + dict(type='VideoInit'), + dict(type='VideoSample', clip_len=16, num_clips=1, test_mode=False), + dict(type='VideoDecode'), + dict(type='VideoResize', r_size=256), + dict(type='VideoCrop', c_size=224), + dict(type='VideoFormat'), + dict(type='VideoPack') +] + +pipeline = Compose(pipeline_cfg) +data_prefix = 'data/kinetics400_tiny/train' +results = dict(filename=osp.join(data_prefix, 'D32_1gwq35E.mp4'), label=0) +packed_results = pipeline(results) + +inputs = packed_results['inputs'] +data_sample = packed_results['data_samples'] + +print('shape of the inputs: ', inputs.shape) + +# Get metainfo of the inputs +print('image_shape: ', data_sample.img_shape) +print('num_clips: ', data_sample.num_clips) +print('clip_len: ', data_sample.clip_len) + +# Get label of the inputs +print('label: ', data_sample.gt_labels.item) +``` + +``` +shape of the inputs: torch.Size([1, 3, 16, 224, 224]) +image_shape: (224, 224) +num_clips: 1 +clip_len: 16 +label: tensor([0]) +``` + +## Step2: Build a Dataset and DataLoader + +All `Dataset` classes in OpenMMLab must inherit from the `BaseDataset` class in `mmengine`. We can customize annotation loading process by overriding the `load_data_list` method. Additionally, we can add more information to the `results` dict that is passed as input to the `pipeline` by overriding the `get_data_info` method. For more detailed information about `BaseDataset` class, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html). + +```python +import os.path as osp +from mmengine.fileio import list_from_file +from mmengine.dataset import BaseDataset +from mmaction.registry import DATASETS + + +@DATASETS.register_module() +class DatasetZelda(BaseDataset): + def __init__(self, ann_file, pipeline, data_root, data_prefix=dict(video=''), + test_mode=False, modality='RGB', **kwargs): + self.modality = modality + super(DatasetZelda, self).__init__(ann_file=ann_file, pipeline=pipeline, data_root=data_root, + data_prefix=data_prefix, test_mode=test_mode, + **kwargs) + + def load_data_list(self): + data_list = [] + fin = list_from_file(self.ann_file) + for line in fin: + line_split = line.strip().split() + filename, label = line_split + label = int(label) + filename = osp.join(self.data_prefix['video'], filename) + data_list.append(dict(filename=filename, label=label)) + return data_list + + def get_data_info(self, idx: int) -> dict: + data_info = super().get_data_info(idx) + data_info['modality'] = self.modality + return data_info +``` + +Next, we will demonstrate how to use dataset and dataloader to index data. We will use the `Runner.build_dataloader` method to construct the dataloader. For more detailed information about dataloader, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/dataset.html#details-on-dataloader). 
+ +```python +from mmaction.registry import DATASETS + +train_pipeline_cfg = [ + dict(type='VideoInit'), + dict(type='VideoSample', clip_len=16, num_clips=1, test_mode=False), + dict(type='VideoDecode'), + dict(type='VideoResize', r_size=256), + dict(type='VideoCrop', c_size=224), + dict(type='VideoFormat'), + dict(type='VideoPack') +] + +val_pipeline_cfg = [ + dict(type='VideoInit'), + dict(type='VideoSample', clip_len=16, num_clips=5, test_mode=True), + dict(type='VideoDecode'), + dict(type='VideoResize', r_size=256), + dict(type='VideoCrop', c_size=224), + dict(type='VideoFormat'), + dict(type='VideoPack') +] + +train_dataset_cfg = dict( + type='DatasetZelda', + ann_file='kinetics_tiny_train_video.txt', + pipeline=train_pipeline_cfg, + data_root='data/kinetics400_tiny/', + data_prefix=dict(video='train')) + +val_dataset_cfg = dict( + type='DatasetZelda', + ann_file='kinetics_tiny_val_video.txt', + pipeline=val_pipeline_cfg, + data_root='data/kinetics400_tiny/', + data_prefix=dict(video='val')) + +train_dataset = DATASETS.build(train_dataset_cfg) + +packed_results = train_dataset[0] + +inputs = packed_results['inputs'] +data_sample = packed_results['data_samples'] + +print('shape of the inputs: ', inputs.shape) + +# Get metainfo of the inputs +print('image_shape: ', data_sample.img_shape) +print('num_clips: ', data_sample.num_clips) +print('clip_len: ', data_sample.clip_len) + +# Get label of the inputs +print('label: ', data_sample.gt_labels.item) + +from mmengine.runner import Runner + +BATCH_SIZE = 2 + +train_dataloader_cfg = dict( + batch_size=BATCH_SIZE, + num_workers=0, + persistent_workers=False, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=train_dataset_cfg) + +val_dataloader_cfg = dict( + batch_size=BATCH_SIZE, + num_workers=0, + persistent_workers=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=val_dataset_cfg) + +train_data_loader = Runner.build_dataloader(dataloader=train_dataloader_cfg) +val_data_loader = Runner.build_dataloader(dataloader=val_dataloader_cfg) + +batched_packed_results = next(iter(train_data_loader)) + +batched_inputs = batched_packed_results['inputs'] +batched_data_sample = batched_packed_results['data_samples'] + +assert len(batched_inputs) == BATCH_SIZE +assert len(batched_data_sample) == BATCH_SIZE +``` + +The terminal output should be the same as the one shown in the [Step1: Build a Pipeline](#step1-build-a-pipeline). + +## Step3: Build a Recognizer + +Next, we will construct the `recognizer`, which mainly consists of three parts: `data preprocessor` for batching and normalizing the data, `backbone` for feature extraction, and `cls_head` for classification. 
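+
+Before looking at each part, here is a small self-contained sketch (using a random tensor rather than real data) showing how the batched `[batch_size, num_clips, C, T, H, W]` tensor produced by the data preprocessor is flattened before entering the backbone. The recognizer implemented below performs exactly this reshape in its `extract_feat` method:
+
+```python
+import torch
+
+# Dummy stand-in for the preprocessed inputs:
+# [batch_size, num_clips, C, T, H, W]
+dummy_inputs = torch.randn(2, 1, 3, 16, 224, 224)
+
+# Merge the batch and clip dimensions so that every clip is processed by the
+# 3D backbone as an independent sample.
+flattened = dummy_inputs.view((-1, ) + dummy_inputs.shape[2:])
+print(flattened.shape)  # torch.Size([2, 3, 16, 224, 224])
+```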
+ +The implementation of `data_preprocessor` is as follows: + +```python +import torch +from mmengine.model import BaseDataPreprocessor, stack_batch +from mmaction.registry import MODELS + + +@MODELS.register_module() +class DataPreprocessorZelda(BaseDataPreprocessor): + def __init__(self, mean, std): + super().__init__() + + self.register_buffer( + 'mean', + torch.tensor(mean, dtype=torch.float32).view(-1, 1, 1, 1), + False) + self.register_buffer( + 'std', + torch.tensor(std, dtype=torch.float32).view(-1, 1, 1, 1), + False) + + def forward(self, data, training=False): + data = self.cast_data(data) + inputs = data['inputs'] + batch_inputs = stack_batch(inputs) # Batching + batch_inputs = (batch_inputs - self.mean) / self.std # Normalization + data['inputs'] = batch_inputs + return data +``` + +Here is the usage of data_preprocessor: feed the `batched_packed_results` obtained from the [Step2: Build a Dataset and DataLoader](#step2-build-a-dataset-and-dataloader) into the `data_preprocessor` for batching and normalization. + +```python +from mmaction.registry import MODELS + +data_preprocessor_cfg = dict( + type='DataPreprocessorZelda', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375]) + +data_preprocessor = MODELS.build(data_preprocessor_cfg) + +preprocessed_inputs = data_preprocessor(batched_packed_results) +print(preprocessed_inputs['inputs'].shape) +``` + +``` +torch.Size([2, 1, 3, 16, 224, 224]) +``` + +The implementations of `backbone`, `cls_head` and `recognizer` are as follows: + +```python +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModel, BaseModule, Sequential +from mmengine.structures import LabelData +from mmaction.registry import MODELS + + +@MODELS.register_module() +class BackBoneZelda(BaseModule): + def __init__(self, init_cfg=None): + if init_cfg is None: + init_cfg = [dict(type='Kaiming', layer='Conv3d', mode='fan_out', nonlinearity="relu"), + dict(type='Constant', layer='BatchNorm3d', val=1, bias=0)] + + super(BackBoneZelda, self).__init__(init_cfg=init_cfg) + + self.conv1 = Sequential(nn.Conv3d(3, 64, kernel_size=(3, 7, 7), + stride=(1, 2, 2), padding=(1, 3, 3)), + nn.BatchNorm3d(64), nn.ReLU()) + self.maxpool = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), + padding=(0, 1, 1)) + + self.conv = Sequential(nn.Conv3d(64, 128, kernel_size=3, stride=2, padding=1), + nn.BatchNorm3d(128), nn.ReLU()) + + def forward(self, imgs): + # imgs: [batch_size*num_views, 3, T, H, W] + # features: [batch_size*num_views, 128, T/2, H//8, W//8] + features = self.conv(self.maxpool(self.conv1(imgs))) + return features + + +@MODELS.register_module() +class ClsHeadZelda(BaseModule): + def __init__(self, num_classes, in_channels, dropout=0.5, average_clips='prob', init_cfg=None): + if init_cfg is None: + init_cfg = dict(type='Normal', layer='Linear', std=0.01) + + super(ClsHeadZelda, self).__init__(init_cfg=init_cfg) + + self.num_classes = num_classes + self.in_channels = in_channels + self.average_clips = average_clips + + if dropout != 0: + self.dropout = nn.Dropout(dropout) + else: + self.dropout = None + + self.fc = nn.Linear(self.in_channels, self.num_classes) + self.pool = nn.AdaptiveAvgPool3d(1) + self.loss_fn = nn.CrossEntropyLoss() + + def forward(self, x): + N, C, T, H, W = x.shape + x = self.pool(x) + x = x.view(N, C) + assert x.shape[1] == self.in_channels + + if self.dropout is not None: + x = self.dropout(x) + + cls_scores = self.fc(x) + return cls_scores + + def loss(self, feats, data_samples): + 
cls_scores = self(feats) + labels = torch.stack([x.gt_labels.item for x in data_samples]) + labels = labels.squeeze() + + if labels.shape == torch.Size([]): + labels = labels.unsqueeze(0) + + loss_cls = self.loss_fn(cls_scores, labels) + return dict(loss_cls=loss_cls) + + def predict(self, feats, data_samples): + cls_scores = self(feats) + num_views = cls_scores.shape[0] // len(data_samples) + # assert num_views == data_samples[0].num_clips + cls_scores = self.average_clip(cls_scores, num_views) + + for ds, sc in zip(data_samples, cls_scores): + pred = LabelData(item=sc) + ds.pred_scores = pred + return data_samples + + def average_clip(self, cls_scores, num_views): + if self.average_clips not in ['score', 'prob', None]: + raise ValueError(f'{self.average_clips} is not supported. ' + f'Currently supported ones are ' + f'["score", "prob", None]') + + total_views = cls_scores.shape[0] + cls_scores = cls_scores.view(total_views // num_views, num_views, -1) + + if self.average_clips is None: + return cls_scores + elif self.average_clips == 'prob': + cls_scores = F.softmax(cls_scores, dim=2).mean(dim=1) + elif self.average_clips == 'score': + cls_scores = cls_scores.mean(dim=1) + + return cls_scores + + +@MODELS.register_module() +class RecognizerZelda(BaseModel): + def __init__(self, backbone, cls_head, data_preprocessor): + super().__init__(data_preprocessor=data_preprocessor) + + self.backbone = MODELS.build(backbone) + self.cls_head = MODELS.build(cls_head) + + def extract_feat(self, inputs): + inputs = inputs.view((-1, ) + inputs.shape[2:]) + return self.backbone(inputs) + + def loss(self, inputs, data_samples): + feats = self.extract_feat(inputs) + loss = self.cls_head.loss(feats, data_samples) + return loss + + def predict(self, inputs, data_samples): + feats = self.extract_feat(inputs) + predictions = self.cls_head.predict(feats, data_samples) + return predictions + + def forward(self, inputs, data_samples=None, mode='tensor'): + if mode == 'tensor': + return self.extract_feat(inputs) + elif mode == 'loss': + return self.loss(inputs, data_samples) + elif mode == 'predict': + return self.predict(inputs, data_samples) + else: + raise RuntimeError(f'Invalid mode: {mode}') +``` + +The `init_cfg` is used for model weight initialization. For more information on model weight initialization, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/initialize.html). 
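+
+As a minimal illustration (the checkpoint path below is only a placeholder for this sketch, not a file shipped with this guide), `init_cfg` can also point to existing weights through the `Pretrained` initializer instead of sampling them from a random distribution:
+
+```python
+# Hypothetical sketch: initialize BackBoneZelda from a saved checkpoint
+# rather than with the Kaiming/Constant initializers defined above.
+backbone_cfg = dict(
+    type='BackBoneZelda',
+    init_cfg=dict(type='Pretrained', checkpoint='path/to/your_checkpoint.pth'))
+```
+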
The usage of the above modules is as follows: + +```python +import torch +import copy +from mmaction.registry import MODELS + +model_cfg = dict( + type='RecognizerZelda', + backbone=dict(type='BackBoneZelda'), + cls_head=dict( + type='ClsHeadZelda', + num_classes=2, + in_channels=128, + average_clips='prob'), + data_preprocessor = dict( + type='DataPreprocessorZelda', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375])) + +model = MODELS.build(model_cfg) + +# Train +model.train() +model.init_weights() +data_batch_train = copy.deepcopy(batched_packed_results) +data = model.data_preprocessor(data_batch_train, training=True) +loss = model(**data, mode='loss') +print('loss dict: ', loss) + +# Test +with torch.no_grad(): + model.eval() + data_batch_test = copy.deepcopy(batched_packed_results) + data = model.data_preprocessor(data_batch_test, training=False) + predictions = model(**data, mode='predict') +print('Label of Sample[0]', predictions[0].gt_labels.item) +print('Scores of Sample[0]', predictions[0].pred_scores.item) +``` + +```shell +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.0.weight - torch.Size([64, 3, 3, 7, 7]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.0.bias - torch.Size([64]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.1.weight - torch.Size([64]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +backbone.conv1.1.bias - torch.Size([64]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.0.weight - torch.Size([128, 64, 3, 3, 3]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.0.bias - torch.Size([128]): +KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0 + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.1.weight - torch.Size([128]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +backbone.conv.1.bias - torch.Size([128]): +The value is the same before and after calling `init_weights` of RecognizerZelda + +04/03 23:28:01 - mmengine - INFO - +cls_head.fc.weight - torch.Size([2, 128]): +NormalInit: mean=0, std=0.01, bias=0 + +04/03 23:28:01 - mmengine - INFO - +cls_head.fc.bias - torch.Size([2]): +NormalInit: mean=0, std=0.01, bias=0 + +loss dict: {'loss_cls': tensor(0.6853, grad_fn=)} +Label of Sample[0] tensor([0]) +Scores of Sample[0] tensor([0.5240, 0.4760]) +``` + +## Step4: Build a Evaluation Metric + +Note that all `Metric` classes in `OpenMMLab` must inherit from the `BaseMetric` class in `mmengine` and implement the abstract methods, `process` and `compute_metrics`. For more information on evaluation, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/evaluation.html). 
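+
+The metric below delegates the actual computation to `top_k_accuracy` from `mmaction.evaluation`. As a quick standalone illustration with made-up scores (two samples, two classes, following the same input format the metric uses):
+
+```python
+import numpy as np
+
+from mmaction.evaluation import top_k_accuracy
+
+# Each element of `scores` holds the per-class prediction scores of one
+# sample; `labels` holds the corresponding ground-truth class indices.
+scores = [np.array([0.7, 0.3]), np.array([0.2, 0.8])]
+labels = [0, 1]
+print(top_k_accuracy(scores, labels, (1, )))  # [1.0]
+```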
+ +```python +import copy +from collections import OrderedDict +from mmengine.evaluator import BaseMetric +from mmaction.evaluation import top_k_accuracy +from mmaction.registry import METRICS + + +@METRICS.register_module() +class AccuracyMetric(BaseMetric): + def __init__(self, topk=(1, 5), collect_device='cpu', prefix='acc'): + super().__init__(collect_device=collect_device, prefix=prefix) + self.topk = topk + + def process(self, data_batch, data_samples): + data_samples = copy.deepcopy(data_samples) + for data_sample in data_samples: + result = dict() + scores = data_sample['pred_scores']['item'].cpu().numpy() + label = data_sample['gt_labels']['item'].item() + result['scores'] = scores + result['label'] = label + self.results.append(result) + + def compute_metrics(self, results: list) -> dict: + eval_results = OrderedDict() + labels = [res['label'] for res in results] + scores = [res['scores'] for res in results] + topk_acc = top_k_accuracy(scores, labels, self.topk) + for k, acc in zip(self.topk, topk_acc): + eval_results[f'topk{k}'] = acc + return eval_results +``` + +```python +from mmaction.registry import METRICS + +metric_cfg = dict(type='AccuracyMetric', topk=(1, 5)) + +metric = METRICS.build(metric_cfg) + +data_samples = [d.to_dict() for d in predictions] + +metric.process(batched_packed_results, data_samples) +acc = metric.compute_metrics(metric.results) +print(acc) +``` + +```shell +OrderedDict([('topk1', 0.5), ('topk5', 1.0)]) +``` + +## Step5: Train and Test with Native PyTorch + +```python +import torch.optim as optim +from mmengine import track_iter_progress + + +device = 'cuda' # or 'cpu' +max_epochs = 10 + +optimizer = optim.Adam(model.parameters(), lr=0.01) + +for epoch in range(max_epochs): + model.train() + losses = [] + for data_batch in track_iter_progress(train_data_loader): + data = model.data_preprocessor(data_batch, training=True) + loss_dict = model(**data, mode='loss') + loss = loss_dict['loss_cls'] + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + losses.append(loss.item()) + + print(f'Epoch[{epoch}]: loss ', sum(losses) / len(train_data_loader)) + + with torch.no_grad(): + model.eval() + for data_batch in track_iter_progress(val_data_loader): + data = model.data_preprocessor(data_batch, training=False) + predictions = model(**data, mode='predict') + data_samples = [d.to_dict() for d in predictions] + metric.process(data_batch, data_samples) + + acc = metric.acc = metric.compute_metrics(metric.results) + for name, topk in acc.items(): + print(f'{name}: ', topk) +``` + +## Step6: Train and Test with MMEngine (Recommended) + +For more details on training and testing, you can refer to [MMAction2 Tutorial](https://mmaction2.readthedocs.io/en/1.x/user_guides/4_train_test.html). For more information on `Runner`, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html). 
+ +```python +from mmengine.runner import Runner + +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=1) +val_cfg = dict(type='ValLoop') + +optim_wrapper = dict(optimizer=dict(type='Adam', lr=0.01)) + +runner = Runner(model=model_cfg, work_dir='./work_dirs/guide', + train_dataloader=train_dataloader_cfg, + train_cfg=train_cfg, + val_dataloader=val_dataloader_cfg, + val_cfg=val_cfg, + optim_wrapper=optim_wrapper, + val_evaluator=[metric_cfg], + default_scope='mmaction') +runner.train() +``` diff --git a/docs/en/index.rst b/docs/en/index.rst index 59e3e49b53..392b64ef45 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -8,6 +8,7 @@ You can switch between Chinese and English documents in the lower-left corner of :caption: Get Started get_started.md + guide_to_framework.md .. toctree:: :maxdepth: 1 From 7754e85d95a1994eec44c1e9f3a0fb0291778e26 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 6 Apr 2023 11:36:41 +0800 Subject: [PATCH 26/36] [fix] fix channel order when show video (#2308) --- mmaction/registry.py | 5 +++++ mmaction/visualization/action_visualizer.py | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/mmaction/registry.py b/mmaction/registry.py index 28d237daa8..6d7d831db1 100644 --- a/mmaction/registry.py +++ b/mmaction/registry.py @@ -9,6 +9,7 @@ from mmengine.registry import DATA_SAMPLERS as MMENGINE_DATA_SAMPLERS from mmengine.registry import DATASETS as MMENGINE_DATASETS from mmengine.registry import EVALUATOR as MMENGINE_EVALUATOR +from mmengine.registry import FUNCTIONS as MMENGINE_FUNCTION from mmengine.registry import HOOKS as MMENGINE_HOOKS from mmengine.registry import INFERENCERS as MMENGINE_INFERENCERS from mmengine.registry import LOG_PROCESSORS as MMENGINE_LOG_PROCESSORS @@ -127,3 +128,7 @@ 'inferencer', parent=MMENGINE_INFERENCERS, locations=['mmaction.apis.inferencers']) + +# manage function +FUNCTION = Registry( + 'function', parent=MMENGINE_FUNCTION, locations=['mmaction.mmengine']) diff --git a/mmaction/visualization/action_visualizer.py b/mmaction/visualization/action_visualizer.py index 48c595fd5b..6fc5ae2123 100644 --- a/mmaction/visualization/action_visualizer.py +++ b/mmaction/visualization/action_visualizer.py @@ -268,7 +268,10 @@ def add_datasample(self, wait_time = frame_wait_time else: wait_time = wait_time - self.show(drawn_img, win_name=frame_name, wait_time=wait_time) + self.show( + drawn_img[:, :, ::-1], + win_name=frame_name, + wait_time=wait_time) resulted_video = np.array(resulted_video) if out_path is not None: From d8decfe78034174d78f5edf688ee0ac349abb3ee Mon Sep 17 00:00:00 2001 From: Kai Hu Date: Wed, 5 Apr 2023 23:37:26 -0400 Subject: [PATCH 27/36] [Refactor] speed up LFB training (#2294) --- configs/detection/lfb/README.md | 15 ++- configs/detection/lfb/metafile.yml | 2 +- .../lfb/slowonly-lfb-infer_r50_ava21-rgb.py | 114 ++++++++++++++++++ ...etrained-r50_8xb12-4x16x1-20e_ava21-rgb.py | 59 ++++++++- mmaction/models/roi_heads/shared_heads/lfb.py | 24 ++-- 5 files changed, 190 insertions(+), 24 deletions(-) create mode 100644 configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py diff --git a/configs/detection/lfb/README.md b/configs/detection/lfb/README.md index 1d33a7d7e9..dabb3a1b46 100644 --- a/configs/detection/lfb/README.md +++ b/configs/detection/lfb/README.md @@ -22,7 +22,7 @@ To understand the world, we humans constantly need to relate the present to the | frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | gpu_mem(M) 
| config | ckpt | log | | :---------------------: | :--------: | :--: | :----------------------------------: | :----------: | :---: | :--------: | :---------------------------------: | :-------------------------------: | :------------------------------: | -| 4x16x1 | raw | 8 | SlowOnly ResNet50 (with Nonlocal LFB) | Kinetics-400 | 24.05 | 8620 | [config](/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb_20220906-4c5b9f25.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.log) | +| 4x16x1 | raw | 8 | SlowOnly ResNet50 (with Nonlocal LFB) | Kinetics-400 | 24.11 | 8620 | [config](/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb_20220906-4c5b9f25.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.log) | | 4x16x1 | raw | 8 | SlowOnly ResNet50 (with Max LFB) | Kinetics-400 | 22.15 | 8425 | [config](/configs/detection/lfb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb_20220906-4963135b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.log) | Note: @@ -33,8 +33,7 @@ Note: 2. We use `slowonly_r50_4x16x1` instead of `I3D-R50-NL` in the original paper as the backbone of LFB, but we have achieved the similar improvement: (ours: 20.1 -> 24.05 vs. author: 22.1 -> 25.8). 3. Because the long-term features are randomly sampled in testing, the test accuracy may have some differences. 4. Before train or test lfb, you need to infer feature bank with the [slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py](/configs/detection/lfb/slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py). For more details on infer feature bank, you can refer to [Train](#Train) part. -5. You can also dowonload long-term feature bank from [AVA_train_val_float32_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float32_lfb.rar) or [AVA_train_val_float16_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float16_lfb.rar), and then put them on `lfb_prefix_path`. -6. The ROIHead now supports single-label classification (i.e. the network outputs at most +5. The ROIHead now supports single-label classification (i.e. the network outputs at most one-label per actor). This can be done by (a) setting multilabel=False during training and the test_cfg.rcnn.action_thr for testing. @@ -42,7 +41,7 @@ Note: ### a. 
Infer long-term feature bank for training

-Before train or test lfb, you need to infer long-term feature bank first.
+Before train or test lfb, you need to infer long-term feature bank first. You can also download long-term feature bank from [AVA_train_val_float32_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float32_lfb.rar) or [AVA_train_val_float16_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float16_lfb.rar), and then put them under `lfb_prefix_path`. In this case, you can skip this step.

 Specifically, run the test on the training, validation, testing dataset with the config file [slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py](/configs/detection/lfb/slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py) (The config file will only infer the feature bank of training dataset and you need set `dataset_mode = 'val'` to infer the feature bank of validation dataset in the config file.), and the shared head [LFBInferHead](/mmaction/models/roi_heads/shared_heads/lfb_infer_head.py) will generate the feature bank.
@@ -52,12 +51,12 @@ You can use the following command to infer feature bank of AVA training and vali

 ```shell
 # set `dataset_mode = 'train'` in lfb_slowonly_r50_ava_infer.py
-python tools/test.py slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py \
-    checkpoints/YOUR_BASELINE_CHECKPOINT.pth --eval mAP
+python tools/test.py configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py \
+    checkpoints/YOUR_BASELINE_CHECKPOINT.pth

 # set `dataset_mode = 'val'` in lfb_slowonly_r50_ava_infer.py
-python tools/test.py slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py \
-    checkpoints/YOUR_BASELINE_CHECKPOINT.pth --eval mAP
+python tools/test.py configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py \
+    checkpoints/YOUR_BASELINE_CHECKPOINT.pth
 ```

 We use [slowonly_r50_4x16x1 checkpoint](https://download.openmmlab.com/mmaction/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb_20201217-40061d5f.pth) from [slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb](/configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) to infer feature bank.
diff --git a/configs/detection/lfb/metafile.yml b/configs/detection/lfb/metafile.yml
index 055032ad18..c1de15768f 100644
--- a/configs/detection/lfb/metafile.yml
+++ b/configs/detection/lfb/metafile.yml
@@ -22,7 +22,7 @@ Models:
   - Dataset: AVA v2.1
     Task: Action Detection
     Metrics:
-      mAP: 24.05
+      mAP: 24.11
     Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.log
     Weights: https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb_20220906-4c5b9f25.pth
diff --git a/configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py b/configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py
new file mode 100644
index 0000000000..278d87c1e1
--- /dev/null
+++ b/configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py
@@ -0,0 +1,114 @@
+# This config is used to generate long-term feature bank.
+_base_ = '../../_base_/default_runtime.py' + +# model settings +lfb_prefix_path = 'data/ava/lfb_half' +dataset_mode = 'train' # ['train', 'val', 'test'] + +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-' + 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' + 'kinetics400-rgb_20220901-e7b65fad.pth') + +model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), + roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2048, + num_classes=81, + multilabel=True, + dropout_ratio=0.5), + shared_head=dict( + type='LFBInferHead', + lfb_prefix_path=lfb_prefix_path, + dataset_mode=dataset_mode, + use_half_precision=True)), + data_preprocessor=dict( + type='ActionDataPreprocessor', + _scope_='mmaction', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) + +# dataset settings +dataset_type = 'AVADataset' +data_root = 'data/ava/rawframes' +anno_root = 'data/ava/annotations' + +ann_file_infer = f'{anno_root}/ava_{dataset_mode}_v2.1.csv' + +exclude_file_infer = ( + f'{anno_root}/ava_{dataset_mode}_excluded_timestamps_v2.1.csv') + +label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt' + +proposal_file_infer = ( + f'{anno_root}/ava_dense_proposals_{dataset_mode}.FAIR.recall_93.9.pkl') + +infer_pipeline = [ + dict( + type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='FormatShape', input_format='NCTHW', collapse=True), + dict(type='PackActionInputs') +] + +test_dataloader = dict( + batch_size=1, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_infer, + exclude_file=exclude_file_infer, + pipeline=infer_pipeline, + label_file=label_file, + proposal_file=proposal_file_infer, + data_prefix=dict(img=data_root), + person_det_score_thr=0.9, + test_mode=True)) + +test_cfg = dict(type='TestLoop') +test_evaluator = dict( + type='AVAMetric', + ann_file=ann_file_infer, + label_file=label_file, + exclude_file=exclude_file_infer, + action_thr=0.0) diff --git a/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py b/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py index 2da2bd3a7c..9d323ad0e4 100644 --- a/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py +++ b/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py @@ -1,6 +1,4 @@ -_base_ = [ - '../../_base_/default_runtime.py', '../_base_/models/slowonly_r50.py' -] +_base_ = 
'../../_base_/default_runtime.py' # model settings lfb_prefix_path = 'data/ava/lfb_half' @@ -10,8 +8,39 @@ lfb_channels = 2048 dataset_modes = ('train', 'val') +url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/' + 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-' + 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_' + 'kinetics400-rgb_20220901-e7b65fad.pth') + model = dict( + type='FastRCNN', + _scope_='mmdet', + init_cfg=dict(type='Pretrained', checkpoint=url), + backbone=dict( + type='mmaction.ResNet3dSlowOnly', + depth=50, + pretrained=None, + pretrained2d=False, + lateral=False, + num_stages=4, + conv1_kernel=(1, 7, 7), + conv1_stride_t=1, + pool1_stride_t=1, + spatial_strides=(1, 2, 2, 1)), roi_head=dict( + type='AVARoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor3D', + roi_layer_type='RoIAlign', + output_size=8, + with_temporal_pool=True), + bbox_head=dict( + type='BBoxHeadAVA', + in_channels=2560, + num_classes=81, + multilabel=True, + dropout_ratio=0.5), shared_head=dict( type='FBOHead', lfb_cfg=dict( @@ -31,8 +60,28 @@ num_non_local_layers=2, st_feat_dropout_ratio=0.2, lt_feat_dropout_ratio=0.2, - pre_activate=True)), - bbox_head=dict(in_channels=2560))) + pre_activate=True))), + data_preprocessor=dict( + type='ActionDataPreprocessor', + _scope_='mmaction', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCTHW'), + train_cfg=dict( + rcnn=dict( + assigner=dict( + type='MaxIoUAssignerAVA', + pos_iou_thr=0.9, + neg_iou_thr=0.9, + min_pos_iou=0.9), + sampler=dict( + type='RandomSampler', + num=32, + pos_fraction=1, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=1.0)), + test_cfg=dict(rcnn=None)) dataset_type = 'AVADataset' data_root = 'data/ava/rawframes' diff --git a/mmaction/models/roi_heads/shared_heads/lfb.py b/mmaction/models/roi_heads/shared_heads/lfb.py index e8e7afff2a..986c784403 100644 --- a/mmaction/models/roi_heads/shared_heads/lfb.py +++ b/mmaction/models/roi_heads/shared_heads/lfb.py @@ -4,7 +4,6 @@ import os.path as osp import warnings -import numpy as np import torch import torch.distributed as dist from mmengine.dist import get_dist_info @@ -130,6 +129,13 @@ def load_lfb(self, map_location): osp.join(self.lfb_prefix_path, f'lfb_{dataset_mode}.pkl')) print(f'Loading LFB from {lfb_path}...') self.lfb.update(torch.load(lfb_path, map_location=map_location)) + + for video_id in self.lfb: + video_features = self.lfb[video_id] + for sec in video_features: + if isinstance(video_features[sec], (list, tuple)): + video_features[sec] = torch.stack(video_features[sec]) + self.lfb[video_id] = video_features print(f'LFB has been loaded on {map_location}.') def load_lfb_on_lmdb(self): @@ -162,22 +168,20 @@ def sample_long_term_features(self, video_id, timestamp): # Sample long term features. window_size, K = self.window_size, self.max_num_sampled_feat start = timestamp - (window_size // 2) - lt_feats = torch.zeros(window_size * K, self.lfb_channels) + lt_feats = torch.zeros(window_size, K, self.lfb_channels) for idx, sec in enumerate(range(start, start + window_size)): if sec in video_features: # `num_feat` is the number of roi features in this second. - num_feat = len(video_features[sec]) - num_feat_sampled = min(num_feat, K) - # Sample some roi features randomly. 
- random_lfb_indices = np.random.choice( - range(num_feat), num_feat_sampled, replace=False) + feat = video_features[sec] + num_feat = feat.shape[0] - for k, rand_idx in enumerate(random_lfb_indices): - lt_feats[idx * K + k] = video_features[sec][rand_idx] + # Sample some roi features randomly. + random_lfb_indices = torch.randperm(num_feat)[:K] + lt_feats[idx, :num_feat] = feat[random_lfb_indices] # [window_size * max_num_sampled_feat, lfb_channels] - return lt_feats + return lt_feats.reshape(-1, self.lfb_channels) def __getitem__(self, img_key): """Sample long term features like `lfb['0f39OWEqJ24,0902']` where `lfb` From b9aa560875ee126df616caf4c458a339a5a5305b Mon Sep 17 00:00:00 2001 From: Haodong Duan Date: Thu, 6 Apr 2023 11:42:46 +0800 Subject: [PATCH 28/36] [Refactoring] Faster AVA Evaluation using multiprocessing (#2146) --- .../object_detection_evaluation.py | 574 ------------------ .../ava_evaluation/per_image_evaluation.py | 358 ----------- .../ava_evaluation/standard_fields.py | 115 ---- mmaction/evaluation/functional/ava_utils.py | 162 +++-- mmaction/evaluation/metrics/ava_metric.py | 1 + 5 files changed, 111 insertions(+), 1099 deletions(-) delete mode 100644 mmaction/evaluation/functional/ava_evaluation/object_detection_evaluation.py delete mode 100644 mmaction/evaluation/functional/ava_evaluation/per_image_evaluation.py delete mode 100644 mmaction/evaluation/functional/ava_evaluation/standard_fields.py diff --git a/mmaction/evaluation/functional/ava_evaluation/object_detection_evaluation.py b/mmaction/evaluation/functional/ava_evaluation/object_detection_evaluation.py deleted file mode 100644 index 1886521485..0000000000 --- a/mmaction/evaluation/functional/ava_evaluation/object_detection_evaluation.py +++ /dev/null @@ -1,574 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================= -"""object_detection_evaluation module. - -ObjectDetectionEvaluation is a class which manages ground truth information of -a object detection dataset, and computes frequently used detection metrics such -as Precision, Recall, CorLoc of the provided detection results. -It supports the following operations: -1) Add ground truth information of images sequentially. -2) Add detection result of images sequentially. -3) Evaluate detection metrics on already inserted detection results. -4) Write evaluation result into a pickle file for future processing or - visualization. - -Note: This module operates on numpy boxes and box lists. -""" - -import collections -import logging -import warnings -from abc import ABCMeta, abstractmethod -from collections import defaultdict - -import numpy as np - -from . import metrics, per_image_evaluation, standard_fields - - -class DetectionEvaluator: - """Interface for object detection evaluation classes. 
- - Example usage of the Evaluator: - ------------------------------ - evaluator = DetectionEvaluator(categories) - - # Detections and groundtruth for image 1. - evaluator.add_single_groundtruth_image_info(...) - evaluator.add_single_detected_image_info(...) - - # Detections and groundtruth for image 2. - evaluator.add_single_groundtruth_image_info(...) - evaluator.add_single_detected_image_info(...) - - metrics_dict = evaluator.evaluate() - """ - - __metaclass__ = ABCMeta - - def __init__(self, categories): - """Constructor. - - Args: - categories: A list of dicts, each of which has the following keys - - 'id': (required) an integer id uniquely identifying this - category. - 'name': (required) string representing category name e.g., - 'cat', 'dog'. - """ - self._categories = categories - - @abstractmethod - def add_single_ground_truth_image_info(self, image_id, groundtruth_dict): - """Adds groundtruth for a single image to be used for evaluation. - - Args: - image_id: A unique string/integer identifier for the image. - groundtruth_dict: A dictionary of groundtruth numpy arrays required - for evaluations. - """ - - @abstractmethod - def add_single_detected_image_info(self, image_id, detections_dict): - """Adds detections for a single image to be used for evaluation. - - Args: - image_id: A unique string/integer identifier for the image. - detections_dict: A dictionary of detection numpy arrays required - for evaluation. - """ - - @abstractmethod - def evaluate(self): - """Evaluates detections and returns a dictionary of metrics.""" - - @abstractmethod - def clear(self): - """Clears the state to prepare for a fresh evaluation.""" - - -class ObjectDetectionEvaluator(DetectionEvaluator): - """A class to evaluate detections.""" - - def __init__(self, - categories, - matching_iou_threshold=0.5, - evaluate_corlocs=False, - metric_prefix=None, - use_weighted_mean_ap=False, - evaluate_masks=False): - """Constructor. - - Args: - categories: A list of dicts, each of which has the following keys - - 'id': (required) an integer id uniquely identifying this - category. - 'name': (required) string representing category name e.g., - 'cat', 'dog'. - matching_iou_threshold: IOU threshold to use for matching - groundtruth boxes to detection boxes. - evaluate_corlocs: (optional) boolean which determines if corloc - scores are to be returned or not. - metric_prefix: (optional) string prefix for metric name; if None, - no prefix is used. - use_weighted_mean_ap: (optional) boolean which determines if the - mean average precision is computed directly from the scores and - tp_fp_labels of all classes. - evaluate_masks: If False, evaluation will be performed based on - boxes. If True, mask evaluation will be performed instead. - - Raises: - ValueError: If the category ids are not 1-indexed. 
- """ - super(ObjectDetectionEvaluator, self).__init__(categories) - self._num_classes = max([cat['id'] for cat in categories]) - if min(cat['id'] for cat in categories) < 1: - raise ValueError('Classes should be 1-indexed.') - self._matching_iou_threshold = matching_iou_threshold - self._use_weighted_mean_ap = use_weighted_mean_ap - self._label_id_offset = 1 - self._evaluate_masks = evaluate_masks - self._evaluation = ObjectDetectionEvaluation( - num_groundtruth_classes=self._num_classes, - matching_iou_threshold=self._matching_iou_threshold, - use_weighted_mean_ap=self._use_weighted_mean_ap, - label_id_offset=self._label_id_offset, - ) - self._image_ids = set([]) - self._evaluate_corlocs = evaluate_corlocs - self._metric_prefix = (metric_prefix + '_') if metric_prefix else '' - - def add_single_ground_truth_image_info(self, image_id, groundtruth_dict): - """Adds groundtruth for a single image to be used for evaluation. - - Args: - image_id: A unique string/integer identifier for the image. - groundtruth_dict: A dictionary containing - - standard_fields.InputDataFields.groundtruth_boxes: float32 - numpy array of shape [num_boxes, 4] containing `num_boxes` - groundtruth boxes of the format [ymin, xmin, ymax, xmax] in - absolute image coordinates. - standard_fields.InputDataFields.groundtruth_classes: integer - numpy array of shape [num_boxes] containing 1-indexed - groundtruth classes for the boxes. - standard_fields.InputDataFields.groundtruth_instance_masks: - Optional numpy array of shape [num_boxes, height, width] - with values in {0, 1}. - - Raises: - ValueError: On adding groundtruth for an image more than once. Will - also raise error if instance masks are not in groundtruth - dictionary. - """ - if image_id in self._image_ids: - raise ValueError( - 'Image with id {} already added.'.format(image_id)) - - groundtruth_classes = ( - groundtruth_dict[ - standard_fields.InputDataFields.groundtruth_classes] - - self._label_id_offset) - - groundtruth_masks = None - if self._evaluate_masks: - if (standard_fields.InputDataFields.groundtruth_instance_masks - not in groundtruth_dict): - raise ValueError( - 'Instance masks not in groundtruth dictionary.') - groundtruth_masks = groundtruth_dict[ - standard_fields.InputDataFields.groundtruth_instance_masks] - self._evaluation.add_single_ground_truth_image_info( - image_key=image_id, - groundtruth_boxes=groundtruth_dict[ - standard_fields.InputDataFields.groundtruth_boxes], - groundtruth_class_labels=groundtruth_classes, - groundtruth_masks=groundtruth_masks, - ) - self._image_ids.update([image_id]) - - def add_single_detected_image_info(self, image_id, detections_dict): - """Adds detections for a single image to be used for evaluation. - - Args: - image_id: A unique string/integer identifier for the image. - detections_dict: A dictionary containing - - standard_fields.DetectionResultFields.detection_boxes: float32 - numpy array of shape [num_boxes, 4] containing `num_boxes` - detection boxes of the format [ymin, xmin, ymax, xmax] in - absolute image coordinates. - standard_fields.DetectionResultFields.detection_scores: float32 - numpy array of shape [num_boxes] containing detection - scores for the boxes. - standard_fields.DetectionResultFields.detection_classes: - integer numpy array of shape [num_boxes] containing - 1-indexed detection classes for the boxes. - standard_fields.DetectionResultFields.detection_masks: uint8 - numpy array of shape [num_boxes, height, width] containing - `num_boxes` masks of values ranging between 0 and 1. 
- - Raises: - ValueError: If detection masks are not in detections dictionary. - """ - detection_classes = ( - detections_dict[ - standard_fields.DetectionResultFields.detection_classes] - - self._label_id_offset) - detection_masks = None - if self._evaluate_masks: - if (standard_fields.DetectionResultFields.detection_masks - not in detections_dict): - raise ValueError( - 'Detection masks not in detections dictionary.') - detection_masks = detections_dict[ - standard_fields.DetectionResultFields.detection_masks] - self._evaluation.add_single_detected_image_info( - image_key=image_id, - detected_boxes=detections_dict[ - standard_fields.DetectionResultFields.detection_boxes], - detected_scores=detections_dict[ - standard_fields.DetectionResultFields.detection_scores], - detected_class_labels=detection_classes, - detected_masks=detection_masks, - ) - - @staticmethod - def create_category_index(categories): - """Creates dictionary of COCO compatible categories keyed by category - id. - - Args: - categories: a list of dicts, each of which has the following keys: - 'id': (required) an integer id uniquely identifying this - category. - 'name': (required) string representing category name - e.g., 'cat', 'dog', 'pizza'. - - Returns: - category_index: a dict containing the same entries as categories, - but keyed by the 'id' field of each category. - """ - category_index = {} - for cat in categories: - category_index[cat['id']] = cat - return category_index - - def evaluate(self): - """Compute evaluation result. - - Returns: - A dictionary of metrics with the following fields - - - 1. summary_metrics: - 'Precision/mAP@IOU': mean average - precision at the specified IOU threshold - - 2. per_category_ap: category specific results with keys of the form - 'PerformanceByCategory/mAP@IOU/category' - """ - (per_class_ap, mean_ap, _, _, per_class_corloc, - mean_corloc) = self._evaluation.evaluate() - - metric = f'mAP@{self._matching_iou_threshold}IOU' - pascal_metrics = {self._metric_prefix + metric: mean_ap} - if self._evaluate_corlocs: - pascal_metrics[self._metric_prefix + - 'Precision/meanCorLoc@{}IOU'.format( - self._matching_iou_threshold)] = mean_corloc - category_index = self.create_category_index(self._categories) - for idx in range(per_class_ap.size): - if idx + self._label_id_offset in category_index: - display_name = ( - self._metric_prefix + - 'PerformanceByCategory/AP@{}IOU/{}'.format( - self._matching_iou_threshold, - category_index[idx + self._label_id_offset]['name'], - )) - pascal_metrics[display_name] = per_class_ap[idx] - - # Optionally add CorLoc metrics.classes - if self._evaluate_corlocs: - display_name = ( - self._metric_prefix + - 'PerformanceByCategory/CorLoc@{}IOU/{}'.format( - self._matching_iou_threshold, - category_index[idx + - self._label_id_offset]['name'], - )) - pascal_metrics[display_name] = per_class_corloc[idx] - - return pascal_metrics - - def clear(self): - """Clears the state to prepare for a fresh evaluation.""" - self._evaluation = ObjectDetectionEvaluation( - num_groundtruth_classes=self._num_classes, - matching_iou_threshold=self._matching_iou_threshold, - use_weighted_mean_ap=self._use_weighted_mean_ap, - label_id_offset=self._label_id_offset, - ) - self._image_ids.clear() - - -class PascalDetectionEvaluator(ObjectDetectionEvaluator): - """A class to evaluate detections using PASCAL metrics.""" - - def __init__(self, categories, matching_iou_threshold=0.5): - super(PascalDetectionEvaluator, self).__init__( - categories, - 
matching_iou_threshold=matching_iou_threshold, - evaluate_corlocs=False, - use_weighted_mean_ap=False, - ) - - -ObjectDetectionEvalMetrics = collections.namedtuple( - 'ObjectDetectionEvalMetrics', - [ - 'average_precisions', - 'mean_ap', - 'precisions', - 'recalls', - 'corlocs', - 'mean_corloc', - ], -) - - -class ObjectDetectionEvaluation: - """Internal implementation of Pascal object detection metrics.""" - - def __init__(self, - num_groundtruth_classes, - matching_iou_threshold=0.5, - nms_iou_threshold=1.0, - nms_max_output_boxes=10000, - use_weighted_mean_ap=False, - label_id_offset=0): - if num_groundtruth_classes < 1: - raise ValueError( - 'Need at least 1 groundtruth class for evaluation.') - - self.per_image_eval = per_image_evaluation.PerImageEvaluation( - num_groundtruth_classes=num_groundtruth_classes, - matching_iou_threshold=matching_iou_threshold, - ) - self.num_class = num_groundtruth_classes - self.use_weighted_mean_ap = use_weighted_mean_ap - self.label_id_offset = label_id_offset - - self.groundtruth_boxes = {} - self.groundtruth_class_labels = {} - self.groundtruth_masks = {} - self.num_gt_instances_per_class = np.zeros(self.num_class, dtype=int) - self.num_gt_imgs_per_class = np.zeros(self.num_class, dtype=int) - - self._initialize_detections() - - def _initialize_detections(self): - self.detection_keys = set() - self.scores_per_class = [[] for _ in range(self.num_class)] - self.tp_fp_labels_per_class = [[] for _ in range(self.num_class)] - self.num_images_correctly_detected_per_class = np.zeros(self.num_class) - self.average_precision_per_class = np.empty( - self.num_class, dtype=float) - self.average_precision_per_class.fill(np.nan) - self.precisions_per_class = [] - self.recalls_per_class = [] - self.corloc_per_class = np.ones(self.num_class, dtype=float) - - def clear_detections(self): - self._initialize_detections() - - def add_single_ground_truth_image_info(self, - image_key, - groundtruth_boxes, - groundtruth_class_labels, - groundtruth_masks=None): - """Adds groundtruth for a single image to be used for evaluation. - - Args: - image_key: A unique string/integer identifier for the image. - groundtruth_boxes: float32 numpy array of shape [num_boxes, 4] - containing `num_boxes` groundtruth boxes of the format - [ymin, xmin, ymax, xmax] in absolute image coordinates. - groundtruth_class_labels: integer numpy array of shape [num_boxes] - containing 0-indexed groundtruth classes for the boxes. - groundtruth_masks: uint8 numpy array of shape - [num_boxes, height, width] containing `num_boxes` groundtruth - masks. The mask values range from 0 to 1. - """ - if image_key in self.groundtruth_boxes: - warnings.warn(('image %s has already been added to the ground ' - 'truth database.'), image_key) - return - - self.groundtruth_boxes[image_key] = groundtruth_boxes - self.groundtruth_class_labels[image_key] = groundtruth_class_labels - self.groundtruth_masks[image_key] = groundtruth_masks - - self._update_ground_truth_statistics(groundtruth_class_labels) - - def add_single_detected_image_info(self, - image_key, - detected_boxes, - detected_scores, - detected_class_labels, - detected_masks=None): - """Adds detections for a single image to be used for evaluation. - - Args: - image_key: A unique string/integer identifier for the image. - detected_boxes: float32 numpy array of shape [num_boxes, 4] - containing `num_boxes` detection boxes of the format - [ymin, xmin, ymax, xmax] in absolute image coordinates. 
- detected_scores: float32 numpy array of shape [num_boxes] - containing detection scores for the boxes. - detected_class_labels: integer numpy array of shape [num_boxes] - containing 0-indexed detection classes for the boxes. - detected_masks: np.uint8 numpy array of shape - [num_boxes, height, width] containing `num_boxes` detection - masks with values ranging between 0 and 1. - - Raises: - ValueError: if the number of boxes, scores and class labels differ - in length. - """ - if len(detected_boxes) != len(detected_scores) or len( - detected_boxes) != len(detected_class_labels): - raise ValueError( - 'detected_boxes, detected_scores and ' - 'detected_class_labels should all have same lengths. Got' - '[%d, %d, %d]' % len(detected_boxes), - len(detected_scores), - len(detected_class_labels), - ) - - if image_key in self.detection_keys: - warnings.warn(('image %s has already been added to the ground ' - 'truth database.'), image_key) - return - - self.detection_keys.add(image_key) - if image_key in self.groundtruth_boxes: - groundtruth_boxes = self.groundtruth_boxes[image_key] - groundtruth_class_labels = self.groundtruth_class_labels[image_key] - # Masks are popped instead of look up. The reason is that we do not - # want to keep all masks in memory which can cause memory overflow. - groundtruth_masks = self.groundtruth_masks.pop(image_key) - else: - groundtruth_boxes = np.empty(shape=[0, 4], dtype=float) - groundtruth_class_labels = np.array([], dtype=int) - if detected_masks is None: - groundtruth_masks = None - else: - groundtruth_masks = np.empty(shape=[0, 1, 1], dtype=float) - ( - scores, - tp_fp_labels, - ) = self.per_image_eval.compute_object_detection_metrics( - detected_boxes=detected_boxes, - detected_scores=detected_scores, - detected_class_labels=detected_class_labels, - groundtruth_boxes=groundtruth_boxes, - groundtruth_class_labels=groundtruth_class_labels, - detected_masks=detected_masks, - groundtruth_masks=groundtruth_masks, - ) - - for i in range(self.num_class): - if scores[i].shape[0] > 0: - self.scores_per_class[i].append(scores[i]) - self.tp_fp_labels_per_class[i].append(tp_fp_labels[i]) - - def _update_ground_truth_statistics(self, groundtruth_class_labels): - """Update grouth truth statitistics. - - Args: - groundtruth_class_labels: An integer numpy array of length M, - representing M class labels of object instances in ground truth - """ - count = defaultdict(lambda: 0) - for label in groundtruth_class_labels: - count[label] += 1 - for k in count: - self.num_gt_instances_per_class[k] += count[k] - self.num_gt_imgs_per_class[k] += 1 - - def evaluate(self): - """Compute evaluation result. - - Returns: - A named tuple with the following fields - - average_precision: float numpy array of average precision for - each class. 
- mean_ap: mean average precision of all classes, float scalar - precisions: List of precisions, each precision is a float numpy - array - recalls: List of recalls, each recall is a float numpy array - corloc: numpy float array - mean_corloc: Mean CorLoc score for each class, float scalar - """ - if (self.num_gt_instances_per_class == 0).any(): - logging.info( - 'The following classes have no ground truth examples: %s', - np.squeeze(np.argwhere(self.num_gt_instances_per_class == 0)) + - self.label_id_offset) - - if self.use_weighted_mean_ap: - all_scores = np.array([], dtype=float) - all_tp_fp_labels = np.array([], dtype=bool) - - for class_index in range(self.num_class): - if self.num_gt_instances_per_class[class_index] == 0: - continue - if not self.scores_per_class[class_index]: - scores = np.array([], dtype=float) - tp_fp_labels = np.array([], dtype=bool) - else: - scores = np.concatenate(self.scores_per_class[class_index]) - tp_fp_labels = np.concatenate( - self.tp_fp_labels_per_class[class_index]) - if self.use_weighted_mean_ap: - all_scores = np.append(all_scores, scores) - all_tp_fp_labels = np.append(all_tp_fp_labels, tp_fp_labels) - precision, recall = metrics.compute_precision_recall( - scores, tp_fp_labels, - self.num_gt_instances_per_class[class_index]) - self.precisions_per_class.append(precision) - self.recalls_per_class.append(recall) - average_precision = metrics.compute_average_precision( - precision, recall) - self.average_precision_per_class[class_index] = average_precision - - self.corloc_per_class = metrics.compute_cor_loc( - self.num_gt_imgs_per_class, - self.num_images_correctly_detected_per_class) - - if self.use_weighted_mean_ap: - num_gt_instances = np.sum(self.num_gt_instances_per_class) - precision, recall = metrics.compute_precision_recall( - all_scores, all_tp_fp_labels, num_gt_instances) - mean_ap = metrics.compute_average_precision(precision, recall) - else: - mean_ap = np.nanmean(self.average_precision_per_class) - mean_corloc = np.nanmean(self.corloc_per_class) - return ObjectDetectionEvalMetrics( - self.average_precision_per_class, - mean_ap, - self.precisions_per_class, - self.recalls_per_class, - self.corloc_per_class, - mean_corloc, - ) diff --git a/mmaction/evaluation/functional/ava_evaluation/per_image_evaluation.py b/mmaction/evaluation/functional/ava_evaluation/per_image_evaluation.py deleted file mode 100644 index 9a6e0d9e40..0000000000 --- a/mmaction/evaluation/functional/ava_evaluation/per_image_evaluation.py +++ /dev/null @@ -1,358 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================= -"""Evaluate Object Detection result on a single image. - -Annotate each detected result as true positives or false positive according to -a predefined IOU ratio. Non Maximum Suppression is used by default. Multi class -detection is supported by default. 
Based on the settings, per image evaluation -is either performed on boxes or on object masks. -""" - -import numpy as np - -from . import np_box_list, np_box_ops - - -class PerImageEvaluation: - """Evaluate detection result of a single image.""" - - def __init__(self, num_groundtruth_classes, matching_iou_threshold=0.5): - """Initialized PerImageEvaluation by evaluation parameters. - - Args: - num_groundtruth_classes: Number of ground truth object classes - matching_iou_threshold: A ratio of area intersection to union, - which is the threshold to consider whether a detection is true - positive or not - """ - self.matching_iou_threshold = matching_iou_threshold - self.num_groundtruth_classes = num_groundtruth_classes - - def compute_object_detection_metrics(self, - detected_boxes, - detected_scores, - detected_class_labels, - groundtruth_boxes, - groundtruth_class_labels, - detected_masks=None, - groundtruth_masks=None): - """Evaluates detections as being tp, fp or ignored from a single image. - - The evaluation is done in two stages: - 1. All detections are matched to non group-of boxes. - - Args: - detected_boxes: A float numpy array of shape [N, 4], representing N - regions of detected object regions. - Each row is of the format [y_min, x_min, y_max, x_max] - detected_scores: A float numpy array of shape [N, 1], representing - the confidence scores of the detected N object instances. - detected_class_labels: A integer numpy array of shape [N, 1], - repreneting the class labels of the detected N object - instances. - groundtruth_boxes: A float numpy array of shape [M, 4], - representing M regions of object instances in ground truth - groundtruth_class_labels: An integer numpy array of shape [M, 1], - representing M class labels of object instances in ground truth - detected_masks: (optional) A uint8 numpy array of shape - [N, height, width]. If not None, the metrics will be computed - based on masks. - groundtruth_masks: (optional) A uint8 numpy array of shape - [M, height, width]. - - Returns: - scores: A list of C float numpy arrays. Each numpy array is of - shape [K, 1], representing K scores detected with object class - label c - tp_fp_labels: A list of C boolean numpy arrays. Each numpy array - is of shape [K, 1], representing K True/False positive label of - object instances detected with class label c - """ - ( - detected_boxes, - detected_scores, - detected_class_labels, - detected_masks, - ) = self._remove_invalid_boxes( - detected_boxes, - detected_scores, - detected_class_labels, - detected_masks, - ) - scores, tp_fp_labels = self._compute_tp_fp( - detected_boxes=detected_boxes, - detected_scores=detected_scores, - detected_class_labels=detected_class_labels, - groundtruth_boxes=groundtruth_boxes, - groundtruth_class_labels=groundtruth_class_labels, - detected_masks=detected_masks, - groundtruth_masks=groundtruth_masks, - ) - - return scores, tp_fp_labels - - def _compute_tp_fp(self, - detected_boxes, - detected_scores, - detected_class_labels, - groundtruth_boxes, - groundtruth_class_labels, - detected_masks=None, - groundtruth_masks=None): - """Labels true/false positives of detections of an image across all - classes. - - Args: - detected_boxes: A float numpy array of shape [N, 4], representing N - regions of detected object regions. - Each row is of the format [y_min, x_min, y_max, x_max] - detected_scores: A float numpy array of shape [N, 1], representing - the confidence scores of the detected N object instances. 
- detected_class_labels: A integer numpy array of shape [N, 1], - repreneting the class labels of the detected N object - instances. - groundtruth_boxes: A float numpy array of shape [M, 4], - representing M regions of object instances in ground truth - groundtruth_class_labels: An integer numpy array of shape [M, 1], - representing M class labels of object instances in ground truth - detected_masks: (optional) A np.uint8 numpy array of shape - [N, height, width]. If not None, the scores will be computed - based on masks. - groundtruth_masks: (optional) A np.uint8 numpy array of shape - [M, height, width]. - - Returns: - result_scores: A list of float numpy arrays. Each numpy array is of - shape [K, 1], representing K scores detected with object class - label c - result_tp_fp_labels: A list of boolean numpy array. Each numpy - array is of shape [K, 1], representing K True/False positive - label of object instances detected with class label c - - Raises: - ValueError: If detected masks is not None but groundtruth masks are - None, or the other way around. - """ - if detected_masks is not None and groundtruth_masks is None: - raise ValueError( - 'Detected masks is available but groundtruth masks is not.') - if detected_masks is None and groundtruth_masks is not None: - raise ValueError( - 'Groundtruth masks is available but detected masks is not.') - - result_scores = [] - result_tp_fp_labels = [] - for i in range(self.num_groundtruth_classes): - (gt_boxes_at_ith_class, gt_masks_at_ith_class, - detected_boxes_at_ith_class, detected_scores_at_ith_class, - detected_masks_at_ith_class) = self._get_ith_class_arrays( - detected_boxes, detected_scores, detected_masks, - detected_class_labels, groundtruth_boxes, groundtruth_masks, - groundtruth_class_labels, i) - scores, tp_fp_labels = self._compute_tp_fp_for_single_class( - detected_boxes=detected_boxes_at_ith_class, - detected_scores=detected_scores_at_ith_class, - groundtruth_boxes=gt_boxes_at_ith_class, - detected_masks=detected_masks_at_ith_class, - groundtruth_masks=gt_masks_at_ith_class, - ) - result_scores.append(scores) - result_tp_fp_labels.append(tp_fp_labels) - return result_scores, result_tp_fp_labels - - @staticmethod - def _get_overlaps_and_scores_box_mode(detected_boxes, detected_scores, - groundtruth_boxes): - """Computes overlaps and scores between detected and groudntruth boxes. - - Args: - detected_boxes: A numpy array of shape [N, 4] representing detected - box coordinates - detected_scores: A 1-d numpy array of length N representing - classification score - groundtruth_boxes: A numpy array of shape [M, 4] representing - ground truth box coordinates - - Returns: - iou: A float numpy array of size [num_detected_boxes, - num_gt_boxes]. If gt_non_group_of_boxlist.num_boxes() == 0 it - will be None. - ioa: A float numpy array of size [num_detected_boxes, - num_gt_boxes]. If gt_group_of_boxlist.num_boxes() == 0 it will - be None. - scores: The score of the detected boxlist. - num_boxes: Number of non-maximum suppressed detected boxes. 
- """ - detected_boxlist = np_box_list.BoxList(detected_boxes) - detected_boxlist.add_field('scores', detected_scores) - gt_non_group_of_boxlist = np_box_list.BoxList(groundtruth_boxes) - - iou = np_box_ops.iou(detected_boxlist.get(), - gt_non_group_of_boxlist.get()) - scores = detected_boxlist.get_field('scores') - num_boxes = detected_boxlist.num_boxes() - return iou, None, scores, num_boxes - - def _compute_tp_fp_for_single_class(self, - detected_boxes, - detected_scores, - groundtruth_boxes, - detected_masks=None, - groundtruth_masks=None): - """Labels boxes detected with the same class from the same image as - tp/fp. - - Args: - detected_boxes: A numpy array of shape [N, 4] representing detected - box coordinates - detected_scores: A 1-d numpy array of length N representing - classification score - groundtruth_boxes: A numpy array of shape [M, 4] representing - groundtruth box coordinates - detected_masks: (optional) A uint8 numpy array of shape - [N, height, width]. If not None, the scores will be computed - based on masks. - groundtruth_masks: (optional) A uint8 numpy array of shape - [M, height, width]. - - Returns: - Two arrays of the same size, containing all boxes that were - evaluated as being true positives or false positives. - - scores: A numpy array representing the detection scores. - tp_fp_labels: a boolean numpy array indicating whether a detection - is a true positive. - """ - if detected_boxes.size == 0: - return np.array([], dtype=float), np.array([], dtype=bool) - - (iou, _, scores, - num_detected_boxes) = self._get_overlaps_and_scores_box_mode( - detected_boxes=detected_boxes, - detected_scores=detected_scores, - groundtruth_boxes=groundtruth_boxes) - - if groundtruth_boxes.size == 0: - return scores, np.zeros(num_detected_boxes, dtype=bool) - - tp_fp_labels = np.zeros(num_detected_boxes, dtype=bool) - - # The evaluation is done in two stages: - # 1. All detections are matched to non group-of boxes. - # 2. Detections that are determined as false positives are matched - # against group-of boxes and ignored if matched. - - # Tp-fp evaluation for non-group of boxes (if any). - if iou.shape[1] > 0: - max_overlap_gt_ids = np.argmax(iou, axis=1) - is_gt_box_detected = np.zeros(iou.shape[1], dtype=bool) - for i in range(num_detected_boxes): - gt_id = max_overlap_gt_ids[i] - if iou[i, gt_id] >= self.matching_iou_threshold: - if not is_gt_box_detected[gt_id]: - tp_fp_labels[i] = True - is_gt_box_detected[gt_id] = True - - return scores, tp_fp_labels - - @staticmethod - def _get_ith_class_arrays(detected_boxes, detected_scores, detected_masks, - detected_class_labels, groundtruth_boxes, - groundtruth_masks, groundtruth_class_labels, - class_index): - """Returns numpy arrays belonging to class with index `class_index`. - - Args: - detected_boxes: A numpy array containing detected boxes. - detected_scores: A numpy array containing detected scores. - detected_masks: A numpy array containing detected masks. - detected_class_labels: A numpy array containing detected class - labels. - groundtruth_boxes: A numpy array containing groundtruth boxes. - groundtruth_masks: A numpy array containing groundtruth masks. - groundtruth_class_labels: A numpy array containing groundtruth - class labels. - class_index: An integer index. - - Returns: - gt_boxes_at_ith_class: A numpy array containing groundtruth boxes - labeled as ith class. - gt_masks_at_ith_class: A numpy array containing groundtruth masks - labeled as ith class. 
- detected_boxes_at_ith_class: A numpy array containing detected - boxes corresponding to the ith class. - detected_scores_at_ith_class: A numpy array containing detected - scores corresponding to the ith class. - detected_masks_at_ith_class: A numpy array containing detected - masks corresponding to the ith class. - """ - selected_groundtruth = groundtruth_class_labels == class_index - gt_boxes_at_ith_class = groundtruth_boxes[selected_groundtruth] - if groundtruth_masks is not None: - gt_masks_at_ith_class = groundtruth_masks[selected_groundtruth] - else: - gt_masks_at_ith_class = None - selected_detections = detected_class_labels == class_index - detected_boxes_at_ith_class = detected_boxes[selected_detections] - detected_scores_at_ith_class = detected_scores[selected_detections] - if detected_masks is not None: - detected_masks_at_ith_class = detected_masks[selected_detections] - else: - detected_masks_at_ith_class = None - return (gt_boxes_at_ith_class, gt_masks_at_ith_class, - detected_boxes_at_ith_class, detected_scores_at_ith_class, - detected_masks_at_ith_class) - - @staticmethod - def _remove_invalid_boxes(detected_boxes, - detected_scores, - detected_class_labels, - detected_masks=None): - """Removes entries with invalid boxes. - - A box is invalid if either its xmax is smaller than its xmin, or its - ymax is smaller than its ymin. - - Args: - detected_boxes: A float numpy array of size [num_boxes, 4] - containing box coordinates in [ymin, xmin, ymax, xmax] format. - detected_scores: A float numpy array of size [num_boxes]. - detected_class_labels: A int32 numpy array of size [num_boxes]. - detected_masks: A uint8 numpy array of size - [num_boxes, height, width]. - - Returns: - valid_detected_boxes: A float numpy array of size - [num_valid_boxes, 4] containing box coordinates in - [ymin, xmin, ymax, xmax] format. - valid_detected_scores: A float numpy array of size - [num_valid_boxes]. - valid_detected_class_labels: A int32 numpy array of size - [num_valid_boxes]. - valid_detected_masks: A uint8 numpy array of size - [num_valid_boxes, height, width]. - """ - valid_indices = np.logical_and( - detected_boxes[:, 0] < detected_boxes[:, 2], - detected_boxes[:, 1] < detected_boxes[:, 3]) - detected_boxes = detected_boxes[valid_indices] - detected_scores = detected_scores[valid_indices] - detected_class_labels = detected_class_labels[valid_indices] - if detected_masks is not None: - detected_masks = detected_masks[valid_indices] - return [ - detected_boxes, detected_scores, detected_class_labels, - detected_masks - ] diff --git a/mmaction/evaluation/functional/ava_evaluation/standard_fields.py b/mmaction/evaluation/functional/ava_evaluation/standard_fields.py deleted file mode 100644 index 8edf46d081..0000000000 --- a/mmaction/evaluation/functional/ava_evaluation/standard_fields.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================= -"""Contains classes specifying naming conventions used for object detection. - -Specifies: - InputDataFields: standard fields used by reader/preprocessor/batcher. - DetectionResultFields: standard fields returned by object detector. -""" - - -class InputDataFields: - """Names for the input tensors. - - Holds the standard data field names to use for identifying input tensors. - This should be used by the decoder to identify keys for the returned - tensor_dict containing input tensors. And it should be used by the model to - identify the tensors it needs. - - Attributes: - image: image. - original_image: image in the original input size. - key: unique key corresponding to image. - source_id: source of the original image. - filename: original filename of the dataset (without common path). - groundtruth_image_classes: image-level class labels. - groundtruth_boxes: coordinates of the ground truth boxes in the image. - groundtruth_classes: box-level class labels. - groundtruth_label_types: box-level label types (e.g. explicit - negative). - groundtruth_is_crowd: [DEPRECATED, use groundtruth_group_of instead] - is the groundtruth a single object or a crowd. - groundtruth_area: area of a groundtruth segment. - groundtruth_difficult: is a `difficult` object - groundtruth_group_of: is a `group_of` objects, e.g. multiple objects of - the same class, forming a connected group, where instances are - heavily occluding each other. - proposal_boxes: coordinates of object proposal boxes. - proposal_objectness: objectness score of each proposal. - groundtruth_instance_masks: ground truth instance masks. - groundtruth_instance_boundaries: ground truth instance boundaries. - groundtruth_instance_classes: instance mask-level class labels. - groundtruth_keypoints: ground truth keypoints. - groundtruth_keypoint_visibilities: ground truth keypoint visibilities. - groundtruth_label_scores: groundtruth label scores. - groundtruth_weights: groundtruth weight factor for bounding boxes. - num_groundtruth_boxes: number of groundtruth boxes. - true_image_shapes: true shapes of images in the resized images, as - resized images can be padded with zeros. - """ - - image = 'image' - original_image = 'original_image' - key = 'key' - source_id = 'source_id' - filename = 'filename' - groundtruth_image_classes = 'groundtruth_image_classes' - groundtruth_boxes = 'groundtruth_boxes' - groundtruth_classes = 'groundtruth_classes' - groundtruth_label_types = 'groundtruth_label_types' - groundtruth_is_crowd = 'groundtruth_is_crowd' - groundtruth_area = 'groundtruth_area' - groundtruth_difficult = 'groundtruth_difficult' - groundtruth_group_of = 'groundtruth_group_of' - proposal_boxes = 'proposal_boxes' - proposal_objectness = 'proposal_objectness' - groundtruth_instance_masks = 'groundtruth_instance_masks' - groundtruth_instance_boundaries = 'groundtruth_instance_boundaries' - groundtruth_instance_classes = 'groundtruth_instance_classes' - groundtruth_keypoints = 'groundtruth_keypoints' - groundtruth_keypoint_visibilities = 'groundtruth_keypoint_visibilities' - groundtruth_label_scores = 'groundtruth_label_scores' - groundtruth_weights = 'groundtruth_weights' - num_groundtruth_boxes = 'num_groundtruth_boxes' - true_image_shape = 'true_image_shape' - - -class DetectionResultFields: - """Naming conventions for storing the output of the detector. - - Attributes: - source_id: source of the original image. - key: unique key corresponding to image. 
- detection_boxes: coordinates of the detection boxes in the image. - detection_scores: detection scores for the detection boxes in the - image. - detection_classes: detection-level class labels. - detection_masks: contains a segmentation mask for each detection box. - detection_boundaries: contains an object boundary for each detection - box. - detection_keypoints: contains detection keypoints for each detection - box. - num_detections: number of detections in the batch. - """ - - source_id = 'source_id' - key = 'key' - detection_boxes = 'detection_boxes' - detection_scores = 'detection_scores' - detection_classes = 'detection_classes' - detection_masks = 'detection_masks' - detection_boundaries = 'detection_boundaries' - detection_keypoints = 'detection_keypoints' - num_detections = 'num_detections' diff --git a/mmaction/evaluation/functional/ava_utils.py b/mmaction/evaluation/functional/ava_utils.py index cb739a4a9b..c15737632c 100644 --- a/mmaction/evaluation/functional/ava_utils.py +++ b/mmaction/evaluation/functional/ava_utils.py @@ -3,14 +3,13 @@ # https://github.com/activitynet/ActivityNet/blob/master/ # Evaluation/get_ava_performance.py. Some unused codes are removed. import csv -import logging +import multiprocessing import time from collections import defaultdict import numpy as np -from .ava_evaluation import object_detection_evaluation as det_eval -from .ava_evaluation import standard_fields +from .ava_evaluation import metrics, np_box_list, np_box_ops def det2csv(results, custom_classes): @@ -42,7 +41,7 @@ def results2csv(results, out_file, custom_classes=None): # save space for float def to_str(item): if isinstance(item, float): - return f'{item:.3f}' + return f'{item:.4f}' return str(item) with open(out_file, 'w') as f: @@ -80,7 +79,6 @@ def read_csv(csv_file, class_whitelist=None): of score values labels, matching the corresponding label in `labels`. If scores are not provided in the csv, then they will default to 1.0. 
""" - start = time.time() entries = defaultdict(list) boxes = defaultdict(list) labels = defaultdict(list) @@ -107,7 +105,6 @@ def read_csv(csv_file, class_whitelist=None): labels[image_key] = [x[1] for x in entry] scores[image_key] = [x[0] for x in entry] - print_time('read file ' + csv_file.name, start) return boxes, labels, scores @@ -157,6 +154,51 @@ def read_labelmap(labelmap_file): return labelmap, class_ids +def get_overlaps_and_scores_box_mode(detected_boxes, detected_scores, + groundtruth_boxes): + + detected_boxlist = np_box_list.BoxList(detected_boxes) + detected_boxlist.add_field('scores', detected_scores) + gt_non_group_of_boxlist = np_box_list.BoxList(groundtruth_boxes) + + iou = np_box_ops.iou(detected_boxlist.get(), gt_non_group_of_boxlist.get()) + scores = detected_boxlist.get_field('scores') + num_boxes = detected_boxlist.num_boxes() + return iou, scores, num_boxes + + +def tpfp_single(tup, threshold=0.5): + gt_bboxes, gt_labels, bboxes, labels, scores = tup + ret_scores, ret_tp_fp_labels = dict(), dict() + all_labels = list(set(labels)) + for label in all_labels: + gt_bbox = np.array( + [x for x, y in zip(gt_bboxes, gt_labels) if y == label], + dtype=np.float32).reshape(-1, 4) + bbox = np.array([x for x, y in zip(bboxes, labels) if y == label], + dtype=np.float32).reshape(-1, 4) + score = np.array([x for x, y in zip(scores, labels) if y == label], + dtype=np.float32).reshape(-1) + iou, score, num_boxes = get_overlaps_and_scores_box_mode( + bbox, score, gt_bbox) + if gt_bbox.size == 0: + ret_scores[label] = score + ret_tp_fp_labels[label] = np.zeros(num_boxes, dtype=bool) + continue + tp_fp_labels = np.zeros(num_boxes, dtype=bool) + if iou.shape[1] > 0: + max_overlap_gt_ids = np.argmax(iou, axis=1) + is_gt_box_detected = np.zeros(iou.shape[1], dtype=bool) + for i in range(num_boxes): + gt_id = max_overlap_gt_ids[i] + if iou[i, gt_id] >= threshold: + if not is_gt_box_detected[gt_id]: + tp_fp_labels[i] = True + is_gt_box_detected[gt_id] = True + ret_scores[label], ret_tp_fp_labels[label] = score, tp_fp_labels + return ret_scores, ret_tp_fp_labels + + # Seems there is at most 100 detections for each image def ava_eval(result_file, result_type, @@ -164,10 +206,11 @@ def ava_eval(result_file, ann_file, exclude_file, verbose=True, + ignore_empty_frames=True, custom_classes=None): """Perform ava evaluation.""" - assert result_type in ['mAP'] + assert result_type in ['mAP'] start = time.time() categories, class_whitelist = read_labelmap(open(label_file)) if custom_classes is not None: @@ -177,9 +220,9 @@ def ava_eval(result_file, categories = [cat for cat in categories if cat['id'] in custom_classes] # loading gt, do not need gt score - gt_boxes, gt_labels, _ = read_csv(open(ann_file), class_whitelist) + gt_bboxes, gt_labels, _ = read_csv(open(ann_file), class_whitelist) if verbose: - print_time('Reading detection results', start) + print_time('Reading GT results', start) if exclude_file is not None: excluded_keys = read_exclusions(open(exclude_file)) @@ -189,54 +232,69 @@ def ava_eval(result_file, start = time.time() boxes, labels, scores = read_csv(open(result_file), class_whitelist) if verbose: - print_time('Reading detection results', start) - - # Evaluation for mAP - pascal_evaluator = det_eval.PascalDetectionEvaluator(categories) + print_time('Reading Detection results', start) start = time.time() - for image_key in gt_boxes: - if verbose and image_key in excluded_keys: - logging.info( - 'Found excluded timestamp in detections: %s.' 
- 'It will be ignored.', image_key) - continue - pascal_evaluator.add_single_ground_truth_image_info( - image_key, { - standard_fields.InputDataFields.groundtruth_boxes: - np.array(gt_boxes[image_key], dtype=float), - standard_fields.InputDataFields.groundtruth_classes: - np.array(gt_labels[image_key], dtype=int) - }) + all_gt_labels = np.concatenate(list(gt_labels.values())) + gt_count = {k: np.sum(all_gt_labels == k) for k in class_whitelist} + + pool = multiprocessing.Pool(32) + if ignore_empty_frames: + tups = [(gt_bboxes[k], gt_labels[k], boxes[k], labels[k], scores[k]) + for k in gt_bboxes if k not in excluded_keys] + else: + tups = [(gt_bboxes.get(k, np.zeros((0, 4), dtype=np.float32)), + gt_labels.get(k, []), boxes[k], labels[k], scores[k]) + for k in boxes if k not in excluded_keys] + rets = pool.map(tpfp_single, tups) + if verbose: - print_time('Convert groundtruth', start) + print_time('Calculating TP/FP', start) start = time.time() - for image_key in boxes: - if verbose and image_key in excluded_keys: - logging.info( - 'Found excluded timestamp in detections: %s.' - 'It will be ignored.', image_key) - continue - pascal_evaluator.add_single_detected_image_info( - image_key, { - standard_fields.DetectionResultFields.detection_boxes: - np.array(boxes[image_key], dtype=float), - standard_fields.DetectionResultFields.detection_classes: - np.array(labels[image_key], dtype=int), - standard_fields.DetectionResultFields.detection_scores: - np.array(scores[image_key], dtype=float) - }) + scores, tpfps = defaultdict(list), defaultdict(list) + for score, tpfp in rets: + for k in score: + scores[k].append(score[k]) + tpfps[k].append(tpfp[k]) + + cls_AP = [] + for k in scores: + scores[k] = np.concatenate(scores[k]) + tpfps[k] = np.concatenate(tpfps[k]) + precision, recall = metrics.compute_precision_recall( + scores[k], tpfps[k], gt_count[k]) + ap = metrics.compute_average_precision(precision, recall) + class_name = [x['name'] for x in categories if x['id'] == k] + assert len(class_name) == 1 + class_name = class_name[0] + cls_AP.append((k, class_name, ap)) if verbose: - print_time('convert detections', start) + print_time('Run Evaluator', start) + + print('Per-class results: ', flush=True) + for k, class_name, ap in cls_AP: + print(f'Index: {k}, Action: {class_name}: AP: {ap:.4f};', flush=True) + + overall = np.nanmean([x[2] for x in cls_AP]) + person_movement = np.nanmean([x[2] for x in cls_AP if x[0] <= 14]) + object_manipulation = np.nanmean([x[2] for x in cls_AP if 14 < x[0] < 64]) + person_interaction = np.nanmean([x[2] for x in cls_AP if 64 <= x[0]]) + + print('Overall Results: ', flush=True) + print(f'Overall mAP: {overall:.4f}', flush=True) + print(f'Person Movement mAP: {person_movement:.4f}', flush=True) + print(f'Object Manipulation mAP: {object_manipulation:.4f}', flush=True) + print(f'Person Interaction mAP: {person_interaction:.4f}', flush=True) + + results = {} + results['overall'] = overall + results['person_movement'] = person_movement + results['object_manipulation'] = object_manipulation + results['person_interaction'] = person_interaction - start = time.time() - metrics = pascal_evaluator.evaluate() if verbose: - print_time('run_evaluator', start) - for display_name in metrics: - print(f'{display_name}=\t{metrics[display_name]}') - return { - display_name: metrics[display_name] - for display_name in metrics if 'ByCategory' not in display_name - } + for k, class_name, ap in cls_AP: + print(f'Class {class_name} AP: {ap:.4f}', flush=True) + + return results diff --git 
a/mmaction/evaluation/metrics/ava_metric.py b/mmaction/evaluation/metrics/ava_metric.py index 66e8fdcc4a..76cc83e6c5 100644 --- a/mmaction/evaluation/metrics/ava_metric.py +++ b/mmaction/evaluation/metrics/ava_metric.py @@ -81,6 +81,7 @@ def compute_metrics(self, results: list) -> dict: self.label_file, self.ann_file, self.exclude_file, + ignore_empty_frames=True, custom_classes=self.custom_classes) os.remove(temp_file) From d31224809e3df0b54170a0e60309369dbe9e7953 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 6 Apr 2023 14:46:56 +0800 Subject: [PATCH 29/36] [Fix] fix flip config of sthsth dataset (#2247) --- configs/recognition/tpn/README.md | 2 +- configs/recognition/tpn/metafile.yml | 6 ++-- ...retrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py | 4 ++- configs/recognition/tsm/README.md | 10 +++--- configs/recognition/tsm/metafile.yml | 34 +++++++++---------- ...etrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py | 2 +- ...etrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py | 5 +-- ...retrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py | 5 +-- configs/recognition/tsn/README.md | 6 ++-- configs/recognition/tsn/metafile.yml | 12 +++---- ...etrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py | 3 +- ...retrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py | 3 +- 12 files changed, 49 insertions(+), 43 deletions(-) diff --git a/configs/recognition/tpn/README.md b/configs/recognition/tpn/README.md index 972dbcbc7b..20a488ccb1 100644 --- a/configs/recognition/tpn/README.md +++ b/configs/recognition/tpn/README.md @@ -29,7 +29,7 @@ Visual tempo characterizes the dynamics and the temporal scale of an action. Mod | frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | inference time(video/s) | gpu_mem(M) | config | ckpt | log | | :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :----------------: | :----------------: | :--------------: | :---------------------: | :--------: | :---------------: | :-------------: | :------------: | -| 1x1x8 | height 100 | 8x6 | ResNet50 | TSM | 48.98 | 78.91 | x | x | 8 clips x 3 crop | x | 8828 | [config](/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb_20220913-d2f5c300.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.log) | +| 1x1x8 | height 100 | 8x6 | ResNet50 | TSM | 51.87 | 79.67 | x | x | 8 clips x 3 crop | x | 8828 | [config](/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb_20230221-940a3615.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.log) | :::{note} diff --git a/configs/recognition/tpn/metafile.yml b/configs/recognition/tpn/metafile.yml index 702da581e0..ce953f2e89 100644 --- a/configs/recognition/tpn/metafile.yml +++ b/configs/recognition/tpn/metafile.yml @@ -66,8 +66,8 @@ Models: Results: - Dataset: SthV1 
Metrics: - Top 1 Accuracy: 48.98 - Top 5 Accuracy: 78.91 + Top 1 Accuracy: 51.87 + Top 5 Accuracy: 79.67 Task: Action Recognition Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb_20220913-d2f5c300.pth + Weights: (https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb_20230221-940a3615.pth diff --git a/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py b/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py index d833687d6a..b614d725f7 100644 --- a/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py +++ b/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py @@ -8,12 +8,14 @@ ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt' ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt' ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt' + +sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52} train_pipeline = [ dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), dict(type='RawFrameDecode'), dict(type='RandomResizedCrop'), dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv1_flip_label_map), dict(type='ColorJitter'), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') diff --git a/configs/recognition/tsm/README.md b/configs/recognition/tsm/README.md index ca490117c3..5e5162de83 100644 --- a/configs/recognition/tsm/README.md +++ b/configs/recognition/tsm/README.md @@ -32,11 +32,11 @@ The explosive growth in video streaming gives rise to challenges on performing v ### Something-something V2 -| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | -| :---------------------: | :--------: | :--: | :-------: | :------: | :------: | :------: | :----------------: | :----: | :----: | :---------------------------------: | :-------------------------------: | :------------------------------: | -| 1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 60.20 | 86.13 | 8 clips x 10 crop | 32.88G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20221122-446d261a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log) | -| 1x1x16 | 224x224 | 8 | ResNet50 | ImageNet | 62.46 | 87.75 | 16 clips x 10 crop | 65.75G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20221122-b1fb8264.pth) | 
[log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.log) | -| 1x1x8 | 224x224 | 8 | ResNet101 | ImageNet | 60.49 | 85.99 | 8 clips x 10 crop | 62.66G | 42.86M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20221122-cb2cc64e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.log) | +| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :---------------------: | :--------: | :--: | :-------: | :------: | :------: | :------: | :---------------: | :----: | :----: | :---------------------------------: | :-------------------------------: | :-------------------------------: | +| 1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 62.72 | 87.70 | 8 clips x 3 crop | 32.88G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20230317-be0fc26e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log) | +| 1x1x16 | 224x224 | 8 | ResNet50 | ImageNet | 64.16 | 88.61 | 16 clips x 3 crop | 65.75G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20230317-ec6696ad.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.log) | +| 1x1x8 | 224x224 | 8 | ResNet101 | ImageNet | 63.70 | 88.28 | 8 clips x 3 crop | 62.66G | 42.86M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20230320-efcc0d1b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.log) | 1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). 
The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. diff --git a/configs/recognition/tsm/metafile.yml b/configs/recognition/tsm/metafile.yml index 5adafb069f..64d37461d4 100644 --- a/configs/recognition/tsm/metafile.yml +++ b/configs/recognition/tsm/metafile.yml @@ -178,17 +178,17 @@ Models: Parameters: 23.87M Pretrained: ImageNet Resolution: 224x224 - Training Data: Kinetics-400 + Training Data: SthV2 Training Resources: 8 GPUs Modality: RGB Results: - - Dataset: Kinetics-400 + - Dataset: SthV2 Task: Action Recognition Metrics: - Top 1 Accuracy: 60.20 - Top 5 Accuracy: 86.13 + Top 1 Accuracy: 62.72 + Top 5 Accuracy: 87.70 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20221122-446d261a.pth + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20230317-be0fc26e.pth - Name: tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb Config: configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py @@ -196,22 +196,22 @@ Models: Metadata: Architecture: ResNet50 Batch Size: 16 - Epochs: 100 + Epochs: 50 FLOPs: 65.75G Parameters: 23.87M Pretrained: ImageNet Resolution: 224x224 - Training Data: Kinetics-400 + Training Data: SthV2 Training Resources: 8 GPUs Modality: RGB Results: - - Dataset: Kinetics-400 + - Dataset: SthV2 Task: Action Recognition Metrics: - Top 1 Accuracy: 62.46 - Top 5 Accuracy: 87.75 + Top 1 Accuracy: 64.16 + Top 5 Accuracy: 88.61 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20221122-b1fb8264.pth + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20230317-ec6696ad.pth - Name: tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb Config: configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py @@ -219,19 +219,19 @@ Models: Metadata: Architecture: ResNet101 Batch Size: 16 - Epochs: 100 + Epochs: 50 FLOPs: 62.66G Parameters: 42.86M Pretrained: ImageNet Resolution: 224x224 - Training Data: Kinetics-400 + Training Data: SthV2 Training Resources: 8 GPUs Modality: RGB Results: - - Dataset: Kinetics-400 + - Dataset: SthV2 Task: Action Recognition Metrics: - Top 1 Accuracy: 60.49 - Top 5 Accuracy: 85.99 + Top 1 Accuracy: 63.70 + Top 5 Accuracy: 88.28 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.log - Weights: 
https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20221122-cb2cc64e.pth + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20230320-efcc0d1b.pth diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py index 9429730700..7cb4b48ac7 100644 --- a/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py @@ -1,6 +1,6 @@ _base_ = ['tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py'] # model settings -r101_checkpoint = 'https://download.pytorch.org/models/resnet101-cd907fc2.pth' +r101_checkpoint = 'torchvision://resnet101' model = dict(backbone=dict(pretrained=r101_checkpoint, depth=101)) diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py index 691e39c2b2..36b1eefcf0 100644 --- a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py @@ -4,6 +4,7 @@ file_client_args = dict(io_backend='disk') +sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166} train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16), @@ -17,7 +18,7 @@ max_wh_scale_gap=1, num_fixed_crops=13), dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') ] @@ -46,7 +47,7 @@ test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), - dict(type='TenCrop', crop_size=224), + dict(type='ThreeCrop', crop_size=256), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') ] diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py index ba9c393593..8248bcb02b 100644 --- a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py +++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py @@ -11,6 +11,7 @@ file_client_args = dict(io_backend='disk') +sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166} train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), @@ -24,7 +25,7 @@ max_wh_scale_gap=1, num_fixed_crops=13), dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') ] @@ -53,7 +54,7 @@ twice_sample=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), - dict(type='TenCrop', crop_size=224), + dict(type='ThreeCrop', crop_size=256), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') ] diff --git 
a/configs/recognition/tsn/README.md b/configs/recognition/tsn/README.md index d34d1ab433..1b6e34fdc1 100644 --- a/configs/recognition/tsn/README.md +++ b/configs/recognition/tsn/README.md @@ -32,8 +32,8 @@ Deep convolutional networks have achieved great success for visual recognition i | frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log | | :---------------------: | :-------: | :--------: | :--: | :------: | :------: | :------: | :------: | :----------------: | :----: | :----: | :------------------------------: | -----------------------------: | -----------------------------: | -| 1x1x8 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 34.85 | 66.37 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb_20221122-ad2dbb37.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.log) | -| 1x1x16 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 36.55 | 68.00 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb_20221122-ee13c8e2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.log) | +| 1x1x8 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 35.51 | 67.09 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb_20230313-06ad7d03.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.log) | +| 1x1x16 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 36.91 | 68.77 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb_20230221-85bcc1c3.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.log) | ### Using backbones from 3rd-party in TSN @@ -49,7 +49,7 @@ It's possible and convenient to use a 3rd-party backbone for TSN under the frame | 1x1x3 | MultiStep | 224x224 | 8 | DenseNet161 | ImageNet | 72.07 | 90.15 | 25 clips x 10 crop | 194.6G | 27.36M | [config](/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb_20220906-5f4c0daf.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb.log) | | 1x1x3 | MultiStep | 224x224 | 8 | Swin Transformer | ImageNet | 77.03 | 92.61 | 25 clips x 10 crop | 386.7G | 87.15M | [config](/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb_20220906-65ed814e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.log) | -1. Note that some backbones in TIMM are not supported due to multiple reasons. Please refer to to [PR #880](https://github.com/open-mmlab/mmaction2/pull/880) for details. +1. Note that some backbones in TIMM are not supported due to multiple reasons. Please refer to [PR #880](https://github.com/open-mmlab/mmaction2/pull/880) for details. 2. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size and the original batch size. 3. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available. 
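Editor's note on the flip fix in this patch: for the Something-Something datasets, horizontally flipping a clip turns actions whose labels encode a left/right direction into their mirrored counterparts, so the `Flip` transform in the configs above now receives a `flip_label_map` that swaps those class indices whenever a flip is actually applied. The snippet below is only an illustrative sketch of that behaviour using a hypothetical `flip_clip` helper (it is not the MMAction2 `Flip` transform itself); the mapping dict is the `sthv2_flip_label_map` introduced in the configs of this patch.

```python
# Illustrative sketch (hypothetical helper, not the MMAction2 `Flip` transform):
# shows what `flip_label_map` does during training-time augmentation.
import random

import numpy as np

# Mapping taken from the sthv2 configs added in this patch.
sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166}


def flip_clip(frames, label, flip_ratio=0.5, flip_label_map=None):
    """Horizontally flip a list of H x W x C frames with probability
    `flip_ratio`, remapping the label if the action is direction-sensitive."""
    if random.random() < flip_ratio:
        frames = [np.ascontiguousarray(f[:, ::-1, :]) for f in frames]
        if flip_label_map is not None:
            label = flip_label_map.get(label, label)
    return frames, label


# Example: a clip labelled 86 is relabelled 87 whenever it gets flipped.
frames = [np.zeros((224, 224, 3), dtype=np.uint8) for _ in range(8)]
_, new_label = flip_clip(
    frames, 86, flip_ratio=1.0, flip_label_map=sthv2_flip_label_map)
assert new_label == 87
```

Note also that, for the TSM configs in this patch, the test-time crop changed from `TenCrop(224)` to `ThreeCrop(256)`, which is why the reported testing protocol in the tables above moved from 10-crop to 3-crop alongside the flip fix.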
diff --git a/configs/recognition/tsn/metafile.yml b/configs/recognition/tsn/metafile.yml index e618ed71cc..37943e673b 100644 --- a/configs/recognition/tsn/metafile.yml +++ b/configs/recognition/tsn/metafile.yml @@ -210,10 +210,10 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 34.85 - Top 5 Accuracy: 66.37 + Top 1 Accuracy: 35.51 + Top 5 Accuracy: 67.09 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb_20221122-ad2dbb37.pth + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb_20230313-06ad7d03.pth - Name: tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb Config: configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py @@ -233,7 +233,7 @@ Models: - Dataset: Kinetics-400 Task: Action Recognition Metrics: - Top 1 Accuracy: 36.55 - Top 5 Accuracy: 68.00 + Top 1 Accuracy: 36.91 + Top 5 Accuracy: 68.77 Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.log - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb_20221122-ee13c8e2.pth + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb_20230221-85bcc1c3.pth diff --git a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py index 5797a6f596..15fde3ba79 100644 --- a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py +++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py @@ -2,6 +2,7 @@ file_client_args = dict(io_backend='disk') +sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166} train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16), @@ -15,7 +16,7 @@ max_wh_scale_gap=1, num_fixed_crops=13), dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') ] diff --git a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py index 39113ba5b3..a94f7b3b22 100644 --- a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py +++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py @@ -14,6 +14,7 @@ file_client_args = dict(io_backend='disk') +sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166} train_pipeline = [ dict(type='DecordInit', **file_client_args), dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), @@ -26,7 
+27,7 @@ random_crop=False, max_wh_scale_gap=1), dict(type='Resize', scale=(224, 224), keep_ratio=False), - dict(type='Flip', flip_ratio=0.5), + dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map), dict(type='FormatShape', input_format='NCHW'), dict(type='PackActionInputs') ] From 5f3eb48234faa0d8cc6d51b8bb825beb17da35b0 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 6 Apr 2023 14:47:08 +0800 Subject: [PATCH 30/36] [Feat] support calculate confusion matrix (#2274) --- mmaction/evaluation/metrics/__init__.py | 4 +- mmaction/evaluation/metrics/acc_metric.py | 209 ++++++++++++++++++ mmaction/structures/action_data_sample.py | 102 ++++++++- tests/evaluation/metrics/test_acc_metric.py | 117 +++++++++- tests/models/recognizers/test_recognizer2d.py | 4 + tools/analysis_tools/confusion_matrix.py | 129 +++++++++++ 6 files changed, 551 insertions(+), 14 deletions(-) create mode 100644 tools/analysis_tools/confusion_matrix.py diff --git a/mmaction/evaluation/metrics/__init__.py b/mmaction/evaluation/metrics/__init__.py index 46988d39c1..0493dae036 100644 --- a/mmaction/evaluation/metrics/__init__.py +++ b/mmaction/evaluation/metrics/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .acc_metric import AccMetric +from .acc_metric import AccMetric, ConfusionMatrix from .anet_metric import ANetMetric from .ava_metric import AVAMetric -__all__ = ['AccMetric', 'AVAMetric', 'ANetMetric'] +__all__ = ['AccMetric', 'AVAMetric', 'ANetMetric', 'ConfusionMatrix'] diff --git a/mmaction/evaluation/metrics/acc_metric.py b/mmaction/evaluation/metrics/acc_metric.py index ca6b4623f8..512b089327 100644 --- a/mmaction/evaluation/metrics/acc_metric.py +++ b/mmaction/evaluation/metrics/acc_metric.py @@ -1,9 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy from collections import OrderedDict +from itertools import product from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +import mmengine import numpy as np +import torch from mmengine.evaluator import BaseMetric from mmaction.evaluation import (get_weighted_score, mean_average_precision, @@ -12,6 +15,17 @@ from mmaction.registry import METRICS +def to_tensor(value): + """Convert value to torch.Tensor.""" + if isinstance(value, np.ndarray): + value = torch.from_numpy(value) + elif isinstance(value, Sequence) and not mmengine.is_str(value): + value = torch.tensor(value) + elif not isinstance(value, torch.Tensor): + raise TypeError(f'{type(value)} is not an available argument.') + return value + + @METRICS.register_module() class AccMetric(BaseMetric): """Accuracy evaluation metric.""" @@ -183,3 +197,198 @@ def label2array(num, label): arr = np.zeros(num, dtype=np.float32) arr[label] = 1. return arr + + +@METRICS.register_module() +class ConfusionMatrix(BaseMetric): + r"""A metric to calculate confusion matrix for single-label tasks. + + Args: + num_classes (int, optional): The number of classes. Defaults to None. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. + + Examples: + + 1. The basic usage. 
+ + >>> import torch + >>> from mmaction.evaluation import ConfusionMatrix + >>> y_pred = [0, 1, 1, 3] + >>> y_true = [0, 2, 1, 3] + >>> ConfusionMatrix.calculate(y_pred, y_true, num_classes=4) + tensor([[1, 0, 0, 0], + [0, 1, 0, 0], + [0, 1, 0, 0], + [0, 0, 0, 1]]) + >>> # plot the confusion matrix + >>> import matplotlib.pyplot as plt + >>> y_score = torch.rand((1000, 10)) + >>> y_true = torch.randint(10, (1000, )) + >>> matrix = ConfusionMatrix.calculate(y_score, y_true) + >>> ConfusionMatrix().plot(matrix) + >>> plt.show() + + 2. In the config file + + .. code:: python + + val_evaluator = dict(type='ConfusionMatrix') + test_evaluator = dict(type='ConfusionMatrix') + """ # noqa: E501 + default_prefix = 'confusion_matrix' + + def __init__(self, + num_classes: Optional[int] = None, + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + super().__init__(collect_device, prefix) + + self.num_classes = num_classes + + def process(self, data_batch, data_samples: Sequence[dict]) -> None: + for data_sample in data_samples: + pred_scores = data_sample.get('pred_scores') + gt_label = data_sample['gt_labels']['item'] + if pred_scores is not None: + pred_label = pred_scores['item'].argmax(dim=0, keepdim=True) + self.num_classes = pred_scores['item'].size(0) + else: + pred_label = data_sample['pred_labels']['item'] + + self.results.append({ + 'pred_label': pred_label, + 'gt_label': gt_label + }) + + def compute_metrics(self, results: list) -> dict: + pred_labels = [] + gt_labels = [] + for result in results: + pred_labels.append(result['pred_label']) + gt_labels.append(result['gt_label']) + confusion_matrix = ConfusionMatrix.calculate( + torch.cat(pred_labels), + torch.cat(gt_labels), + num_classes=self.num_classes) + return {'result': confusion_matrix} + + @staticmethod + def calculate(pred, target, num_classes=None) -> dict: + """Calculate the confusion matrix for single-label task. + + Args: + pred (torch.Tensor | np.ndarray | Sequence): The prediction + results. It can be labels (N, ), or scores of every + class (N, C). + target (torch.Tensor | np.ndarray | Sequence): The target of + each prediction with shape (N, ). + num_classes (Optional, int): The number of classes. If the ``pred`` + is label instead of scores, this argument is required. + Defaults to None. + + Returns: + torch.Tensor: The confusion matrix. + """ + pred = to_tensor(pred) + target_label = to_tensor(target).int() + + assert pred.size(0) == target_label.size(0), \ + f"The size of pred ({pred.size(0)}) doesn't match "\ + f'the target ({target_label.size(0)}).' + assert target_label.ndim == 1 + + if pred.ndim == 1: + assert num_classes is not None, \ + 'Please specify the `num_classes` if the `pred` is labels ' \ + 'intead of scores.' + pred_label = pred + else: + num_classes = num_classes or pred.size(1) + pred_label = torch.argmax(pred, dim=1).flatten() + + with torch.no_grad(): + indices = num_classes * target_label + pred_label + matrix = torch.bincount(indices, minlength=num_classes**2) + matrix = matrix.reshape(num_classes, num_classes) + + return matrix + + @staticmethod + def plot(confusion_matrix: torch.Tensor, + include_values: bool = False, + cmap: str = 'viridis', + classes: Optional[List[str]] = None, + colorbar: bool = True, + show: bool = True): + """Draw a confusion matrix by matplotlib. + + Modified from `Scikit-Learn + `_ + + Args: + confusion_matrix (torch.Tensor): The confusion matrix to draw. + include_values (bool): Whether to draw the values in the figure. + Defaults to False. 
+ cmap (str): The color map to use. Defaults to use "viridis". + classes (list[str], optional): The names of categories. + Defaults to None, which means to use index number. + colorbar (bool): Whether to show the colorbar. Defaults to True. + show (bool): Whether to show the figure immediately. + Defaults to True. + """ # noqa: E501 + import matplotlib.pyplot as plt + + fig, ax = plt.subplots(figsize=(10, 10)) + + num_classes = confusion_matrix.size(0) + + im_ = ax.imshow(confusion_matrix, interpolation='nearest', cmap=cmap) + text_ = None + cmap_min, cmap_max = im_.cmap(0), im_.cmap(1.0) + + if include_values: + text_ = np.empty_like(confusion_matrix, dtype=object) + + # print text with appropriate color depending on background + thresh = (confusion_matrix.max() + confusion_matrix.min()) / 2.0 + + for i, j in product(range(num_classes), range(num_classes)): + color = cmap_max if confusion_matrix[i, + j] < thresh else cmap_min + + text_cm = format(confusion_matrix[i, j], '.2g') + text_d = format(confusion_matrix[i, j], 'd') + if len(text_d) < len(text_cm): + text_cm = text_d + + text_[i, j] = ax.text( + j, i, text_cm, ha='center', va='center', color=color) + + display_labels = classes or np.arange(num_classes) + + if colorbar: + fig.colorbar(im_, ax=ax) + ax.set( + xticks=np.arange(num_classes), + yticks=np.arange(num_classes), + xticklabels=display_labels, + yticklabels=display_labels, + ylabel='True label', + xlabel='Predicted label', + ) + ax.invert_yaxis() + ax.xaxis.tick_top() + + ax.set_ylim((num_classes - 0.5, -0.5)) + # Automatically rotate the x labels. + fig.autofmt_xdate(ha='center') + + if show: + plt.show() + return fig diff --git a/mmaction/structures/action_data_sample.py b/mmaction/structures/action_data_sample.py index c75f6654a1..196b080136 100644 --- a/mmaction/structures/action_data_sample.py +++ b/mmaction/structures/action_data_sample.py @@ -1,25 +1,105 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Union +from numbers import Number +from typing import Sequence, Union import numpy as np import torch from mmengine.structures import BaseDataElement, InstanceData, LabelData +from mmengine.utils import is_str + + +def format_label(value: Union[torch.Tensor, np.ndarray, Sequence, + int]) -> torch.Tensor: + """Convert various python types to label-format tensor. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int`. + + Args: + value (torch.Tensor | numpy.ndarray | Sequence | int): Label value. + + Returns: + :obj:`torch.Tensor`: The foramtted label tensor. + """ + + # Handle single number + if isinstance(value, (torch.Tensor, np.ndarray)) and value.ndim == 0: + value = int(value.item()) + + if isinstance(value, np.ndarray): + value = torch.from_numpy(value).to(torch.long) + elif isinstance(value, Sequence) and not is_str(value): + value = torch.tensor(value).to(torch.long) + elif isinstance(value, int): + value = torch.LongTensor([value]) + elif not isinstance(value, torch.Tensor): + raise TypeError(f'Type {type(value)} is not an available label type.') + assert value.ndim == 1, \ + f'The dims of value should be 1, but got {value.ndim}.' + + return value + + +def format_score(value: Union[torch.Tensor, np.ndarray, + Sequence]) -> torch.Tensor: + """Convert various python types to score-format tensor. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`. + + Args: + value (torch.Tensor | numpy.ndarray | Sequence): Score values. 
+ + Returns: + :obj:`torch.Tensor`: The foramtted score tensor. + """ + + if isinstance(value, np.ndarray): + value = torch.from_numpy(value).float() + elif isinstance(value, Sequence) and not is_str(value): + value = torch.tensor(value).float() + elif not isinstance(value, torch.Tensor): + raise TypeError(f'Type {type(value)} is not an available label type.') + assert value.ndim == 1, \ + f'The dims of value should be 1, but got {value.ndim}.' + + return value class ActionDataSample(BaseDataElement): - def set_gt_labels(self, value: Union[int, - np.ndarray]) -> 'ActionDataSample': + def set_gt_labels( + self, value: Union[np.ndarray, torch.Tensor, Sequence[Number], Number] + ) -> 'ActionDataSample': """Set label of ``gt_labels``.""" - if isinstance(value, int): - value = torch.LongTensor([value]) - elif isinstance(value, np.ndarray): - value = torch.from_numpy(value) - else: - raise TypeError(f'Type {type(value)} is not an ' - f'available label type.') + label_data = getattr(self, '_gt_label', LabelData()) + label_data.item = format_label(value) + self.gt_labels = label_data + return self - self.gt_labels = LabelData(item=value) + def set_pred_label( + self, value: Union[np.ndarray, torch.Tensor, Sequence[Number], Number] + ) -> 'ActionDataSample': + """Set label of ``pred_label``.""" + label_data = getattr(self, '_pred_label', LabelData()) + label_data.item = format_label(value) + self.pred_labels = label_data + return self + + def set_pred_score(self, value: torch.Tensor) -> 'ActionDataSample': + """Set score of ``pred_label``.""" + label_data = getattr(self, '_pred_label', LabelData()) + label_data.item = format_score(value) + if hasattr(self, 'num_classes'): + assert len(label_data.item) == self.num_classes, \ + f'The length of score {len(label_data.item)} should be '\ + f'equal to the num_classes {self.num_classes}.' + else: + self.set_field( + name='num_classes', + value=len(label_data.item), + field_type='metainfo') + self.pred_scores = label_data return self @property diff --git a/tests/evaluation/metrics/test_acc_metric.py b/tests/evaluation/metrics/test_acc_metric.py index 273155858c..7c70adb7d6 100644 --- a/tests/evaluation/metrics/test_acc_metric.py +++ b/tests/evaluation/metrics/test_acc_metric.py @@ -1,7 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. 
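+# Note on ConfusionMatrix.calculate(), which the tests below exercise: it
+# flattens each (target, pred) pair into num_classes * target + pred, counts
+# the flat indices with torch.bincount(minlength=num_classes ** 2) and
+# reshapes the result to (num_classes, num_classes), so cell [i, j] counts
+# samples whose ground truth is i and whose prediction is j.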
+from unittest import TestCase + +import numpy as np import torch -from mmaction.evaluation import AccMetric +from mmaction.evaluation import AccMetric, ConfusionMatrix +from mmaction.registry import METRICS +from mmaction.structures import ActionDataSample def generate_data(num_classes=5, random_label=False): @@ -41,3 +46,113 @@ def test_accmetric(): assert eval_results['mean1'] == 1.0 assert eval_results['mmit_mean_average_precision'] == 1.0 return + + +class TestConfusionMatrix(TestCase): + + def test_evaluate(self): + """Test using the metric in the same way as Evalutor.""" + pred = [ + ActionDataSample().set_pred_score(i).set_pred_label( + j).set_gt_labels(k).to_dict() for i, j, k in zip([ + torch.tensor([0.7, 0.0, 0.3]), + torch.tensor([0.5, 0.2, 0.3]), + torch.tensor([0.4, 0.5, 0.1]), + torch.tensor([0.0, 0.0, 1.0]), + torch.tensor([0.0, 0.0, 1.0]), + torch.tensor([0.0, 0.0, 1.0]), + ], [0, 0, 1, 2, 2, 2], [0, 0, 1, 2, 1, 0]) + ] + + # Test with score (use score instead of label if score exists) + metric = METRICS.build(dict(type='ConfusionMatrix')) + metric.process(None, pred) + res = metric.evaluate(6) + self.assertIsInstance(res, dict) + self.assertTensorEqual( + res['confusion_matrix/result'], + torch.tensor([ + [2, 0, 1], + [0, 1, 1], + [0, 0, 1], + ])) + + # Test with label + for sample in pred: + del sample['pred_scores'] + metric = METRICS.build(dict(type='ConfusionMatrix')) + metric.process(None, pred) + with self.assertRaisesRegex(AssertionError, + 'Please specify the `num_classes`'): + metric.evaluate(6) + + metric = METRICS.build(dict(type='ConfusionMatrix', num_classes=3)) + metric.process(None, pred) + self.assertIsInstance(res, dict) + self.assertTensorEqual( + res['confusion_matrix/result'], + torch.tensor([ + [2, 0, 1], + [0, 1, 1], + [0, 0, 1], + ])) + + def test_calculate(self): + y_true = np.array([0, 0, 1, 2, 1, 0]) + y_label = torch.tensor([0, 0, 1, 2, 2, 2]) + y_score = [ + [0.7, 0.0, 0.3], + [0.5, 0.2, 0.3], + [0.4, 0.5, 0.1], + [0.0, 0.0, 1.0], + [0.0, 0.0, 1.0], + [0.0, 0.0, 1.0], + ] + + # Test with score + cm = ConfusionMatrix.calculate(y_score, y_true) + self.assertIsInstance(cm, torch.Tensor) + self.assertTensorEqual( + cm, torch.tensor([ + [2, 0, 1], + [0, 1, 1], + [0, 0, 1], + ])) + + # Test with label + with self.assertRaisesRegex(AssertionError, + 'Please specify the `num_classes`'): + ConfusionMatrix.calculate(y_label, y_true) + + cm = ConfusionMatrix.calculate(y_label, y_true, num_classes=3) + self.assertIsInstance(cm, torch.Tensor) + self.assertTensorEqual( + cm, torch.tensor([ + [2, 0, 1], + [0, 1, 1], + [0, 0, 1], + ])) + + # Test with invalid inputs + with self.assertRaisesRegex(TypeError, " is not"): + ConfusionMatrix.calculate(y_label, 'hi') + + def test_plot(self): + import matplotlib.pyplot as plt + + cm = torch.tensor([[2, 0, 1], [0, 1, 1], [0, 0, 1]]) + fig = ConfusionMatrix.plot(cm, include_values=True, show=False) + + self.assertIsInstance(fig, plt.Figure) + + def assertTensorEqual(self, + tensor: torch.Tensor, + value: float, + msg=None, + **kwarg): + tensor = tensor.to(torch.float32) + value = torch.tensor(value).float() + try: + torch.testing.assert_allclose(tensor, value, **kwarg) + except AssertionError as e: + self.fail(self._formatMessage(msg, str(e))) diff --git a/tests/models/recognizers/test_recognizer2d.py b/tests/models/recognizers/test_recognizer2d.py index 1acde7fc9c..773bc0806f 100644 --- a/tests/models/recognizers/test_recognizer2d.py +++ b/tests/models/recognizers/test_recognizer2d.py @@ -1,4 +1,7 @@ # Copyright (c) 
OpenMMLab. All rights reserved. +import platform + +import pytest import torch from mmaction.registry import MODELS @@ -191,6 +194,7 @@ def test_trn(): recognizer(one_img, gradcam=True) +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') def test_tpn(): register_all_modules() config = get_recognizer_cfg( diff --git a/tools/analysis_tools/confusion_matrix.py b/tools/analysis_tools/confusion_matrix.py new file mode 100644 index 0000000000..224b8364bc --- /dev/null +++ b/tools/analysis_tools/confusion_matrix.py @@ -0,0 +1,129 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import tempfile + +import torch +from mmengine import dump, list_from_file, load +from mmengine.config import Config, DictAction +from mmengine.evaluator import Evaluator +from mmengine.runner import Runner + +from mmaction.evaluation import ConfusionMatrix +from mmaction.registry import DATASETS +from mmaction.utils import register_all_modules + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Eval a checkpoint and draw the confusion matrix.') + parser.add_argument('config', help='test config file path') + parser.add_argument( + 'ckpt_or_result', + type=str, + help='The checkpoint file (.pth) or ' + 'dumpped predictions pickle file (.pkl).') + parser.add_argument('--out', help='the file to save the confusion matrix.') + parser.add_argument( + '--show', + action='store_true', + help='whether to display the metric result by matplotlib if supports.') + parser.add_argument( + '--show-path', type=str, help='Path to save the visualization image.') + parser.add_argument( + '--include-values', + action='store_true', + help='To draw the values in the figure.') + parser.add_argument('--label-file', default=None, help='Labelmap file') + parser.add_argument( + '--target-classes', + type=int, + nargs='+', + default=[], + help='Selected classes to evaluate, and remains will be neglected') + parser.add_argument( + '--cmap', + type=str, + default='viridis', + help='The color map to use. Defaults to "viridis".') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + # register all modules in mmaction into the registries + # do not init the default scope here because it will be init in the runner + register_all_modules(init_default_scope=False) + + # load config + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + if args.ckpt_or_result.endswith('.pth'): + # Set confusion matrix as the metric. 
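+        # A .pth checkpoint: override the configured test evaluator with
+        # ConfusionMatrix and run the full test loop; the other branch below
+        # instead evaluates a dumped prediction pickle offline via Evaluator.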
+ cfg.test_evaluator = dict(type='ConfusionMatrix') + + cfg.load_from = str(args.ckpt_or_result) + + with tempfile.TemporaryDirectory() as tmpdir: + cfg.work_dir = tmpdir + runner = Runner.from_cfg(cfg) + classes = runner.test_loop.dataloader.dataset.metainfo.get( + 'classes') + cm = runner.test()['confusion_matrix/result'] + else: + predictions = load(args.ckpt_or_result) + evaluator = Evaluator(ConfusionMatrix()) + metrics = evaluator.offline_evaluate(predictions, None) + cm = metrics['confusion_matrix/result'] + try: + # Try to build the dataset. + dataset = DATASETS.build({ + **cfg.test_dataloader.dataset, 'pipeline': [] + }) + classes = dataset.metainfo.get('classes') + except Exception: + classes = None + + if args.label_file is not None: + classes = list_from_file(args.label_file) + if classes is None: + num_classes = cm.shape[0] + classes = list(range(num_classes)) + + if args.target_classes: + assert len(args.target_classes) > 1, \ + 'please ensure select more than one class' + target_idx = torch.tensor(args.target_classes) + cm = cm[target_idx][:, target_idx] + classes = [classes[idx] for idx in target_idx] + + if args.out is not None: + dump(cm, args.out) + + if args.show or args.show_path is not None: + fig = ConfusionMatrix.plot( + cm, + show=args.show, + classes=classes, + include_values=args.include_values, + cmap=args.cmap) + if args.show_path is not None: + fig.savefig(args.show_path) + print(f'The confusion matrix is saved at {args.show_path}.') + + +if __name__ == '__main__': + main() From db11ac2c372f92887b42b63af73da195fb01618b Mon Sep 17 00:00:00 2001 From: Kai Hu Date: Thu, 6 Apr 2023 03:39:57 -0400 Subject: [PATCH 31/36] [Doc] for README, optimizers and data pipeline (#2341) --- README.md | 111 +++--- .../en/advanced_guides/customize_optimizer.md | 329 ++++++++++++++++++ docs/en/advanced_guides/customize_pipeline.md | 152 ++++++++ 3 files changed, 549 insertions(+), 43 deletions(-) create mode 100644 docs/en/advanced_guides/customize_optimizer.md create mode 100644 docs/en/advanced_guides/customize_pipeline.md diff --git a/README.md b/README.md index d08d49d2c3..25b703306c 100644 --- a/README.md +++ b/README.md @@ -56,33 +56,51 @@ English | [简体中文](/README_zh-CN.md) -## Introduction +## 📄 Table of Contents + +- [🥳 🚀 What's New](#--whats-new-) +- [📖 Introduction](#-introduction-) +- [🎁 Major Features](#-major-features-) +- [🛠️ Installation](#-installation-) +- [👀 Model Zoo](#-model-zoo-) +- [👨‍🏫 Get Started](#-get-started-) +- [🎫 License](#-license-) +- [🖊️ Citation](#️-citation-) +- [🙌 Contributing](#-contributing-) +- [🤝 Acknowledgement](#-acknowledgement-) +- [🏗️ Projects in OpenMMLab](#-projects-in-openmmlab-) + +## 🥳 🚀 What's New [🔝](#-table-of-contents) + +**The default branch has been switched to `1.x` from `master`, and we encourage users to migrate to the latest version with more supported models, stronger pre-training checkpoints and simpler coding. Please refer to [Migration Guide](https://mmaction2.readthedocs.io/en/1.x/migration.html) for more details.** + +**Release (2023.02.10)**: v1.0.0rc3 with the following new features: + +- Support Action Recognition model UniFormer V1(ICLR'2022), UniFormer V2(Arxiv'2022). +- Support training MViT V2(CVPR'2022), and MaskFeat(CVPR'2022) fine-tuning. +- Add a new handy interface for inference MMAction2 models ([demo](https://github.com/open-mmlab/mmaction2/blob/dev-1.x/demo/README.md#inferencer)) + +## 📖 Introduction [🔝](#-table-of-contents) MMAction2 is an open-source toolbox for video understanding based on PyTorch. 
It is a part of the [OpenMMLab](http://openmmlab.com/) project. -The 1.x branch works with **PyTorch 1.6+**. -
  [figure captions] Action Recognition Results on Kinetics-400; Skeleton-based Action Recognition Results on NTU-RGB+D-120; Action Recognition on Kinetics-400 (left) and Skeleton-based Action Recognition on NTU-RGB+D-120 (right); Skeleton-based Spatio-Temporal Action Detection and Action Recognition Results on Kinetics-400; Spatio-Temporal Action Detection Results on AVA-2.1

-## Major Features +## 🎁 Major Features [🔝](#-table-of-contents) - **Modular design**: We decompose a video understanding framework into different components. One can easily construct a customized video understanding framework by combining different modules. @@ -90,17 +108,14 @@ The 1.x branch works with **PyTorch 1.6+**. - **Well tested and documented**: We provide detailed documentation and API reference, as well as unit tests. -## What's New +## 🛠️ Installation [🔝](#-table-of-contents) -**Release (2023.02.10)**: v1.0.0rc3 with the following new features: +MMAction2 depends on [PyTorch](https://pytorch.org/), [MMCV](https://github.com/open-mmlab/mmcv), [MMEngine](https://github.com/open-mmlab/mmengine), [MMDetection](https://github.com/open-mmlab/mmdetection) (optional, for spatial-temporal detection tasks) and [MMPose](https://github.com/open-mmlab/mmpose) (optional, for skeleton based tasks). -- Support Action Recognition model UniFormer V1(ICLR'2022), UniFormer V2(Arxiv'2022). -- Support training MViT V2(CVPR'2022), and MaskFeat(CVPR'2022) fine-tuning. -- Add a new handy interface for inference MMAction2 models ([demo](https://github.com/open-mmlab/mmaction2/blob/dev-1.x/demo/README.md#inferencer)) +Please refer to [install.md](https://mmaction2.readthedocs.io/en/1.x/get_started.html) for detailed instructions. -## Installation - -Please refer to [install.md](https://mmaction2.readthedocs.io/en/1.x/get_started.html) for more detailed instructions. +
+Quick instructions ```shell conda create --name openmmlab python=3.8 -y @@ -116,7 +131,15 @@ git checkout 1.x pip3 install -e . ``` -## Supported Methods +
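Not part of the patch: a minimal sanity check you can run after the editable install above, assuming the environment created by those commands (printed versions are only illustrative).

```python
# Quick import check for the freshly installed environment.
import mmaction
import mmcv
import mmengine

print('mmaction2:', mmaction.__version__)  # e.g. a 1.0.0rcX release on the 1.x branch
print('mmcv:', mmcv.__version__)
print('mmengine:', mmengine.__version__)
```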
+ +## 👀 Model Zoo [🔝](#-table-of-contents) + +Results and models are available in the [model zoo](https://mmaction2.readthedocs.io/en/1.x/modelzoo.html). + +
+ +Supported model @@ -161,7 +184,6 @@ pip3 install -e . - @@ -185,17 +207,19 @@ pip3 install -e . + + + +
  [table cells] Action Localization: SSN (ICCV'2017), BSN (ECCV'2018), BMN (ICCV'2019); 2s-AGCN (CVPR'2019), PoseC3D (CVPR'2022), STGCN++ (ArXiv'2022), CTRGCN (CVPR'2021), MSG3D (CVPR'2020)
-Results and models are available in the *README.md* of each method's config directory. -A summary can be found on the [**model zoo**](https://mmaction2.readthedocs.io/en/1.x/modelzoo.html) page. +
-We will keep up with the latest progress of the community and support more popular algorithms and frameworks. -If you have any feature requests, please feel free to leave a comment in [Issues](https://github.com/open-mmlab/mmaction2/issues/19). +
-## Supported Datasets +Supported dataset @@ -254,31 +278,32 @@ If you have any feature requests, please feel free to leave a comment in [Issues
-Datasets marked with * are not fully supported yet, but related dataset preparation steps are provided. A summary can be found on the [**Supported Datasets**](https://mmaction2.readthedocs.io/en/latest/supported_datasets.html) page. - -## Data Preparation - -Please refer to [data_preparation.md](docs/en/user_guides/2_data_prepare.md) for a general knowledge of data preparation. +
-## FAQ +## 👨‍🏫 Get Started [🔝](#-table-of-contents) -Please refer to [FAQ](docs/en/notes/faq.md) for frequently asked questions. +For tutorials, we provide the following user guides for basic usage: -## Projects built on MMAction2 +- [Migration from MMAction2 0.X](https://mmaction2.readthedocs.io/en/1.x/migration.html) +- [Learn about Configs](https://mmaction2.readthedocs.io/en/1.x/user_guides/1_config.html#) +- [Prepare Datasets](https://mmaction2.readthedocs.io/en/1.x/user_guides/2_data_prepare.html) +- [Inference with Existing Models](https://mmaction2.readthedocs.io/en/1.x/user_guides/3_inference.html) +- [Training and Testing](https://mmaction2.readthedocs.io/en/1.x/user_guides/4_train_test.html) -Currently, there are many research works and projects built on MMAction2 by users from community, such as: +
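Not part of the patch: a minimal sketch of the high-level inference API covered by the user guides listed above; the config, checkpoint and video paths are placeholders.

```python
# Sketch only: CONFIG_FILE, CHECKPOINT_FILE and VIDEO_FILE are placeholders.
from mmaction.apis import inference_recognizer, init_recognizer

model = init_recognizer('CONFIG_FILE.py', 'CHECKPOINT_FILE.pth', device='cpu')
result = inference_recognizer(model, 'VIDEO_FILE.mp4')
print(result)  # an ActionDataSample holding the predicted scores
```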
+Research works built on MMAction2 by users from community - Video Swin Transformer. [\[paper\]](https://arxiv.org/abs/2106.13230)[\[github\]](https://github.com/SwinTransformer/Video-Swin-Transformer) - Evidential Deep Learning for Open Set Action Recognition, ICCV 2021 **Oral**. [\[paper\]](https://arxiv.org/abs/2107.10161)[\[github\]](https://github.com/Cogito2012/DEAR) - Rethinking Self-supervised Correspondence Learning: A Video Frame-level Similarity Perspective, ICCV 2021 **Oral**. [\[paper\]](https://arxiv.org/abs/2103.17263)[\[github\]](https://github.com/xvjiarui/VFS) -etc., check [projects.md](docs/en/notes/projects.md) to see all related projects. +
-## License +## 🎫 License [🔝](#-table-of-contents) This project is released under the [Apache 2.0 license](LICENSE). -## Citation +## 🖊️ Citation [🔝](#-table-of-contents) If you find this project useful in your research, please consider cite: @@ -291,17 +316,17 @@ If you find this project useful in your research, please consider cite: } ``` -## Contributing +## 🙌 Contributing [🔝](#-table-of-contents) We appreciate all contributions to improve MMAction2. Please refer to [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/2.x/CONTRIBUTING.md) in MMCV for more details about the contributing guideline. -## Acknowledgement +## 🤝 Acknowledgement [🔝](#-table-of-contents) MMAction2 is an open-source project that is contributed by researchers and engineers from various colleges and companies. We appreciate all the contributors who implement their methods or add new features and users who give valuable feedback. We wish that the toolbox and benchmark could serve the growing research community by providing a flexible toolkit to reimplement existing methods and develop their new models. -## Projects in OpenMMLab +## 🏗️ Projects in OpenMMLab [🔝](#-table-of-contents) - [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models. - [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision. diff --git a/docs/en/advanced_guides/customize_optimizer.md b/docs/en/advanced_guides/customize_optimizer.md new file mode 100644 index 0000000000..d69aa0ff90 --- /dev/null +++ b/docs/en/advanced_guides/customize_optimizer.md @@ -0,0 +1,329 @@ +# Customize Optimizer + +In this tutorial, we will introduce some methods about how to build the optimizer and learning rate scheduler for your tasks. + +- [Customize Optimizer](#customize-optimizer) + - [Build optimizers using optim_wrapper](#build-optimizers-using-optim_wrapper) + - [Customize parameter schedules](#customize-parameter-schedules) + - [Add new optimizers or constructors](#add-new-optimizers-or-constructors) + +## Build optimizers using optim_wrapper + +We use the `optim_wrapper` field to configure the strategies of optimization, which includes choices of the optimizer, parameter-wise configurations, gradient clipping and accumulation. A simple example can be: + +```python +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.0003, weight_decay=0.0001) +) +``` + +In the above example, a SGD optimizer with learning rate 0.0003 and weight decay 0.0001 is built. + +### Use optimizers supported by PyTorch + +We support all the optimizers implemented by PyTorch. To use a different optimizer, just need to change the `optimizer` field of config files. For example, if you want to use `torch.optim.Adam`, the modification in the config file could be as the following. + +```python +optim_wrapper = dict( + type='OptimWrapper', + optimizer = dict( + type='Adam', + lr=0.001, + betas=(0.9, 0.999), + eps=1e-08, + weight_decay=0, + amsgrad=False), +) +``` + +First we need to change the value of `type` to the desired optimizer name supported in `torch.optim`. Next we add necessary arguments of this optimizer to the `optimizer` field. 
The above config will build the following optimizer: + +```python +torch.optim.Adam(lr=0.001, + betas=(0.9, 0.999), + eps=1e-08, + weight_decay=0, + amsgrad=False) +``` + +### Parameter-wise finely configuration + +Some models may have parameter-specific settings for optimization, for example, no weight decay to the BatchNorm layers or using different learning rates for different network layers. +To finely configure them, we can use the `paramwise_cfg` argument in `optim_wrapper`. + +- **Set different hyper-parameter multipliers for different types of parameters.** + + For instance, we can set `norm_decay_mult=0.` in `paramwise_cfg` to change the weight decay of weight and bias of normalization layers to zero. + + ```python + optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.8, weight_decay=1e-4), + paramwise_cfg=dict(norm_decay_mult=0.)) + ``` + + More types of parameters are supported to configured, list as follow: + + - `lr_mult`: Multiplier for learning rate of all parameters. + - `decay_mult`: Multiplier for weight decay of all parameters. + - `bias_lr_mult`: Multiplier for learning rate of bias (Not include normalization layers' biases and deformable convolution layers' offsets). Defaults to 1. + - `bias_decay_mult`: Multiplier for weight decay of bias (Not include normalization layers' biases and deformable convolution layers' offsets). Defaults to 1. + - `norm_decay_mult`: Multiplier for weight decay of weigh and bias of normalization layers. Defaults to 1. + - `dwconv_decay_mult`: Multiplier for weight decay of depth-wise convolution layers. Defaults to 1. + - `bypass_duplicate`: Whether to bypass duplicated parameters. Defaults to `False`. + - `dcn_offset_lr_mult`: Multiplier for learning rate of deformable convolution layers. Defaults to 1. + +- **Set different hyper-parameter multipliers for specific parameters.** + + MMAction2 can use `custom_keys` in `paramwise_cfg` to specify different parameters to use different learning rates or weight decay. + + For example, to set all learning rates and weight decays of `backbone.layer0` to 0, the rest of `backbone` remains the same as the optimizer and the learning rate of `head` to 0.001, use the configs below. + + ```python + optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001), + paramwise_cfg=dict( + custom_keys={ + 'backbone.layer0': dict(lr_mult=0, decay_mult=0), + 'backbone': dict(lr_mult=1), + 'head': dict(lr_mult=0.1) + })) + ``` + +### Gradient clipping + +During the training process, the loss function may get close to a cliffy region and cause gradient explosion. And gradient clipping is helpful to stabilize the training process. More introduction can be found in [this page](https://paperswithcode.com/method/gradient-clipping). + +Currently we support `clip_grad` option in `optim_wrapper` for gradient clipping, refers to [PyTorch Documentation](torch.nn.utils.clip_grad_norm_). + +Here is an example: + +```python +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001), + # norm_type: type of the used p-norm, here norm_type is 2. + clip_grad=dict(max_norm=35, norm_type=2)) +``` + +### Gradient accumulation + +When computing resources are lacking, the batch size can only be set to a small value, which may affect the performance of models. Gradient accumulation can be used to solve this problem. We support `accumulative_counts` option in `optim_wrapper` for gradient accumulation. 
+ +Here is an example: + +```python +train_dataloader = dict(batch_size=64) +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001), + accumulative_counts=4) +``` + +Indicates that during training, back-propagation is performed every 4 iters. And the above is equivalent to: + +```python +train_dataloader = dict(batch_size=256) +optim_wrapper = dict( + optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001)) +``` + +## Customize parameter schedules + +In training, the optimzation parameters such as learing rate, momentum, are usually not fixed but changing through iterations or epochs. PyTorch supports several learning rate schedulers, which are not sufficient for complex strategies. In MMAction2, we provide `param_scheduler` for better controls of different parameter schedules. + +### Customize learning rate schedules + +Learning rate schedulers are widely used to improve performance. We support most of the PyTorch schedulers, including `ExponentialLR`, `LinearLR`, `StepLR`, `MultiStepLR`, etc. + +All available learning rate scheduler can be found {external+mmengine:ref}`here `, and the +names of learning rate schedulers end with `LR`. + +- **Single learning rate schedule** + + In most cases, we use only one learning rate schedule for simplicity. For instance, [`MultiStepLR`](mmengine.optim.MultiStepLR) is used as the default learning rate schedule for ResNet. Here, `param_scheduler` is a dictionary. + + ```python + param_scheduler = dict( + type='MultiStepLR', + by_epoch=True, + milestones=[100, 150], + gamma=0.1) + ``` + + Or, we want to use the [`CosineAnnealingLR`](mmengine.optim.CosineAnnealingLR) scheduler to decay the learning rate: + + ```python + param_scheduler = dict( + type='CosineAnnealingLR', + by_epoch=True, + T_max=num_epochs) + ``` + +- **Multiple learning rate schedules** + + In some of the training cases, multiple learning rate schedules are applied for higher accuracy. For example ,in the early stage, training is easy to be volatile, and warmup is a technique to reduce volatility. + The learning rate will increase gradually from a minor value to the expected value by warmup and decay afterwards by other schedules. + + In MMAction2, simply combines desired schedules in `param_scheduler` as a list can achieve the warmup strategy. + + Here are some examples: + + 1. linear warmup during the first 50 iters. + + ```python + param_scheduler = [ + # linear warm-up by iters + dict(type='LinearLR', + start_factor=0.001, + by_epoch=False, # by iters + end=50), # only warm up for first 50 iters + # main learing rate schedule + dict(type='MultiStepLR', + by_epoch=True, + milestones=[8, 11], + gamma=0.1) + ] + ``` + + 2. linear warmup and update lr by iter during the first 10 epochs. + + ```python + param_scheduler = [ + # linear warm-up by epochs in [0, 10) epochs + dict(type='LinearLR', + start_factor=0.001, + by_epoch=True, + end=10, + convert_to_iter_based=True, # Update learning rate by iter. + ), + # use CosineAnnealing schedule after 10 epochs + dict(type='CosineAnnealingLR', by_epoch=True, begin=10) + ] + ``` + + Notice that, we use `begin` and `end` arguments here to assign the valid range, which is \[`begin`, `end`) for this schedule. And the range unit is defined by `by_epoch` argument. If not specified, the `begin` is 0 and the `end` is the max epochs or iterations. 
+ + If the ranges for all schedules are not continuous, the learning rate will stay constant in ignored range, otherwise all valid schedulers will be executed in order in a specific stage, which behaves the same as PyTorch [`ChainedScheduler`](torch.optim.lr_scheduler.ChainedScheduler). + +### Customize momentum schedules + +We support using momentum schedulers to modify the optimizer's momentum according to learning rate, which could make the loss converge in a faster way. The usage is the same as learning rate schedulers. + +All available learning rate scheduler can be found {external+mmengine:ref}`here `, and the +names of momentum rate schedulers end with `Momentum`. + +Here is an example: + +```python +param_scheduler = [ + # the lr scheduler + dict(type='LinearLR', ...), + # the momentum scheduler + dict(type='LinearMomentum', + start_factor=0.001, + by_epoch=False, + begin=0, + end=1000) +] +``` + +## Add new optimizers or constructors + +This part will modify the MMAction2 source code or add code to the MMAction2 framework, beginners can skip it. + +### Add new optimizers + +In academic research and industrial practice, it may be necessary to use optimization methods not implemented by MMAction2, and you can add them through the following methods. + +#### 1. Implement a new optimizer + +Assume you want to add an optimizer named `MyOptimizer`, which has arguments `a`, `b`, and `c`. +You need to create a new file under `mmaction/engine/optimizers`, and implement the new optimizer in the file, for example, in `mmaction/engine/optimizers/my_optimizer.py`: + +```python +from torch.optim import Optimizer +from mmaction.registry import OPTIMIZERS + + +@OPTIMIZERS.register_module() +class MyOptimizer(Optimizer): + + def __init__(self, a, b, c): + ... + + def step(self, closure=None): + ... +``` + +#### 2. Import the optimizer + +To find the above module defined above, this module should be imported during the running. First import it in the `mmaction/engine/optimizers/__init__.py` to add it into the `mmaction.engine` package. + +```python +# In mmaction/engine/optimizers/__init__.py +... +from .my_optimizer import MyOptimizer # MyOptimizer maybe other class name + +__all__ = [..., 'MyOptimizer'] +``` + +During running, we will automatically import the `mmaction.engine` package and register the `MyOptimizer` at the same time. + +#### 3. Specify the optimizer in the config file + +Then you can use `MyOptimizer` in the `optim_wrapper.optimizer` field of config files. + +```python +optim_wrapper = dict( + optimizer=dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value)) +``` + +### Add new optimizer constructors + +Some models may have some parameter-specific settings for optimization, like different weight decay rate for all `BatchNorm` layers. + +Although we already can use [the `optim_wrapper.paramwise_cfg` field](#parameter-wise-finely-configuration) to +configure various parameter-specific optimizer settings. It may still not cover your need. + +Of course, you can modify it. By default, we use the [`DefaultOptimWrapperConstructor`](mmengine.optim.DefaultOptimWrapperConstructor) +class to deal with the construction of optimizer. And during the construction, it fine-grainedly configures the optimizer settings of +different parameters according to the `paramwise_cfg`,which could also serve as a template for new optimizer constructor. + +You can overwrite these behaviors by add new optimizer constructors. 
+ +```python +# In mmaction/engine/optimizers/my_optim_constructor.py +from mmengine.optim import DefaultOptimWrapperConstructor +from mmaction.registry import OPTIM_WRAPPER_CONSTRUCTORS + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class MyOptimWrapperConstructor: + + def __init__(self, optim_wrapper_cfg, paramwise_cfg=None): + ... + + def __call__(self, model): + ... +``` + +And then, import it and use it almost like [the optimizer tutorial](#add-new-optimizers). + +1. Import it in the `mmaction/engine/optimizers/__init__.py` to add it into the `mmaction.engine` package. + + ```python + # In mmaction/engine/optimizers/__init__.py + ... + from .my_optim_constructor import MyOptimWrapperConstructor + + __all__ = [..., 'MyOptimWrapperConstructor'] + ``` + +2. Use `MyOptimWrapperConstructor` in the `optim_wrapper.constructor` field of config files. + + ```python + optim_wrapper = dict( + constructor=dict(type='MyOptimWrapperConstructor'), + optimizer=..., + paramwise_cfg=..., + ) + ``` diff --git a/docs/en/advanced_guides/customize_pipeline.md b/docs/en/advanced_guides/customize_pipeline.md new file mode 100644 index 0000000000..719f806d3f --- /dev/null +++ b/docs/en/advanced_guides/customize_pipeline.md @@ -0,0 +1,152 @@ +# Customize Data Pipeline + +In this tutorial, we will introduce some methods about how to build the data pipeline (i.e., data transformations)for your tasks. + +- [Customize Data Pipeline](#customize-data-pipeline) + - [Design of Dataset and Data pipelines](#design-of-dataset-and-data-pipelines) + - [Modify the training/test pipeline](#modify-the-training/test-pipeline) + - [Add new data transforms](#add-new-data-transforms) + +## Design of Data pipelines + +The data pipeline means how to process the sample dict when indexing a sample from the dataset. And it +consists of a sequence of data transforms. Each data transform takes a dict as input, processes it, and outputs a dict for the next data transform. + +Here is a data pipeline example for SlowFast training on Kinetics for `VideoDataset`. It first use [`decord`](https://github.com/dmlc/decord) to read the raw videos and randomly sample one video clip (the clip has 32 frames, and the interval between frames is 2). Next it applies the random resized crop and random horizontal flip to all frames. Finally the data shape is formatted as `NCTHW`. + +```python +train_pipeline = [ + dict(type='DecordInit',), + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +``` + +All available data transforms in MMAction2 can be found in the [data transforms docs](mmaction.datasets.transforms). + +## Modify the training/test pipeline + +The data pipeline in MMAction2 is pretty flexible. You can control almost every step of the data +preprocessing from the config file, but on the other hand, you may be confused facing so many options. + +Here is a common practice and guidance for action recognition tasks. + +### Loading + +At the beginning of a data pipeline, we usually need to load videos. 
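For video files, the loading step is the `DecordInit`/`DecordDecode` pair already used in the SlowFast example above; a minimal head of such a pipeline is sketched below (illustrative only, not part of the original file).

```python
train_pipeline = [
    dict(type='DecordInit'),    # open the video file with decord
    dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
    dict(type='DecordDecode'),  # decode only the sampled frame indices
    # ... cropping, resizing, formatting and packing follow as usual
]
```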
But if you already extract the frames, you should use `RawFrameDecode` and change the dataset type to `RawframeDataset`: + +```python +train_pipeline = [ + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] +``` + +If you want to load data from files with special formats or special locations, you can [implement a new loading +transform](#add-new-data-transforms) and add it at the beginning of the data pipeline. + +### Sampling frames and other processing + +During training and testing, we may have different strategies to sample frames from the video. + +For example, during testing of SlowFast, we sample multiple clips uniformly: + +```python +test_pipeline = [ + ... + dict( + type='SampleFrames', + clip_len=32, + frame_interval=2, + num_clips=10, + test_mode=True), + ... +] +``` + +In the above example, 10 clips of 32-frame video clips will be sampled for each video. We use `test_mode=True` to uniformly sample these clips (as opposed to randomly sample during training). + +Another example is that TSN/TSM models sample multiple segments from the video: + +```python +train_pipeline = [ + ... + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), + ... +] +``` + +```{note} +Usually, the data augmentation part in the data pipeline handles only video-wise transforms, but not transforms +like video normalization or mixup/cutmix. It's because we can do image normalization and mixup/cutmix on batch data +to accelerate with GPUs. To configure video normalization and mixup/cutmix, please use the [data preprocessor] +(mmaction.models.utils.data_preprocessor). +``` + +### Formatting + +The formatting is to collect training data from the data information dict and convert these data to +model-friendly format. + +In most cases, you can simply use [`PackActionInputs`](mmaction.datasets.transforms.PackActionInputs), and it will +convert the image in NumPy array format to PyTorch tensor, and pack the ground truth categories information and +other meta information as a dict-like object [`ActionDataSample`](mmaction.structures.ActionDataSample). + +```python +train_pipeline = [ + ... + dict(type='PackActionInputs'), +] +``` + +## Add new data transforms + +1. Write a new data transform in any file, e.g., `my_transform.py`, and place it in + the folder `mmaction/datasets/transforms/`. The data transform class needs to inherit + the [`mmcv.transforms.BaseTransform`](mmcv.transforms.BaseTransform) class and override + the `transform` method which takes a dict as input and returns a dict. + + ```python + from mmcv.transforms import BaseTransform + from mmaction.datasets import TRANSFORMS + + @TRANSFORMS.register_module() + class MyTransform(BaseTransform): + + def transform(self, results): + # Modify the data information dict `results`. + return results + ``` + +2. Import the new class in the `mmaction/datasets/transforms/__init__.py`. + + ```python + ... + from .my_transform import MyTransform + + __all__ = [ + ..., 'MyTransform' + ] + ``` + +3. Use it in config files. + + ```python + train_pipeline = [ + ... + dict(type='MyTransform'), + ... 
+ ] + ``` From 6cc912ba414c1f3b6cdc8cb3ea310ac3bf44d004 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 6 Apr 2023 16:16:10 +0800 Subject: [PATCH 32/36] [doc] add doc for 1.x branch (#2312) --- .gitignore | 2 +- configs/detection/acrn/README.md | 4 +- configs/detection/lfb/README.md | 4 +- configs/detection/slowfast/README.md | 4 +- configs/detection/slowonly/README.md | 4 +- configs/localization/bmn/README.md | 4 +- configs/recognition/c2d/README.md | 4 +- configs/recognition/c3d/README.md | 4 +- configs/recognition/csn/README.md | 4 +- configs/recognition/i3d/README.md | 4 +- configs/recognition/mvit/README.md | 2 +- configs/recognition/omnisource/README.md | 4 +- configs/recognition/r2plus1d/README.md | 4 +- configs/recognition/slowfast/README.md | 4 +- configs/recognition/slowonly/README.md | 4 +- configs/recognition/swin/README.md | 4 +- configs/recognition/tanet/README.md | 4 +- configs/recognition/timesformer/README.md | 4 +- configs/recognition/tin/README.md | 4 +- configs/recognition/tpn/README.md | 4 +- configs/recognition/trn/README.md | 4 +- configs/recognition/tsm/README.md | 4 +- configs/recognition/tsn/README.md | 4 +- ...ed-r50_8xb32-1x1x3-100e_kinetics400-rgb.py | 15 +- configs/recognition/uniformer/README.md | 2 +- configs/recognition/uniformerv2/README.md | 2 +- configs/recognition/videomae/README.md | 2 +- configs/recognition/x3d/README.md | 2 +- configs/recognition_audio/resnet/README.md | 4 +- configs/skeleton/2s-agcn/README.md | 6 +- configs/skeleton/posec3d/README.md | 4 +- .../posec3d/custom_dataset_training.md | 2 +- configs/skeleton/stgcn/README.md | 6 +- configs/skeleton/stgcnpp/README.md | 6 +- docs/en/advanced_guides/customize_dataset.md | 122 ++++++++ docs/en/advanced_guides/customize_logging.md | 163 +++++++++++ docs/en/advanced_guides/customize_models.md | 1 + .../en/advanced_guides/customize_optimizer.md | 11 + docs/en/advanced_guides/customize_pipeline.md | 7 +- docs/en/advanced_guides/dataflow.md | 1 + docs/en/advanced_guides/depoly.md | 0 .../contribution_guide.md | 3 +- docs/en/{notes => get_started}/faq.md | 2 +- .../{ => get_started}/guide_to_framework.md | 0 .../installation.md} | 16 +- docs/en/get_started/overview.md | 97 +++++++ docs/en/get_started/quick_run.md | 221 +++++++++++++++ docs/en/index.rst | 60 ++-- docs/en/merge_docs.sh | 49 +++- docs/en/notes/{projects.md => ecosystem.md} | 2 +- docs/en/notes/pytorch2.0.md | 21 ++ docs/en/stat.py | 144 +++++----- docs/en/supported_datasets.md | 36 +++ docs/en/{user_guides => }/useful_tools.md | 4 +- docs/en/user_guides/2_data_prepare.md | 152 ---------- .../{3_inference.md => Inference.md} | 4 +- .../en/user_guides/{1_config.md => config.md} | 15 +- docs/en/user_guides/prepare_dataset.md | 263 ++++++++++++++++++ .../{4_train_test.md => train_test.md} | 2 +- docs/en/user_guides/visualization.md | 20 -- docs/zh_cn/index.rst | 2 +- docs/zh_cn/user_guides/3_inference.md | 2 +- src/pytorch-sphinx-theme | 1 + tools/visualizations/browse_dataset.py | 8 +- tools/visualizations/vis_scheduler.py | 115 ++++---- 65 files changed, 1264 insertions(+), 419 deletions(-) create mode 100644 docs/en/advanced_guides/customize_dataset.md create mode 100644 docs/en/advanced_guides/customize_logging.md create mode 100644 docs/en/advanced_guides/customize_models.md create mode 100644 docs/en/advanced_guides/dataflow.md create mode 100644 docs/en/advanced_guides/depoly.md rename docs/en/{notes => get_started}/contribution_guide.md (93%) rename docs/en/{notes => get_started}/faq.md (99%) 
rename docs/en/{ => get_started}/guide_to_framework.md (100%) rename docs/en/{get_started.md => get_started/installation.md} (95%) create mode 100644 docs/en/get_started/overview.md create mode 100644 docs/en/get_started/quick_run.md rename docs/en/notes/{projects.md => ecosystem.md} (98%) create mode 100644 docs/en/notes/pytorch2.0.md create mode 100644 docs/en/supported_datasets.md rename docs/en/{user_guides => }/useful_tools.md (98%) delete mode 100644 docs/en/user_guides/2_data_prepare.md rename docs/en/user_guides/{3_inference.md => Inference.md} (95%) rename docs/en/user_guides/{1_config.md => config.md} (98%) create mode 100644 docs/en/user_guides/prepare_dataset.md rename docs/en/user_guides/{4_train_test.md => train_test.md} (99%) delete mode 100644 docs/en/user_guides/visualization.md create mode 160000 src/pytorch-sphinx-theme diff --git a/.gitignore b/.gitignore index 3e40ace4d5..1d637fa156 100644 --- a/.gitignore +++ b/.gitignore @@ -65,7 +65,7 @@ instance/ .scrapy # Sphinx documentation -docs/_build/ +docs/*/_build/ # PyBuilder target/ diff --git a/configs/detection/acrn/README.md b/configs/detection/acrn/README.md index d08efb6d2d..054853c35a 100644 --- a/configs/detection/acrn/README.md +++ b/configs/detection/acrn/README.md @@ -49,7 +49,7 @@ python tools/train.py configs/detection/acrn/slowfast-acrn_kinetics400-pretraine --seed 0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -66,7 +66,7 @@ python tools/test.py configs/detection/acrn/slowfast-acrn_kinetics400-pretrained checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/detection/lfb/README.md b/configs/detection/lfb/README.md index dabb3a1b46..51af1377c8 100644 --- a/configs/detection/lfb/README.md +++ b/configs/detection/lfb/README.md @@ -76,7 +76,7 @@ python tools/train.py configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrain --validate --seed 0 --deterministic ``` -For more details and optional arguments infos, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details and optional arguments infos, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -101,7 +101,7 @@ python tools/test.py configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretraine checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
## Citation diff --git a/configs/detection/slowfast/README.md b/configs/detection/slowfast/README.md index bae71fd040..f82273adcc 100644 --- a/configs/detection/slowfast/README.md +++ b/configs/detection/slowfast/README.md @@ -54,7 +54,7 @@ python tools/train.py configs/detection/slowfast/slowfast_kinetics400-pretrained --seed 0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -71,7 +71,7 @@ python tools/test.py configs/detection/slowfast/slowfast_kinetics400-pretrained- checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/detection/slowonly/README.md b/configs/detection/slowonly/README.md index e8af3d84ea..ff0f7bf641 100644 --- a/configs/detection/slowonly/README.md +++ b/configs/detection/slowonly/README.md @@ -75,7 +75,7 @@ python tools/train.py configs/detection/slowonly/slowonly_kinetics400-pretrained --seed 0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -92,7 +92,7 @@ python tools/test.py configs/detection/slowonly/slowonly_kinetics400-pretrained- checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/localization/bmn/README.md b/configs/localization/bmn/README.md index 2f49330743..ec2f625a95 100644 --- a/configs/localization/bmn/README.md +++ b/configs/localization/bmn/README.md @@ -42,7 +42,7 @@ Train BMN model on ActivityNet features dataset. bash tools/dist_train.sh configs/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.py 2 ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -52,7 +52,7 @@ Test BMN on ActivityNet feature dataset. python3 tools/test.py configs/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.py CHECKPOINT.PTH ``` -For more details, you can refer to the **Testing** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Testing** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
## Citation diff --git a/configs/recognition/c2d/README.md b/configs/recognition/c2d/README.md index 651193dad2..a1b58493f7 100644 --- a/configs/recognition/c2d/README.md +++ b/configs/recognition/c2d/README.md @@ -49,7 +49,7 @@ python tools/train.py configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_ --seed 0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -66,7 +66,7 @@ python tools/test.py configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_k checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/c3d/README.md b/configs/recognition/c3d/README.md index 958119f048..9e2af4229e 100644 --- a/configs/recognition/c3d/README.md +++ b/configs/recognition/c3d/README.md @@ -44,7 +44,7 @@ python tools/train.py configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1 --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -61,7 +61,7 @@ python tools/test.py configs/recognition/c3d_sports1m-pretrained_8xb30-16x1x1-45 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/csn/README.md b/configs/recognition/csn/README.md index 77c3aaf900..b09e365829 100644 --- a/configs/recognition/csn/README.md +++ b/configs/recognition/csn/README.md @@ -52,7 +52,7 @@ python tools/train.py configs/recognition/csn/ircsn_ig65m-pretrained-r152_8xb12- --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -69,7 +69,7 @@ python tools/test.py configs/recognition/csn/ircsn_ig65m-pretrained-r152_8xb12-3 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/i3d/README.md b/configs/recognition/i3d/README.md index e181eaf195..a6e0aebccd 100644 --- a/configs/recognition/i3d/README.md +++ b/configs/recognition/i3d/README.md @@ -51,7 +51,7 @@ python tools/train.py configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-3 --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). 
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -68,7 +68,7 @@ python tools/test.py configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/mvit/README.md b/configs/recognition/mvit/README.md index 15f8723615..33527c8408 100644 --- a/configs/recognition/mvit/README.md +++ b/configs/recognition/mvit/README.md @@ -92,7 +92,7 @@ python tools/test.py configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/omnisource/README.md b/configs/recognition/omnisource/README.md index 64acf52c35..f3397d3bb1 100644 --- a/configs/recognition/omnisource/README.md +++ b/configs/recognition/omnisource/README.md @@ -47,7 +47,7 @@ python tools/train.py configs/recognition/omnisource/slowonly_r50_8xb16-8x8x1-25 We found that the training of this Omnisource model could crash for unknown reasons. If this happens, you can resume training by adding the `--cfg-options resume=True` to the training script. -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -64,7 +64,7 @@ python tools/test.py configs/recognition/omnisource/slowonly_r50_8xb16-8x8x1-256 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/r2plus1d/README.md b/configs/recognition/r2plus1d/README.md index 29a619e696..d9e216f41a 100644 --- a/configs/recognition/r2plus1d/README.md +++ b/configs/recognition/r2plus1d/README.md @@ -45,7 +45,7 @@ python tools/train.py configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_ --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -62,7 +62,7 @@ python tools/test.py configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_k checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
## Citation diff --git a/configs/recognition/slowfast/README.md b/configs/recognition/slowfast/README.md index 3bf1666152..0cd2ccd8d3 100644 --- a/configs/recognition/slowfast/README.md +++ b/configs/recognition/slowfast/README.md @@ -48,7 +48,7 @@ python tools/train.py configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -65,7 +65,7 @@ python tools/test.py configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_ checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/slowonly/README.md b/configs/recognition/slowonly/README.md index bf5ce3781d..78a3e043e3 100644 --- a/configs/recognition/slowonly/README.md +++ b/configs/recognition/slowonly/README.md @@ -57,7 +57,7 @@ python tools/train.py configs/recognition/slowonly/slowonly_r50_8xb16-4x16x1-256 --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -74,7 +74,7 @@ python tools/test.py configs/recognition/slowonly/slowonly_r50_8xb16-4x16x1-256e checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/swin/README.md b/configs/recognition/swin/README.md index 1e6074c4a9..1156c4a679 100644 --- a/configs/recognition/swin/README.md +++ b/configs/recognition/swin/README.md @@ -55,7 +55,7 @@ python tools/train.py configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8 --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -72,7 +72,7 @@ python tools/test.py configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8- checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
## Citation diff --git a/configs/recognition/tanet/README.md b/configs/recognition/tanet/README.md index 1a67a40aa0..a72a7bde4f 100644 --- a/configs/recognition/tanet/README.md +++ b/configs/recognition/tanet/README.md @@ -55,7 +55,7 @@ python tools/train.py configs/recognition/tanet/tanet_imagenet-pretrained-r50_8x --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -72,7 +72,7 @@ python tools/test.py configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/timesformer/README.md b/configs/recognition/timesformer/README.md index df197e0ba9..6d8e148bd8 100644 --- a/configs/recognition/timesformer/README.md +++ b/configs/recognition/timesformer/README.md @@ -47,7 +47,7 @@ python tools/train.py configs/recognition/timesformer/timesformer_divST_8xb8-8x3 --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -64,7 +64,7 @@ python tools/test.py configs/recognition/timesformer/timesformer_divST_8xb8-8x32 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/tin/README.md b/configs/recognition/tin/README.md index 17a30d7b03..abadd02f4f 100644 --- a/configs/recognition/tin/README.md +++ b/configs/recognition/tin/README.md @@ -67,7 +67,7 @@ python tools/train.py configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1 --work-dir work_dirs/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb randomness.seed=0 randomness.deterministic=True ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -84,7 +84,7 @@ python tools/test.py configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x checkpoints/SOME_CHECKPOINT.pth --dump result.json ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
## Citation diff --git a/configs/recognition/tpn/README.md b/configs/recognition/tpn/README.md index 20a488ccb1..cb1af4b6b2 100644 --- a/configs/recognition/tpn/README.md +++ b/configs/recognition/tpn/README.md @@ -58,7 +58,7 @@ python tools/train.py configs/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_k --work-dir work_dirs/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb [--validate --seed 0 --deterministic] ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -75,7 +75,7 @@ python tools/test.py configs/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_ki checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/trn/README.md b/configs/recognition/trn/README.md index 875207dd43..323398acb4 100644 --- a/configs/recognition/trn/README.md +++ b/configs/recognition/trn/README.md @@ -52,7 +52,7 @@ python tools/train.py configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16- --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -69,7 +69,7 @@ python tools/test.py configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/tsm/README.md b/configs/recognition/tsm/README.md index 5e5162de83..97c1b33e34 100644 --- a/configs/recognition/tsm/README.md +++ b/configs/recognition/tsm/README.md @@ -58,7 +58,7 @@ python tools/train.py configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16- --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -75,7 +75,7 @@ python tools/test.py configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
## Citation diff --git a/configs/recognition/tsn/README.md b/configs/recognition/tsn/README.md index 1b6e34fdc1..61a65ace30 100644 --- a/configs/recognition/tsn/README.md +++ b/configs/recognition/tsn/README.md @@ -73,7 +73,7 @@ python tools/train.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32- --seed=0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -90,7 +90,7 @@ python tools/test.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py index 3bea4b9ca7..d48b403c02 100644 --- a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py +++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py @@ -58,7 +58,7 @@ ] train_dataloader = dict( - batch_size=32, + batch_size=4, num_workers=8, persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=True), @@ -99,4 +99,15 @@ # - `enable` means enable scaling LR automatically # or not by default. # - `base_batch_size` = (8 GPUs) x (32 samples per GPU). -auto_scale_lr = dict(enable=False, base_batch_size=256) +auto_scale_lr = dict(enable=True, base_batch_size=256) + +train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=3) +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=10, + by_epoch=True, + milestones=[4, 8], + gamma=0.1) +] diff --git a/configs/recognition/uniformer/README.md b/configs/recognition/uniformer/README.md index 65c224ecc3..ff19fb4fb9 100644 --- a/configs/recognition/uniformer/README.md +++ b/configs/recognition/uniformer/README.md @@ -51,7 +51,7 @@ python tools/test.py configs/recognition/uniformer/uniformer-small_imagenet1k-pr checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/uniformerv2/README.md b/configs/recognition/uniformerv2/README.md index c69b69a662..73855f13f0 100644 --- a/configs/recognition/uniformerv2/README.md +++ b/configs/recognition/uniformerv2/README.md @@ -93,7 +93,7 @@ python tools/test.py configs/recognition/uniformerv2/uniformerv2-base-p16-res224 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). 
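The TSN Kinetics-400 config modified earlier in this patch (batch size 32 to 4, `auto_scale_lr.enable=True`, a 10-epoch `MultiStepLR` schedule) can be sanity-checked with a little arithmetic. The sketch below is not part of the patch; it assumes the linear scaling rule applied by MMEngine's `auto_scale_lr`, and the base learning rate of 0.01 is an assumed value taken from the inherited base config:

```python
# Not part of the patch: back-of-the-envelope check of the modified TSN schedule.
# Assumptions: base optimizer lr = 0.01 (from the inherited base config) and the
# linear scaling rule lr = base_lr * actual_batch_size / base_batch_size.
base_lr = 0.01
base_batch_size = 256      # 8 GPUs x 32 samples, as stated in the config comment
actual_batch_size = 1 * 4  # e.g. a single GPU with the new batch_size=4

scaled_lr = base_lr * actual_batch_size / base_batch_size
print(f'scaled lr: {scaled_lr:.4e}')  # 1.5625e-04

# MultiStepLR(milestones=[4, 8], gamma=0.1) then decays the scaled lr by epoch:
for epoch in range(10):
    decay = 0.1 ** sum(epoch >= m for m in (4, 8))
    print(f'epoch {epoch}: lr = {scaled_lr * decay:.4e}')
```

This matches the `lr: 1.5625e-04` values that appear in the quick-run training logs added later in this patch.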
## Citation diff --git a/configs/recognition/videomae/README.md b/configs/recognition/videomae/README.md index 65b353aff1..16cffc4840 100644 --- a/configs/recognition/videomae/README.md +++ b/configs/recognition/videomae/README.md @@ -47,7 +47,7 @@ python tools/test.py configs/recognition/videomae/vit-base-p16_videomae-k400-pre checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition/x3d/README.md b/configs/recognition/x3d/README.md index a0b9a6f3f4..88d4be33e5 100644 --- a/configs/recognition/x3d/README.md +++ b/configs/recognition/x3d/README.md @@ -47,7 +47,7 @@ python tools/test.py configs/recognition/x3d/x3d_s_13x6x1_facebook-kinetics400-r checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/recognition_audio/resnet/README.md b/configs/recognition_audio/resnet/README.md index be036d149e..f74f5c6ccc 100644 --- a/configs/recognition_audio/resnet/README.md +++ b/configs/recognition_audio/resnet/README.md @@ -46,7 +46,7 @@ python tools/train.py configs/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100 --cfg-options randomness.seed=0 randomness.deterministic=True ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -63,7 +63,7 @@ python tools/test.py configs/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/skeleton/2s-agcn/README.md b/configs/skeleton/2s-agcn/README.md index c61b6fe4e3..69ac0d5526 100644 --- a/configs/skeleton/2s-agcn/README.md +++ b/configs/skeleton/2s-agcn/README.md @@ -41,7 +41,7 @@ In skeleton-based action recognition, graph convolutional networks (GCNs), which | | four-stream | | | 90.89 | | | | | | | 1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size, and the original batch size. -2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/user_guides/useful_tools.md#multi-stream-fusion). +2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. 
For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/advanced_guides/useful_tools.md#multi-stream-fusion). ## Train @@ -58,7 +58,7 @@ python tools/train.py configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu6 --seed 0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -75,7 +75,7 @@ python tools/test.py configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/skeleton/posec3d/README.md b/configs/skeleton/posec3d/README.md index 0e45528345..93b526e5ac 100644 --- a/configs/skeleton/posec3d/README.md +++ b/configs/skeleton/posec3d/README.md @@ -101,7 +101,7 @@ python tools/train.py configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-k For training with your custom dataset, you can refer to [Custom Dataset Training](/configs/skeleton/posec3d/custom_dataset_training.md). -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -118,7 +118,7 @@ python tools/test.py configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-ke checkpoints/SOME_CHECKPOINT.pth ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/skeleton/posec3d/custom_dataset_training.md b/configs/skeleton/posec3d/custom_dataset_training.md index cb5b2f647f..81fc1cb3e1 100644 --- a/configs/skeleton/posec3d/custom_dataset_training.md +++ b/configs/skeleton/posec3d/custom_dataset_training.md @@ -2,7 +2,7 @@ We provide a step-by-step tutorial on how to train your custom dataset with PoseC3D. -1. First, you should know that action recognition with PoseC3D requires skeleton information only and for that you need to prepare your custom annotation files (for training and validation). To start with, you need to replace the placeholder `mmdet_root` and `mmpose_root` in `ntu_pose_extraction.py` with your installation path. Then you need to take advantage of [ntu_pose_extraction.py](https://github.com/open-mmlab/mmaction2/blob/90fc8440961987b7fe3ee99109e2c633c4e30158/tools/data/skeleton/ntu_pose_extraction.py) as shown in [Prepare Annotations](https://github.com/open-mmlab/mmaction2/blob/master/tools/data/skeleton/README.md#prepare-annotations) to extract 2D keypoints for each video in your custom dataset. The command looks like (assuming the name of your video is `some_video_from_my_dataset.mp4`): +1. First, you should know that action recognition with PoseC3D requires skeleton information only and for that you need to prepare your custom annotation files (for training and validation). To start with, you need to install MMDetection and MMPose. 
Then you need to take advantage of [ntu_pose_extraction.py](https://github.com/open-mmlab/mmaction2/blob/90fc8440961987b7fe3ee99109e2c633c4e30158/tools/data/skeleton/ntu_pose_extraction.py) as shown in [Prepare Annotations](https://github.com/open-mmlab/mmaction2/blob/master/tools/data/skeleton/README.md#prepare-annotations) to extract 2D keypoints for each video in your custom dataset. The command looks like (assuming the name of your video is `some_video_from_my_dataset.mp4`): ```shell # You can use the above command to generate pickle files for all of your training and validation videos. diff --git a/configs/skeleton/stgcn/README.md b/configs/skeleton/stgcn/README.md index dee9f46dfb..c8d23a1a05 100644 --- a/configs/skeleton/stgcn/README.md +++ b/configs/skeleton/stgcn/README.md @@ -63,7 +63,7 @@ Dynamics of human body skeletons convey significant information for human action | | four-stream | | | 86.19 | | | | | | | 1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size, and the original batch size. -2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/user_guides/useful_tools.md#multi-stream-fusion). +2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/advanced_guides/useful_tools.md#multi-stream-fusion). ## Train @@ -80,7 +80,7 @@ python tools/train.py configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xs --seed 0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -97,7 +97,7 @@ python tools/test.py configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsu checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/configs/skeleton/stgcnpp/README.md b/configs/skeleton/stgcnpp/README.md index 655b067a60..3eec28036c 100644 --- a/configs/skeleton/stgcnpp/README.md +++ b/configs/skeleton/stgcnpp/README.md @@ -35,7 +35,7 @@ We present PYSKL: an open-source toolbox for skeleton-based action recognition b | | four-stream | | | 91.87 | | | | | | | 1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size, and the original batch size. -2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. 
For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/user_guides/useful_tools.md#multi-stream-fusion). +2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/advanced_guides/useful_tools.md#multi-stream-fusion). ## Train @@ -52,7 +52,7 @@ python tools/train.py configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu6 --seed 0 --deterministic ``` -For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Test @@ -69,7 +69,7 @@ python tools/test.py configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60 checkpoints/SOME_CHECKPOINT.pth --dump result.pkl ``` -For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md). +For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md). ## Citation diff --git a/docs/en/advanced_guides/customize_dataset.md b/docs/en/advanced_guides/customize_dataset.md new file mode 100644 index 0000000000..31a6e16b2b --- /dev/null +++ b/docs/en/advanced_guides/customize_dataset.md @@ -0,0 +1,122 @@ +# Customize Datasets + +In this tutorial, we will introduce some methods about how to customize your own dataset by online conversion. + +- [Customize Datasets](#customize-datasets) + - [General understanding of the Dataset in MMAction2](#general-understanding-of-the-dataset-in-mmaction2) + - [Customize new datasets](#customize-new-datasets) + - [Customize keypoint format for PoseDataset](#customize-keypoint-format-for-posedataset) + +## General understanding of the Dataset in MMAction2 + +MMAction2 provides specific Dataset class according to the task, e.g. `VideoDataset`/`RawframeDataset` for action recognition, `AVADataset` for spatio-temporal action detection, `PoseDataset` for skeleton-based action recognition. All these specific datasets only need to implement `get_data_info(self, idx)` to build a data list from the annotation file, while other functions are handled by the superclass. The following table shows the inherent relationship and the main function of the modules. + +| Class Name | Functions | +| ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| MMAction2::VideoDataset | `load_data_list(self)`
Build data list from the annotation file. | +| MMAction2::BaseActionDataset | `get_data_info(self, idx)`
Given the `idx`, return the corresponding data sample from data list | +| MMEngine::BaseDataset | `__getitem__(self, idx)`
Given the `idx`, call `get_data_info` to get data sample, then call the `pipeline` to perform transforms and augmentation in `train_pipeline` or `val_pipeline` | + +## Customize new datasets + +For most scenarios, we don't need to customize a new dataset class, offline conversion is recommended way to use your data. But customizing a new dataset class is also easy in MMAction2. As above mentioned, a dataset for a specific task usually only needs to implement `load_data_list(self)` to generate the data list from the annotation file. It is worth noting that elements in the `data_list` are `dict` with fields required in the following pipeline. + +Take `VideoDataset` as an example, `train_pipeline`/`val_pipeline` requires `'filename'` in `DecordInit` and `'label'` in `PackActionInput`, so data samples in the data list have 2 fields: `'filename'` and `'label'`. +you can refer to [customize pipeline](customize_pipeline.md) for more details about the pipeline. + +``` +data_list.append(dict(filename=filename, label=label)) +``` + +While `AVADataset` is more complex, elements in the data list consist of several fields about video data, and it further overwrites `get_data_info(self, idx)` to convert keys, which are required in spatio-temporal action detection pipeline. + +```python + +class AVADataset(BaseActionDataset): + ... + + def load_data_list(self) -> List[dict]: + ... + video_info = dict( + frame_dir=frame_dir, + video_id=video_id, + timestamp=int(timestamp), + img_key=img_key, + shot_info=shot_info, + fps=self._FPS, + ann=ann) + data_list.append(video_info) + data_list.append(video_info) + return data_list + + def get_data_info(self, idx: int) -> dict: + ... + ann = data_info.pop('ann') + data_info['gt_bboxes'] = ann['gt_bboxes'] + data_info['gt_labels'] = ann['gt_labels'] + data_info['entity_ids'] = ann['entity_ids'] + return data_info +``` + +## Customize keypoint format for PoseDataset + +MMAction2 currently supports three kinds of keypoint formats: `coco`, `nturgb+d` and `openpose`. If your use one of them, just specify the corresponding format in the following modules: + +For Graph Convolutional Networks, such as AAGCN, STGCN... + +- transform: argument `dataset` in `JointToBone`. +- backbone: argument `graph_cfg` in Graph Convolutional Networks. + +And for PoseC3D: + +- transform: In `Flip`, specify `left_kp` and `right_kp` according to the keypoint symmetrical relationship, or remove the transform for asymmetric keypoints structure. +- transform: In `GeneratePoseTarget`, specify `skeletons`, `left_limb`, `right_limb` if `with_limb` is `true`, and `left_kp`, `right_kp` if `with_kp` is `true`. + +For a custom format, you need to add a new graph layout into models and transforms, which defines the keypoints and their connection relationship. + +Take the coco dataset as an example, we define a layout named `coco` in `Graph`, and set its `inward` as followed, which includes all connections between nodes, each connection is a pair of nodes from far to near. The order of connections does not matter. Other settings about coco are to set the number of nodes to 17, and set node 0 as the center node. + +```python + +self.num_node = 17 +self.inward = [(15, 13), (13, 11), (16, 14), (14, 12), (11, 5), + (12, 6), (9, 7), (7, 5), (10, 8), (8, 6), (5, 0), + (6, 0), (1, 0), (3, 1), (2, 0), (4, 2)] +self.center = 0 +``` + +Similarly, we define the `pairs` in `JointToBone`, adding a bone of `(0, 0)` to align the number of bones to the nodes. 
The `pairs` of coco dataset is as followed, same as above mentioned, the order of pairs does not matter. + +```python + +self.pairs = ((0, 0), (1, 0), (2, 0), (3, 1), (4, 2), (5, 0), + (6, 0), (7, 5), (8, 6), (9, 7), (10, 8), (11, 0), + (12, 0), (13, 11), (14, 12), (15, 13), (16, 14)) +``` + +For your custom format, just define the above setting as your graph structure, and specify in your config file as followed, we take `STGCN` as an example, assuming you already define a `custom_dataset` in `Graph` and `JointToBone`, and num_classes is n. + +```python + +model = dict( + type='RecognizerGCN', + backbone=dict( + type='STGCN', graph_cfg=dict(layout='custom_dataset', mode='stgcn_spatial')), + cls_head=dict(type='GCNHead', num_classes=n, in_channels=256)) + +train_pipeline = [ + ... + dict(type='GenSkeFeat', dataset='custom_dataset'), + ...] + +val_pipeline = [ + ... + dict(type='GenSkeFeat', dataset='custom_dataset'), + ...] + +test_pipeline = [ + ... + dict(type='GenSkeFeat', dataset='custom_dataset'), + ...] + +``` diff --git a/docs/en/advanced_guides/customize_logging.md b/docs/en/advanced_guides/customize_logging.md new file mode 100644 index 0000000000..aabaad949f --- /dev/null +++ b/docs/en/advanced_guides/customize_logging.md @@ -0,0 +1,163 @@ +# Customize Logging + +MMAction2 produces a lot of logs during the running process, such as loss, iteration time, learning rate, etc. In this section, we will introduce you how to output custom log. More details about the logging system, please refer to [MMEngine](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/logging.html). + +- [Customize Logging](#customize-logging) + - [Flexible Logging System](#flexible-logging-system) + - [Customize log](#customize-log) + - [Export the debug log](#export-the-debug-log) + +## Flexible Logging System + +MMAction2 configures the logging system by LogProcessor in [default_runtime](/configs/_base_/default_runtime.py) in default, which is equivalent to: + +```python +log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True) +``` + +Defaultly, LogProcessor catches all filed start with `loss` return by `model.forward`. For example in the following model, `loss1` and `loss2` will be logged automatically without additional configuration. + +```python +from mmengine.model import BaseModel + +class ToyModel(BaseModel): + def __init__(self) -> None: + super().__init__() + self.linear = nn.Linear(1, 1) + + def forward(self, img, label, mode): + feat = self.linear(img) + loss1 = (feat - label).pow(2) + loss2 = (feat - label).abs() + return dict(loss1=loss1, loss2=loss2) +``` + +The format of the output log is as followed: + +``` +08/21 02:58:41 - mmengine - INFO - Epoch(train) [1][10/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0019 data_time: 0.0004 loss1: 0.8381 loss2: 0.9007 loss: 1.7388 +08/21 02:58:41 - mmengine - INFO - Epoch(train) [1][20/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0029 data_time: 0.0010 loss1: 0.1978 loss2: 0.4312 loss: 0.6290 +``` + +LogProcessor will output the log in the following format: + +- The prefix of the log: + - epoch mode(`by_epoch=True`): `Epoch(train) [{current_epoch}/{current_iteration}]/{dataloader_length}` + - iteration mode(`by_epoch=False`): `Iter(train) [{current_iteration}/{max_iteration}]` +- Learning rate (`lr`): The learning rate of the last iteration. +- Time: + - `time`: The averaged time for inference of the last `window_size` iterations. + - `data_time`: The averaged time for loading data of the last `window_size` iterations. 
+ - `eta`: The estimated time of arrival to finish the training. +- Loss: The averaged loss output by model of the last `window_size` iterations. + +```{warning} +log_processor outputs the epoch based log by default(`by_epoch=True`). To get an expected log matched with the `train_cfg`, we should set the same value for `by_epoch` in `train_cfg` and `log_processor`. +``` + +Based on the rules above, the code snippet will count the average value of the loss1 and the loss2 every 20 iterations. More types of statistical methods, please refer to [MMEngine.LogProcessor](mmengine.runner.LogProcessor). + +## Customize log + +The logging system could not only log the loss, lr, .etc but also collect and output the custom log. For example, if we want to statistic the intermediate loss: + +The `ToyModel` calculate `loss_tmp` in forward, but don't save it into the return dict. + +```python +from mmengine.logging import MessageHub + +class ToyModel(BaseModel): + + def __init__(self) -> None: + super().__init__() + self.linear = nn.Linear(1, 1) + + def forward(self, img, label, mode): + feat = self.linear(img) + loss_tmp = (feat - label).abs() + loss = loss_tmp.pow(2) + + message_hub = MessageHub.get_current_instance() + # update the intermediate `loss_tmp` in the message hub + message_hub.update_scalar('train/loss_tmp', loss_tmp.sum()) + return dict(loss=loss) +``` + +Add the `loss_tmp` into the config: + +```python +log_processor = dict( + type='LogProcessor', + window_size=20, + by_epoch=True, + custom_cfg=[ + # statistic the loss_tmp with the averaged value + dict( + data_src='loss_tmp', + window_size=20, + method_name='mean') + ]) +``` + +The `loss_tmp` will be added to the output log: + +``` +08/21 03:40:31 - mmengine - INFO - Epoch(train) [1][10/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0026 data_time: 0.0008 loss_tmp: 0.0097 loss: 0.0000 +08/21 03:40:31 - mmengine - INFO - Epoch(train) [1][20/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0028 data_time: 0.0013 loss_tmp: 0.0065 loss: 0.0000 +``` + +## Export the debug log + +To export the debug log to the `work_dir`, you can set log_level in config file as followed: + +``` +log_level='DEBUG' +``` + +``` +08/21 18:16:22 - mmengine - DEBUG - Get class `LocalVisBackend` from "vis_backend" registry in "mmengine" +08/21 18:16:22 - mmengine - DEBUG - An `LocalVisBackend` instance is built from registry, its implementation can be found in mmengine.visualization.vis_backend +08/21 18:16:22 - mmengine - DEBUG - Get class `RuntimeInfoHook` from "hook" registry in "mmengine" +08/21 18:16:22 - mmengine - DEBUG - An `RuntimeInfoHook` instance is built from registry, its implementation can be found in mmengine.hooks.runtime_info_hook +08/21 18:16:22 - mmengine - DEBUG - Get class `IterTimerHook` from "hook" registry in "mmengine" +... +``` + +Besides, logs of different ranks will be saved in `debug` mode if you are training your model with the shared storage. The hierarchy of the log is as follows: + +```text +./tmp +├── tmp.log +├── tmp_rank1.log +├── tmp_rank2.log +├── tmp_rank3.log +├── tmp_rank4.log +├── tmp_rank5.log +├── tmp_rank6.log +└── tmp_rank7.log +... +└── tmp_rank63.log +``` + +The log of Multiple machines with independent storage: + +```text +# device: 0: +work_dir/ +└── exp_name_logs + ├── exp_name.log + ├── exp_name_rank1.log + ├── exp_name_rank2.log + ├── exp_name_rank3.log + ... + └── exp_name_rank7.log + +# device: 7: +work_dir/ +└── exp_name_logs + ├── exp_name_rank56.log + ├── exp_name_rank57.log + ├── exp_name_rank58.log + ... 
+ └── exp_name_rank63.log +``` diff --git a/docs/en/advanced_guides/customize_models.md b/docs/en/advanced_guides/customize_models.md new file mode 100644 index 0000000000..3d8c0e1d4e --- /dev/null +++ b/docs/en/advanced_guides/customize_models.md @@ -0,0 +1 @@ +# Customize Models diff --git a/docs/en/advanced_guides/customize_optimizer.md b/docs/en/advanced_guides/customize_optimizer.md index d69aa0ff90..d862b9632c 100644 --- a/docs/en/advanced_guides/customize_optimizer.md +++ b/docs/en/advanced_guides/customize_optimizer.md @@ -4,8 +4,19 @@ In this tutorial, we will introduce some methods about how to build the optimize - [Customize Optimizer](#customize-optimizer) - [Build optimizers using optim_wrapper](#build-optimizers-using-optim_wrapper) + - [Use optimizers supported by PyTorch](#use-optimizers-supported-by-pytorch) + - [Parameter-wise finely configuration](#parameter-wise-finely-configuration) + - [Gradient clipping](#gradient-clipping) + - [Gradient accumulation](#gradient-accumulation) - [Customize parameter schedules](#customize-parameter-schedules) + - [Customize learning rate schedules](#customize-learning-rate-schedules) + - [Customize momentum schedules](#customize-momentum-schedules) - [Add new optimizers or constructors](#add-new-optimizers-or-constructors) + - [Add new optimizers](#add-new-optimizers) + - [1. Implement a new optimizer](#1-implement-a-new-optimizer) + - [2. Import the optimizer](#2-import-the-optimizer) + - [3. Specify the optimizer in the config file](#3-specify-the-optimizer-in-the-config-file) + - [Add new optimizer constructors](#add-new-optimizer-constructors) ## Build optimizers using optim_wrapper diff --git a/docs/en/advanced_guides/customize_pipeline.md b/docs/en/advanced_guides/customize_pipeline.md index 719f806d3f..632216ba10 100644 --- a/docs/en/advanced_guides/customize_pipeline.md +++ b/docs/en/advanced_guides/customize_pipeline.md @@ -3,8 +3,11 @@ In this tutorial, we will introduce some methods about how to build the data pipeline (i.e., data transformations)for your tasks. - [Customize Data Pipeline](#customize-data-pipeline) - - [Design of Dataset and Data pipelines](#design-of-dataset-and-data-pipelines) - - [Modify the training/test pipeline](#modify-the-training/test-pipeline) + - [Design of Data pipelines](#design-of-data-pipelines) + - [Modify the training/test pipeline](#modify-the-trainingtest-pipeline) + - [Loading](#loading) + - [Sampling frames and other processing](#sampling-frames-and-other-processing) + - [Formatting](#formatting) - [Add new data transforms](#add-new-data-transforms) ## Design of Data pipelines diff --git a/docs/en/advanced_guides/dataflow.md b/docs/en/advanced_guides/dataflow.md new file mode 100644 index 0000000000..0cc136162a --- /dev/null +++ b/docs/en/advanced_guides/dataflow.md @@ -0,0 +1 @@ +# Dataflow in MMAction2 diff --git a/docs/en/advanced_guides/depoly.md b/docs/en/advanced_guides/depoly.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/en/notes/contribution_guide.md b/docs/en/get_started/contribution_guide.md similarity index 93% rename from docs/en/notes/contribution_guide.md rename to docs/en/get_started/contribution_guide.md index f9d96c75a5..02f2aa35d4 100644 --- a/docs/en/notes/contribution_guide.md +++ b/docs/en/get_started/contribution_guide.md @@ -1,10 +1,11 @@ -# Contributing to MMAction2 +# How to contribute to MMAction2 All kinds of contributions are welcome, including but not limited to the following. 
- Fixes (typo, bugs) - New features and components - Add documentation or translate the documentation into other languages +- Add new project (Recommended) about video understanding algorithm with less restriction, refer to [here](/projects/README.md) for details ## Workflow diff --git a/docs/en/notes/faq.md b/docs/en/get_started/faq.md similarity index 99% rename from docs/en/notes/faq.md rename to docs/en/get_started/faq.md index 4f028d5b4c..2cbe7787b3 100644 --- a/docs/en/notes/faq.md +++ b/docs/en/get_started/faq.md @@ -88,7 +88,7 @@ If the contents here do not cover your issue, please create an issue using the [ - **How to set `load_from` value in config files to finetune models?** - In MMAction2, We set `load_from=None` as default in `configs/_base_/default_runtime.py` and owing to [inheritance design](/docs/en/user_guides/1_config.md), + In MMAction2, We set `load_from=None` as default in `configs/_base_/default_runtime.py` and owing to [inheritance design](/docs/en/user_guides/config.md), users can directly change it by setting `load_from` in their configs. ## Testing diff --git a/docs/en/guide_to_framework.md b/docs/en/get_started/guide_to_framework.md similarity index 100% rename from docs/en/guide_to_framework.md rename to docs/en/get_started/guide_to_framework.md diff --git a/docs/en/get_started.md b/docs/en/get_started/installation.md similarity index 95% rename from docs/en/get_started.md rename to docs/en/get_started/installation.md index 0f0ac1c5ec..9d48be6030 100644 --- a/docs/en/get_started.md +++ b/docs/en/get_started/installation.md @@ -1,4 +1,6 @@ -# Prerequisites +# Installation + +## Prerequisites In this section we demonstrate how to prepare an environment with PyTorch. @@ -35,12 +37,10 @@ On CPU platforms: conda install pytorch torchvision cpuonly -c pytorch ``` -# Installation +## Best Practices We recommend that users follow our best practices to install MMAction2. However, the whole process is highly customizable. See [Customize Installation](#customize-installation) section for more information. -## Best Practices - **Step 1.** Install [MMEngine](https://github.com/open-mmlab/mmengine) and [MMCV](https://github.com/open-mmlab/mmcv) using [MIM](https://github.com/open-mmlab/mim). ```shell @@ -52,10 +52,10 @@ mim install mmengine 'mmcv>=2.0.0rc1' According to your needs, we support two install modes: -- [Install from source (Recommended)](#install-from-source): You want to develop your own action recognition task or new features on MMAction2 framework. For example, adding new dataset or new models. Thus, you can use all tools we provided. +- [Install from source (Recommended)](#build-mmaction2-from-source): You want to develop your own action recognition task or new features on MMAction2 framework. For example, adding new dataset or new models. Thus, you can use all tools we provided. - [Install as a Python package](#install-as-a-python-package): You just want to call MMAction2's APIs or import MMAction2's modules in your project. -### Install from source +### Build MMAction2 from source In this case, install mmaction2 from source: @@ -193,3 +193,7 @@ Run it with ```shell docker run --gpus all --shm-size=8g -it -v {DATA_DIR}:/mmaction2/data mmaction2 ``` + +## Troubleshooting + +coming soon... 
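Since the Troubleshooting section is still a placeholder, a quick post-installation sanity check can be handy. The snippet below is an illustrative aside (not part of this patch) that only assumes the packages installed above:

```python
# Illustrative post-installation check: print library versions and CUDA visibility.
import torch
import mmcv
import mmengine
import mmaction

print('mmaction2:', mmaction.__version__)
print('mmcv     :', mmcv.__version__)
print('mmengine :', mmengine.__version__)
print('torch    :', torch.__version__, '| CUDA available:', torch.cuda.is_available())
```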
diff --git a/docs/en/get_started/overview.md b/docs/en/get_started/overview.md new file mode 100644 index 0000000000..4857991711 --- /dev/null +++ b/docs/en/get_started/overview.md @@ -0,0 +1,97 @@ +# Overview + +## What is MMAction2 + +MMAction2 is an open source toolkit based on PyTorch, supporting numerous video understanding models, including action recognition, skeleton-based action recognition, spatio-temporal action detection and temporal action localization. In addition, it supports widely-used academic datasets and provides many useful tools, assisting users in exploring various aspects of models and datasets and implementing high-quality algorithms. Generally, it has the following features. + +One-stop, Multi-model: MMAction2 supports various video understanding tasks and implements the latest models for action recognition, localization, detection. + +Modular Design: MMAction2’s modular design allows users to define and reuse modules in the model on demand. + +Various Useful Tools: MMAction2 provides many analysis tools, including visualizers, validation scripts, evaluators, etc., to help users troubleshoot, finetune or compare models. + +Powered by OpenMMLab: Like other algorithm libraries in OpenMMLab family, MMAction2 follows OpenMMLab’s rigorous development guidelines and interface conventions, significantly reducing the learning cost of users familiar with other projects in OpenMMLab family. In addition, benefiting from the unified interfaces among OpenMMLab, you can easily call the models implemented in other OpenMMLab projects (e.g. MMClassification) in MMAction2, facilitating cross-domain research and real-world applications. + + + + +
+ *(Demo GIFs: Action Recognition, Skeleton-based Action Recognition, Spatio-Temporal Action Detection, Spatio-Temporal Action Detection)*
+ +## How to use the documentation + +We have prepared a wealth of documents to meet your various needs: + +
+For the basic usage of MMAction2 + +- [Installation](docs/en/get_started/installation.md) +- [Quick Run](docs/en/get_started/quick_run.md) +- [Inference](docs/en/user_guides/Inference.md) + +
+ +
+For training on supported datasets + - [Learn about configs](docs/en/user_guides/config.md) + - [Prepare dataset](docs/en/user_guides/prepare_dataset.md) + - [Training and testing](docs/en/user_guides/train_test.md) + +
+ +
+For looking for some common issues + +- [FAQs](docs/en/get_started/faq.md) +- [Useful tools](docs/en/useful_tools.md) + +
+ +
+For a general understanding about MMAction2 + +- [20-minute tour to MMAction2](docs/en/get_started/20-minute_tour.md) +- [Data flow in MMAction2](docs/en/advanced_guides/dataflow.md) + +
+ +
+For advanced usage about custom training + +- [Customize models](docs/en/advanced_guides/customize_models.md) +- [Customize datasets](docs/en/advanced_guides/customize_dataset.md) +- [Customize data transformation and augmentation](docs/en/advanced_guides/customize_pipeline.md) +- [Customize optimizer and scheduler](docs/en/advanced_guides/customize_optimizer.md) +- [Customize logging](docs/en/advanced_guides/customize_logging.md) + +
+ +
+For supported model zoo and dataset zoo + +- [Model Zoo](model_zoo/modelzoo.md) +- [Dataset Zoo](datasetzoo.md) + +
+ +
+For migration from MMAction2 0.x + +- [Migration](migration.md) + +
+ +
+For researchers and developers who are willing to contribute to MMAction2 + +- [Contribution Guide](get_started/contribution_guide.md) + +
diff --git a/docs/en/get_started/quick_run.md b/docs/en/get_started/quick_run.md new file mode 100644 index 0000000000..84ae5b985f --- /dev/null +++ b/docs/en/get_started/quick_run.md @@ -0,0 +1,221 @@ +# Quick Run + +This chapter will take you through the basic functions of MMAction2. And we assume you [installed MMAction2 from source](../installation#best-practices). + +- [Quick Run](#quick-run) + - [Inference](#inference) + - [Prepare a Dataset](#prepare-a-dataset) + - [Modify the Config](#modify-the-config) + - [Modify Dataset](#modify-dataset) + - [Modify Runtime Config](#modify-runtime-config) + - [Modify Model Config](#modify-model-config) + - [Browse the Dataset](#browse-the-dataset) + - [Training](#training) + - [Testing](#testing) + +## Inference + +Run the following in MMAction2's root directory: + +```shell +python demo/demo_inferencer.py demo/demo.mp4 \ + --rec tsn --print-result \ + --label-file tools/data/kinetics/label_map_k400.txt +``` + +You should be able to see a pop-up video and the inference result printed out in the console. + +
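For readers who prefer the Python API to the CLI demo above, roughly the same prediction can be obtained with the recognizer helpers in `mmaction.apis`. This is a sketch, not part of the patch; the checkpoint path is a placeholder, and any recognizer config/checkpoint pair from the model zoo should work:

```python
# Sketch only: a Python-API counterpart of the CLI demo (paths are placeholders).
from mmaction.apis import inference_recognizer, init_recognizer

config = 'configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py'
checkpoint = 'checkpoints/SOME_CHECKPOINT.pth'  # placeholder

model = init_recognizer(config, checkpoint, device='cuda:0')  # or 'cpu'
result = inference_recognizer(model, 'demo/demo.mp4')
print(result)  # a data sample holding the predicted scores and label
```

For reference, the result printed to the console by the CLI demo above looks like the following block.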
+ +```bash +# Inference result +{'predictions': [{'rec_labels': [[6]], 'rec_scores': [[...]]}]} +``` + +```{note} +If you are running MMAction2 on a server without GUI or via SSH tunnel with X11 forwarding disabled, you may not see the pop-up window. +``` + +A detailed description of MMAction2's inference interface can be found [here](/demo/README#inferencer) + +In addition to using our well-provided pre-trained models, you can also train models on your own datasets. In the next section, we will take you through the basic functions of MMAction2 by training TSN on the tiny [Kinetics](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) dataset as an example. + +## Prepare a Dataset + +Since the variety of video dataset formats are not conducive to switching datasets, MMAction2 proposes a uniform [data format](../user_guides/2_data_prepare.md), and provides [dataset preparer](../user_guides/data_prepare/dataset_preparer.md) for commonly used video datasets. Usually, to use those datasets in MMAction2, you just need to follow the steps to get them ready for use. + +```{note} +But here, efficiency means everything. +``` + +Here, we have prepared a lite version of Kinetics dataset for demonstration purposes. Download our pre-prepared [zip](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) and extract it to the `data/` directory under mmaction2 to get our prepared video and annotation file. + +```Bash +wget https://download.openmmlab.com/mmaction/kinetics400_tiny.zip +mkdir -p data/ +unzip kinetics400_tiny.zip -d data/ +``` + +## Modify the Config + +Once the dataset is prepared, we will then specify the location of the training set and the training parameters by modifying the config file. + +In this example, we will train a TSN using resnet50 as its backbone. Since MMAction2 already has a config file for the full Kinetics400 dataset (`configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py`), we just need to make some modifications on top of it. + +### Modify Dataset + +We first need to modify the path to the dataset. Open `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` and replace keys as followed: + +```Python +data_root = 'data/kinetics400_tiny/train' +data_root_val = 'data/kinetics400_tiny/val' +ann_file_train = 'data/kinetics400_tiny/kinetics_tiny_train_video.txt' +ann_file_val = 'data/kinetics400_tiny/kinetics_tiny_val_video.txt' +``` + +### Modify Runtime Config + +Also, because of the reduced dataset size, we'd better reduce training batchsize to 4 and the number of training epochs to 10 accordingly, shorten the validation interval as well as the weight storage interval to 1 rounds, and modify the learning rate decay strategy. Modify corresponding keys in `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` as following lines to take effect. 
+
+```Python
+# set the training batch size to 4
+train_dataloader['batch_size'] = 4
+
+# Save a checkpoint every 3 epochs, and only keep the latest checkpoint
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', interval=3, max_keep_ckpts=1,),
+    )
+# Set the maximum number of epochs to 10, and validate the model every 3 epochs
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=3)
+# adjust the learning rate schedule for the 10-epoch run
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=10,
+        by_epoch=True,
+        milestones=[4, 8],
+        gamma=0.1)
+]
+```
+
+### Modify Model Config
+
+Further, due to the small size of the tiny Kinetics dataset, we had better load a model pre-trained on the original Kinetics dataset. We also need to modify the model according to the actual number of classes. Just directly put the following lines into `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py`.
+
+```Python
+
+model = dict(
+    cls_head=dict(num_classes=2))
+load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth'
+```
+
+Here, we have rewritten the corresponding parameters in the base configuration directly through the inheritance mechanism of the config ({external+mmengine:doc}`MMEngine: Config `). The original fields are distributed in `configs/_base_/models/tsn_r50.py`, `configs/_base_/schedules/sgd_100e.py` and `configs/_base_/default_runtime.py`.
+
+```{note}
+For a more detailed description of config, please refer to [here](../user_guides/config.md).
+```
+
+## Browse the Dataset
+
+Before we start the training, we can also visualize the frames processed by training-time [data transforms](<>). It's quite simple: pass the config file we want to visualize into the [browse_dataset.py](/tools/visualizations/browse_dataset.py) script.
+
+```Bash
+python tools/visualizations/browse_dataset.py \
+    configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \
+    browse_out --mode pipeline
+```
+
+The transformed videos will be saved to the `browse_out` folder.
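+
+If you also want to inspect the raw frames before any transform is applied, the same script can usually be pointed at a different display mode. A minimal sketch, assuming the script supports the `original` mode offered by similar browse tools in other OpenMMLab repositories:
+
+```Bash
+# Save the untransformed frames for comparison
+# (assumes `--mode original` is available in this script).
+python tools/visualizations/browse_dataset.py \
+    configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \
+    browse_out_original --mode original
+```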
+ +
+ +```{note} +For details on the parameters and usage of this script, please refer to [here](../user_guides/useful_tools.md). +``` + +```{tip} +In addition to satisfying our curiosity, visualization can also help us check the parts that may affect the model's performance before training, such as problems in configs, datasets and data transforms. +``` + +we can further visualize the learning rate schedule to make sure that the config is as expected by following script: + +```Bash +python tools/visualizations/vis_scheduler.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py +``` + +The training learning rate schedule will be displayed in a pop-up window. + +
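+
+On a machine without a display (see the earlier note about headless servers), the plot can be written to disk instead of shown in a window. A minimal sketch, assuming the script exposes a `--save-path` option like its counterparts in other OpenMMLab toolboxes:
+
+```Bash
+# Save the learning rate curve to an image file instead of opening a window
+# (assumes `--save-path` is available in this script).
+python tools/visualizations/vis_scheduler.py \
+    configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \
+    --save-path lr_schedule.png
+```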
+ +
+
+```{note}
+The learning rate is automatically scaled according to the actual batch size.
+```
+
+## Training
+
+Start the training by running the following command:
+
+```Bash
+python tools/train.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py
+```
+
+Depending on the system environment, MMAction2 will automatically use the best device for training. If a GPU is available, single-GPU training is started by default. When you start to see the output of the losses, you have successfully started the training.
+
+```Bash
+03/24 16:36:15 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608
+03/24 16:36:15 - mmengine - INFO - Epoch(train) [1][8/8] lr: 1.5625e-04 eta: 0:00:15 time: 0.2151 data_time: 0.0845 memory: 1314 grad_norm: 8.5647 loss: 0.7267 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.7267
+03/24 16:36:16 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608
+03/24 16:36:16 - mmengine - INFO - Epoch(train) [2][8/8] lr: 1.5625e-04 eta: 0:00:12 time: 0.1979 data_time: 0.0717 memory: 1314 grad_norm: 8.4709 loss: 0.7130 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.7130
+03/24 16:36:18 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608
+03/24 16:36:18 - mmengine - INFO - Epoch(train) [3][8/8] lr: 1.5625e-04 eta: 0:00:10 time: 0.1691 data_time: 0.0478 memory: 1314 grad_norm: 8.2910 loss: 0.6900 top1_acc: 0.5000 top5_acc: 1.0000 loss_cls: 0.6900
+03/24 16:36:18 - mmengine - INFO - Saving checkpoint at 3 epochs
+03/24 16:36:19 - mmengine - INFO - Epoch(val) [3][1/1] acc/top1: 0.9000 acc/top5: 1.0000 acc/mean1: 0.9000data_time: 1.2716 time: 1.3658
+03/24 16:36:20 - mmengine - INFO - The best checkpoint with 0.9000 acc/top1 at 3 epoch is saved to best_acc/top1_epoch_3.pth.
+```
+
+Without extra configurations, model weights will be saved to `work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/`, while the logs will be stored in `work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/TIMESTAMP/`. Next, we just need to wait with some patience for training to finish.
+
+```{note}
+For advanced usage of training, such as CPU training, multi-GPU training, and cluster training, please refer to [Training and Testing](../user_guides/train_test.md).
+```
+
+## Testing
+
+After 10 epochs, we observe that TSN performs best in the 6th epoch, with `acc/top1` reaching 1.0000:
+
+```Bash
+03/24 16:36:25 - mmengine - INFO - Epoch(val) [6][1/1] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 1.0000data_time: 1.0210 time: 1.1091
+```
+
+```{note}
+The result is quite high because the model was pre-trained on the original Kinetics400 dataset; you may see a different result.
+```
+
+However, this value only reflects the validation performance of TSN on the mini Kinetics dataset, while test results are usually higher due to the additional augmentation in the test pipeline.
+
+Start testing:
+
+```Bash
+python tools/test.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \
+    work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/best_acc/top1_epoch_6.pth
+```
+
+And you will get outputs like:
+
+```Bash
+03/24 17:00:59 - mmengine - INFO - Epoch(test) [10/10] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 0.9000data_time: 0.0420 time: 1.0795
+```
+
+The model achieves a top-1 accuracy of 1.0000 on this dataset.
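+
+If several GPUs are available, the same evaluation can be launched with the distributed test script. A minimal sketch, assuming 8 GPUs and the standard `tools/dist_test.sh` launcher:
+
+```Bash
+# Distributed testing on 8 GPUs (adjust the last argument to your GPU count).
+bash tools/dist_test.sh configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \
+    work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/best_acc/top1_epoch_6.pth 8
+```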
+ +```{note} +For advanced usage of testing, such as CPU testing, multi-GPU testing, and cluster testing, please refer to [Training and Testing](../user_guides/train_test.md). +``` diff --git a/docs/en/index.rst b/docs/en/index.rst index 392b64ef45..73a4590f00 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -7,24 +7,38 @@ You can switch between Chinese and English documents in the lower-left corner of :maxdepth: 1 :caption: Get Started - get_started.md - guide_to_framework.md + get_started/overview.md + get_started/installation.md + get_started/quick_run.md + get_started/guide_to_framework.md + get_started/contribution_guide.md + get_started/faq.md .. toctree:: :maxdepth: 1 :caption: User Guides - user_guides/1_config.md - user_guides/2_data_prepare.md - user_guides/3_inference.md - user_guides/4_train_test.md + user_guides/Inference.md + user_guides/config.md + user_guides/train_test.md + user_guides/prepare_dataset.md .. toctree:: :maxdepth: 1 - :caption: Useful Tools + :caption: Advanced Guides - user_guides/useful_tools.md - user_guides/visualization.md + advanced_guides/dataflow.md + advanced_guides/customize_models.md + advanced_guides/customize_dataset.md + advanced_guides/customize_pipeline.md + advanced_guides/customize_optimizer.md + advanced_guides/customize_logging.md + advanced_guides/deploy.md + +.. toctree:: + :maxdepth: 1 + :caption: Advanced Guides + useful_tools.md .. toctree:: :maxdepth: 1 @@ -36,20 +50,32 @@ You can switch between Chinese and English documents in the lower-left corner of :maxdepth: 1 :caption: Model Zoo - modelzoo.md - recognition_models.md - detection_models.md - skeleton_models.md - localization_models.md + model_zoo/modelzoo.md + model_zoo/recognition_models.md + model_zoo/detection_models.md + model_zoo/skeleton_models.md + model_zoo/localization_models.md + + +.. toctree:: + :maxdepth: 1 + :caption: Dataset Zoo + + datasetzoo_overview.md + datasetzoo.md + +.. toctree:: + :maxdepth: 1 + :caption: Projects + + projectzoo.md .. toctree:: :maxdepth: 1 :caption: Notes - notes/contribution_guide.md - notes/projects.md + notes/ecosystem.md notes/changelog.md - notes/faq.md .. 
toctree:: :caption: Switch Language diff --git a/docs/en/merge_docs.sh b/docs/en/merge_docs.sh index aa2a9bebfd..5a3c86b7ac 100644 --- a/docs/en/merge_docs.sh +++ b/docs/en/merge_docs.sh @@ -1,8 +1,45 @@ #!/usr/bin/env bash -## gather models -cat ../../configs/localization/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Localization Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > localization_models.md -cat ../../configs/recognition/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Recognition Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > recognition_models.md -cat ../../configs/recognition_audio/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" >> recognition_models.md -cat ../../configs/detection/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Spatio Temporal Action Detection Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > detection_models.md -cat ../../configs/skeleton/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Skeleton-based Action Recognition Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > skeleton_models.md +# gather models +mkdir -p model_zoo +cat ../../configs/localization/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Localization Models' | sed 's/](\/docs\/en/](../g' |sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/localization_models.md +cat ../../configs/recognition/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Recognition Models' | sed 's/](\/docs\/en/](../g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/recognition_models.md +cat ../../configs/recognition_audio/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed 's/](\/docs\/en/](../g' | sed "s/getting_started.html##t/getting_started.html#t/g" >> model_zoo/recognition_models.md +cat ../../configs/detection/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Spatio Temporal Action Detection Models' | sed 's/](\/docs\/en/](../g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/detection_models.md +cat ../../configs/skeleton/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Skeleton-based Action Recognition Models' | sed 's/](\/docs\/en/](../g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/skeleton_models.md + +# gather projects +# TODO: generate table of contents for project zoo +cat ../../projects/README.md > projectzoo.md +cat ../../projects/*/README.md >> projectzoo.md + +# gather datasets +cat supported_datasets.md > datasetzoo.md +cat ../../tools/data/*/README.md | sed 's/# Preparing/# /g' | sed 's/#/#&/' >> datasetzoo.md + +sed -i 's/(\/tools\/data\/activitynet\/README.md/(#activitynet/g' datasetzoo.md +sed -i 's/(\/tools\/data\/kinetics\/README.md/(#kinetics-400600700/g' datasetzoo.md +sed -i 's/(\/tools\/data\/mit\/README.md/(#moments-in-time/g' 
datasetzoo.md
+sed -i 's/(\/tools\/data\/mmit\/README.md/(#multi-moments-in-time/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/sthv1\/README.md/(#something-something-v1/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/sthv2\/README.md/(#something-something-v2/g' datasetzoo.md
+sed -i "s/(\/tools\/data\/thumos14\/README.md/(#thumos14/g" datasetzoo.md
+sed -i 's/(\/tools\/data\/ucf101\/README.md/(#ucf-101/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/ucf101_24\/README.md/(#ucf101-24/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/jhmdb\/README.md/(#jhmdb/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/hvu\/README.md/(#hvu/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/hmdb51\/README.md/(#hmdb51/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/jester\/README.md/(#jester/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/ava\/README.md/(#ava/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/gym\/README.md/(#gym/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/omnisource\/README.md/(#omnisource/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/diving48\/README.md/(#diving48/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/skeleton\/README.md/(#skeleton-dataset/g' datasetzoo.md
+
+cat prepare_data.md >> datasetzoo.md
+
+sed -i 's/](\/docs\/en\//](/g' datasetzoo.md
+sed -i 's/](\/docs\/en\//](/g' changelog.md
+
+sed -i 's/](\/docs\/en\//](..\//g' ./get_started/*.md
+sed -i 's/](\/docs\/en\//](..\//g' ./tutorials/*.md
diff --git a/docs/en/notes/projects.md b/docs/en/notes/ecosystem.md
similarity index 98%
rename from docs/en/notes/projects.md
rename to docs/en/notes/ecosystem.md
index f4bc5ac9e6..73b0fd6aaf 100644
--- a/docs/en/notes/projects.md
+++ b/docs/en/notes/ecosystem.md
@@ -1,4 +1,4 @@
-# Projects based on MMAction2
+# Ecosystem Projects based on MMAction2
 
 There are many research works and projects built on MMAction2. We list some of them as examples of how to extend MMAction2 for your own projects.
diff --git a/docs/en/notes/pytorch2.0.md b/docs/en/notes/pytorch2.0.md
new file mode 100644
index 0000000000..d50101490b
--- /dev/null
+++ b/docs/en/notes/pytorch2.0.md
@@ -0,0 +1,21 @@
+# PyTorch 2.0 Compatibility and Benchmark
+
+PyTorch introduced `torch.compile` in its 2.0 release. It compiles your model to speed up training and validation. We provide benchmark results and compatibility notes for typical models in MMAction2. Except for one model (MViT) that fails to compile, the performance of the other models remains consistent before and after compilation.
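+
+The table below compares runs with and without compilation. Compilation requires no model code change; with a recent MMEngine (version 0.7.0 or later, an assumption of this sketch) it can typically be switched on from the command line:
+
+```Bash
+# Enable torch.compile for a training run (sketch; requires PyTorch >= 2.0
+# and an MMEngine version whose Runner understands the `compile` option).
+python tools/train.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \
+    --cfg-options compile=True
+```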
+ +| Config | compiled | Train time / iter (s) | GPU memory (M) | test metric | +| ------------------------------------------------------------------------- | -------- | --------------------- | -------------- | ------------ | +| tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb | False | 0.50 | 42537 | 36.55 | +| tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb | True | 0.61 | 53149 | 36.72 | +| timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb | False | 0.688 | 14263 | 77.69 | +| timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb | True | 0.691 | 13863 | 77.57 | +| stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d | False | 0.0305 | 1184 | 91.69 | +| stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d | True | 0.0298 | 1273 | 91.64 | +| slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint | False | 0.498 | 9581 | 93.6 | +| slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint | True | 0.505 | 11968 | 93.49 | +| slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb | False | 0.17 | 8278 | 20.76 | +| slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb | True | 0.1835 | 12004 | 21.67 | +| swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb | False | 0.323 | 21651 | 78.90 | +| swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb | True | 0.262 | 20905 | 78.70 | +| slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb | False | 0.098 | 5777 | 75.12 | +| slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb | True | 0.0942 | 7095 | 75.15 | +| mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb | Fail | incompatible | incompatible | incompatible | diff --git a/docs/en/stat.py b/docs/en/stat.py index 80263653dc..b07d123fa8 100644 --- a/docs/en/stat.py +++ b/docs/en/stat.py @@ -16,7 +16,7 @@ def anchor(name): # Count algorithms -files = sorted(glob.glob('*_models.md')) +files = sorted(glob.glob('model_zoo/*_models.md')) # files = sorted(glob.glob('docs/*_models.md')) stats = [] @@ -99,76 +99,76 @@ def anchor(name): {msglist} """ -with open('modelzoo.md', 'w') as f: +with open('model_zoo/modelzoo.md', 'w') as f: f.write(modelzoo) -# # Count datasets -# -# files = ['supported_datasets.md'] -# # files = sorted(glob.glob('docs/tasks/*.md')) -# -# datastats = [] -# -# for f in files: -# with open(f, 'r') as content_file: -# content = content_file.read() -# -# # title -# title = content.split('\n')[0].replace('#', '') -# -# # count papers -# papers = set( -# (papertype, titlecase.titlecase(paper.lower().strip())) -# for (papertype, paper) in re.findall( -# r'\s*\n.*?\btitle\s*=\s*{(.*?)}', -# content, re.DOTALL)) -# # paper links -# revcontent = '\n'.join(list(reversed(content.splitlines()))) -# paperlinks = {} -# for _, p in papers: -# print(p) -# q = p.replace('\\', '\\\\').replace('?', '\\?') -# paperlinks[p] = ', '.join( -# (f'[{p.strip()} ->]({splitext(basename(f))[0]}.html#{anchor(p)})' -# for p in re.findall( -# rf'\btitle\s*=\s*{{\s*{q}\s*}}.*?\n## (.*?)\s*[,;]?\s*\n', -# revcontent, re.DOTALL | re.IGNORECASE))) -# print(' ', paperlinks[p]) -# paperlist = '\n'.join( -# sorted(f' - [{t}] {x} ({paperlinks[x]})' for t, x in papers)) -# -# statsmsg = f""" -# ## [{title}]({f}) -# -# * Number of papers: {len(papers)} -# {paperlist} -# -# """ -# -# datastats.append((papers, configs, ckpts, statsmsg)) -# -# alldatapapers = func.reduce(lambda a, b: a.union(b), -# [p for p, _, _, _ in datastats]) -# -# # Summarize -# -# msglist = '\n'.join(x for _, _, _, x in stats) -# datamsglist = '\n'.join(x for _, _, _, x in datastats) -# 
papertypes, papercounts = np.unique([t for t, _ in alldatapapers], -# return_counts=True) -# countstr = '\n'.join( -# [f' - {t}: {c}' for t, c in zip(papertypes, papercounts)]) -# -# modelzoo = f""" -# # Overview -# -# * Number of papers: {len(alldatapapers)} -# {countstr} -# -# For supported action algorithms, see [modelzoo overview](modelzoo.md). -# -# {datamsglist} -# """ -# -# with open('datasets.md', 'w') as f: -# f.write(modelzoo) +# Count datasets + +files = ['datasetzoo.md'] +# files = sorted(glob.glob('docs/tasks/*.md')) + +datastats = [] + +for f in files: + with open(f, 'r') as content_file: + content = content_file.read() + + # title + title = content.split('\n')[0].replace('#', '') + + # count papers + papers = set( + (papertype, titlecase.titlecase(paper.lower().strip())) + for (papertype, paper) in re.findall( + r'\s*\n.*?\btitle\s*=\s*{(.*?)}', + content, re.DOTALL)) + # paper links + revcontent = '\n'.join(list(reversed(content.splitlines()))) + paperlinks = {} + for _, p in papers: + print(p) + q = p.replace('\\', '\\\\').replace('?', '\\?') + paperlinks[p] = ', '.join( + (f'[{p.strip()} ->]({splitext(basename(f))[0]}.html#{anchor(p)})' + for p in re.findall( + rf'\btitle\s*=\s*{{\s*{q}\s*}}.*?\n## (.*?)\s*[,;]?\s*\n', + revcontent, re.DOTALL | re.IGNORECASE))) + print(' ', paperlinks[p]) + paperlist = '\n'.join( + sorted(f' - [{t}] {x} ({paperlinks[x]})' for t, x in papers)) + + statsmsg = f""" +## [{title}]({f}) + +* Number of papers: {len(papers)} +{paperlist} + + """ + + datastats.append((papers, configs, ckpts, statsmsg)) + +alldatapapers = func.reduce(lambda a, b: a.union(b), + [p for p, _, _, _ in datastats]) + +# Summarize + +msglist = '\n'.join(x for _, _, _, x in stats) +datamsglist = '\n'.join(x for _, _, _, x in datastats) +papertypes, papercounts = np.unique([t for t, _ in alldatapapers], + return_counts=True) +countstr = '\n'.join( + [f' - {t}: {c}' for t, c in zip(papertypes, papercounts)]) + +datasetzoo = f""" +# Overview + +* Number of papers: {len(alldatapapers)} +{countstr} + +For supported action algorithms, see [modelzoo overview](modelzoo.md). + +{datamsglist} +""" + +with open('datasetzoo_overview.md', 'w') as f: + f.write(datasetzoo) diff --git a/docs/en/supported_datasets.md b/docs/en/supported_datasets.md new file mode 100644 index 0000000000..42911fc8ff --- /dev/null +++ b/docs/en/supported_datasets.md @@ -0,0 +1,36 @@ +# Supported Datasets + +- Action Recognition + + - [UCF101](/tools/data/ucf101/README.md) \[ [Homepage](https://www.crcv.ucf.edu/research/data-sets/ucf101/) \]. + - [HMDB51](/tools/data/hmdb51/README.md) \[ [Homepage](https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/) \]. 
+ - [Kinetics-\[400/600/700\]](/tools/data/kinetics/README.md) \[ [Homepage](https://deepmind.com/research/open-source/kinetics) \] + - [Something-Something V1](/tools/data/sthv1/README.md) \[ [Homepage](https://20bn.com/datasets/something-something/v1) \] + - [Something-Something V2](/tools/data/sthv2/README.md) \[ [Homepage](https://20bn.com/datasets/something-something) \] + - [Moments in Time](/tools/data/mit/README.md) \[ [Homepage](http://moments.csail.mit.edu/) \] + - [Multi-Moments in Time](/tools/data/mmit/README.md) \[ [Homepage](http://moments.csail.mit.edu/challenge_iccv_2019.html) \] + - [HVU](/tools/data/hvu/README.md) \[ [Homepage](https://github.com/holistic-video-understanding/HVU-Dataset) \] + - [Jester](/tools/data/jester/README.md) \[ [Homepage](https://developer.qualcomm.com/software/ai-datasets/jester) \] + - [GYM](/tools/data/gym/README.md) \[ [Homepage](https://sdolivia.github.io/FineGym/) \] + - [ActivityNet](/tools/data/activitynet/README.md) \[ [Homepage](http://activity-net.org/) \] + - [Diving48](/tools/data/diving48/README.md) \[ [Homepage](http://www.svcl.ucsd.edu/projects/resound/dataset.html) \] + - [OmniSource](/tools/data/omnisource/README.md) \[ [Homepage](https://kennymckormick.github.io/omnisource/) \] + +- Temporal Action Detection + + - [ActivityNet](/tools/data/activitynet/README.md) \[ [Homepage](http://activity-net.org/) \] + - [THUMOS14](/tools/data/thumos14/README.md) \[ [Homepage](https://www.crcv.ucf.edu/THUMOS14/download.html) \] + +- Spatial Temporal Action Detection + + - [AVA](/tools/data/ava/README.md) \[ [Homepage](https://research.google.com/ava/index.html) \] + - [UCF101-24](/tools/data/ucf101_24/README.md) \[ [Homepage](http://www.thumos.info/download.html) \] + - [JHMDB](/tools/data/jhmdb/README.md) \[ [Homepage](http://jhmdb.is.tue.mpg.de/) \] + +- Skeleton-based Action Recognition + + - [PoseC3D Skeleton Dataset](/tools/data/skeleton/README.md) \[ [Homepage](https://kennymckormick.github.io/posec3d/) \] + +The supported datasets are listed above. +We provide shell scripts for data preparation under the path `$MMACTION2/tools/data/`. +Below is the detailed tutorials of data deployment for each dataset. diff --git a/docs/en/user_guides/useful_tools.md b/docs/en/useful_tools.md similarity index 98% rename from docs/en/user_guides/useful_tools.md rename to docs/en/useful_tools.md index 2fe3b1977a..943303b82c 100644 --- a/docs/en/user_guides/useful_tools.md +++ b/docs/en/useful_tools.md @@ -1,4 +1,4 @@ -# Other Useful Tools +# Useful Tools Apart from training/testing scripts, We provide lots of useful tools under the `tools/` directory. @@ -6,7 +6,7 @@ Apart from training/testing scripts, We provide lots of useful tools under the ` -- [Other Useful Tools](#other-useful-tools) +- [Useful Tools](#useful-tools) - [Useful Tools Link](#useful-tools-link) - [Model Conversion](#model-conversion) - [Prepare a model for publishing](#prepare-a-model-for-publishing) diff --git a/docs/en/user_guides/2_data_prepare.md b/docs/en/user_guides/2_data_prepare.md deleted file mode 100644 index e3bcc9f0e0..0000000000 --- a/docs/en/user_guides/2_data_prepare.md +++ /dev/null @@ -1,152 +0,0 @@ -# Tutorial 2: Prepare Datasets - -We provide some tips for MMAction2 data preparation in this file. 
- - - -- [Notes on Video Data Format](#notes-on-video-data-format) -- [Getting Data](#getting-data) - - [Prepare videos](#prepare-videos) - - [Extract frames](#extract-frames) - - [Alternative to denseflow](#alternative-to-denseflow) - - [Generate file list](#generate-file-list) - - [Prepare audio](#prepare-audio) - - - -## Notes on Video Data Format - -MMAction2 supports two types of data format: raw frames and video. The former is widely used in previous projects such as [TSN](https://github.com/yjxiong/temporal-segment-networks). -This is fast when SSD is available but fails to scale to the fast-growing datasets. -(For example, the newest edition of [Kinetics](https://www.deepmind.com/open-source/kinetics) has 650K videos and the total frames will take up several TBs.) -The latter saves much space but has to do the computation intensive video decoding at execution time. -To make video decoding faster, we support several efficient video loading libraries, such as [decord](https://github.com/zhreshold/decord), [PyAV](https://github.com/PyAV-Org/PyAV), etc. - -## Getting Data - -The following guide is helpful when you want to experiment with custom dataset. -Similar to the datasets stated above, it is recommended organizing in `$MMACTION2/data/$DATASET`. - -### Prepare videos - -Please refer to the official website and/or the official script to prepare the videos. -Note that the videos should be arranged in either - -- A two-level directory organized by `${CLASS_NAME}/${VIDEO_ID}`, which is recommended to be used for action recognition datasets (such as UCF101 and Kinetics) - -- A single-level directory, which is recommended to be used for action detection datasets or those with multiple annotations per video (such as THUMOS14). - -### Extract frames - -To extract both frames and optical flow, you can use the tool [denseflow](https://github.com/open-mmlab/denseflow) we wrote. -Since different frame extraction tools produce different number of frames, -it is beneficial to use the same tool to do both frame extraction and the flow computation, to avoid mismatching of frame counts. - -```shell -python build_rawframes.py ${SRC_FOLDER} ${OUT_FOLDER} [--task ${TASK}] [--level ${LEVEL}] \ - [--num-worker ${NUM_WORKER}] [--flow-type ${FLOW_TYPE}] [--out-format ${OUT_FORMAT}] \ - [--ext ${EXT}] [--new-width ${NEW_WIDTH}] [--new-height ${NEW_HEIGHT}] [--new-short ${NEW_SHORT}] \ - [--resume] [--use-opencv] [--mixed-ext] -``` - -- `SRC_FOLDER`: Folder of the original video. -- `OUT_FOLDER`: Root folder where the extracted frames and optical flow store. -- `TASK`: Extraction task indicating which kind of frames to extract. Allowed choices are `rgb`, `flow`, `both`. -- `LEVEL`: Directory level. 1 for the single-level directory or 2 for the two-level directory. -- `NUM_WORKER`: Number of workers to build rawframes. -- `FLOW_TYPE`: Flow type to extract, e.g., `None`, `tvl1`, `warp_tvl1`, `farn`, `brox`. -- `OUT_FORMAT`: Output format for extracted frames, e.g., `jpg`, `h5`, `png`. -- `EXT`: Video file extension, e.g., `avi`, `mp4`. -- `NEW_WIDTH`: Resized image width of output. -- `NEW_HEIGHT`: Resized image height of output. -- `NEW_SHORT`: Resized image short side length keeping ratio. -- `--resume`: Whether to resume optical flow extraction instead of overwriting. -- `--use-opencv`: Whether to use OpenCV to extract rgb frames. -- `--mixed-ext`: Indicate whether process video files with mixed extensions. - -The recommended practice is - -1. set `$OUT_FOLDER` to be a folder located in SSD. -2. 
symlink the link `$OUT_FOLDER` to `$MMACTION2/data/$DATASET/rawframes`. -3. set `new-short` instead of using `new-width` and `new-height`. - -```shell -ln -s ${YOUR_FOLDER} $MMACTION2/data/$DATASET/rawframes -``` - -#### Alternative to denseflow - -In case your device doesn't fulfill the installation requirement of [denseflow](https://github.com/open-mmlab/denseflow)(like Nvidia driver version), or you just want to see some quick demos about flow extraction, we provide a python script `tools/misc/flow_extraction.py` as an alternative to denseflow. You can use it for rgb frames and optical flow extraction from one or several videos. Note that the speed of the script is much slower than denseflow, since it runs optical flow algorithms on CPU. - -```shell -python tools/misc/flow_extraction.py --input ${INPUT} [--prefix ${PREFIX}] [--dest ${DEST}] [--rgb-tmpl ${RGB_TMPL}] \ - [--flow-tmpl ${FLOW_TMPL}] [--start-idx ${START_IDX}] [--method ${METHOD}] [--bound ${BOUND}] [--save-rgb] -``` - -- `INPUT`: Videos for frame extraction, can be single video or a video list, the video list should be a txt file and just consists of filenames without directories. -- `PREFIX`: The prefix of input videos, used when input is a video list. -- `DEST`: The destination to save extracted frames. -- `RGB_TMPL`: The template filename of rgb frames. -- `FLOW_TMPL`: The template filename of flow frames. -- `START_IDX`: The start index of extracted frames. -- `METHOD`: The method used to generate flow. -- `BOUND`: The maximum of optical flow. -- `SAVE_RGB`: Also save extracted rgb frames. - -### Generate file list - -We provide a convenient script to generate annotation file list. You can use the following command to generate file lists given extracted frames / downloaded videos. - -```shell -cd $MMACTION2 -python tools/data/build_file_list.py ${DATASET} ${SRC_FOLDER} [--rgb-prefix ${RGB_PREFIX}] \ - [--flow-x-prefix ${FLOW_X_PREFIX}] [--flow-y-prefix ${FLOW_Y_PREFIX}] [--num-split ${NUM_SPLIT}] \ - [--subset ${SUBSET}] [--level ${LEVEL}] [--format ${FORMAT}] [--out-root-path ${OUT_ROOT_PATH}] \ - [--seed ${SEED}] [--shuffle] -``` - -- `DATASET`: Dataset to be prepared, e.g., `ucf101`, `kinetics400`, `thumos14`, `sthv1`, `sthv2`, etc. -- `SRC_FOLDER`: Folder of the corresponding data format: - - "$MMACTION2/data/$DATASET/rawframes" if `--format rawframes`. - - "$MMACTION2/data/$DATASET/videos" if `--format videos`. -- `RGB_PREFIX`: Name prefix of rgb frames. -- `FLOW_X_PREFIX`: Name prefix of x flow frames. -- `FLOW_Y_PREFIX`: Name prefix of y flow frames. -- `NUM_SPLIT`: Number of split to file list. -- `SUBSET`: Subset to generate file list. Allowed choice are `train`, `val`, `test`. -- `LEVEL`: Directory level. 1 for the single-level directory or 2 for the two-level directory. -- `FORMAT`: Source data format to generate file list. Allowed choices are `rawframes`, `videos`. -- `OUT_ROOT_PATH`: Root path for output -- `SEED`: Random seed. -- `--shuffle`: Whether to shuffle the file list. - -### Prepare audio - -We also provide a simple script for audio waveform extraction and mel-spectrogram generation. - -```shell -cd $MMACTION2 -python tools/data/extract_audio.py ${ROOT} ${DST_ROOT} [--ext ${EXT}] [--num-workers ${N_WORKERS}] \ - [--level ${LEVEL}] -``` - -- `ROOT`: The root directory of the videos. -- `DST_ROOT`: The destination root directory of the audios. -- `EXT`: Extension of the video files. e.g., `mp4`. -- `N_WORKERS`: Number of processes to be used. 
- -After extracting audios, you are free to decode and generate the spectrogram on-the-fly such as [this](/configs/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio.py). As for the annotations, you can directly use those of the rawframes as long as you keep the relative position of audio files same as the rawframes directory. However, extracting spectrogram on-the-fly is slow and bad for prototype iteration. Therefore, we also provide a script (and many useful tools to play with) for you to generation spectrogram off-line. - -```shell -cd $MMACTION2 -python tools/data/build_audio_features.py ${AUDIO_HOME_PATH} ${SPECTROGRAM_SAVE_PATH} [--level ${LEVEL}] \ - [--ext $EXT] [--num-workers $N_WORKERS] [--part $PART] -``` - -- `AUDIO_HOME_PATH`: The root directory of the audio files. -- `SPECTROGRAM_SAVE_PATH`: The destination root directory of the audio features. -- `EXT`: Extension of the audio files. e.g., `m4a`. -- `N_WORKERS`: Number of processes to be used. -- `PART`: Determines how many parts to be splited and which part to run. e.g., `2/5` means splitting all files into 5-fold and executing the 2nd part. This is useful if you have several machines. - -The annotations for audio spectrogram features are identical to those of rawframes. You can simply make a copy of `dataset_[train/val]_list_rawframes.txt` and rename it as `dataset_[train/val]_list_audio_feature.txt` diff --git a/docs/en/user_guides/3_inference.md b/docs/en/user_guides/Inference.md similarity index 95% rename from docs/en/user_guides/3_inference.md rename to docs/en/user_guides/Inference.md index 11b07f0519..20e14b4ee0 100644 --- a/docs/en/user_guides/3_inference.md +++ b/docs/en/user_guides/Inference.md @@ -1,9 +1,9 @@ -# Tutorial 3: Inference with existing models +# Inference with existing models MMAction2 provides pre-trained models for video understanding in [Model Zoo](../modelzoo.md). This note will show **how to use existing models to inference on given video**. -As for how to test existing models on standard datasets, please see this [guide](./4_train_test.md#test) +As for how to test existing models on standard datasets, please see this [guide](./train_test.md#test) ## Inference on a given video diff --git a/docs/en/user_guides/1_config.md b/docs/en/user_guides/config.md similarity index 98% rename from docs/en/user_guides/1_config.md rename to docs/en/user_guides/config.md index 308ec70f17..d847ae9557 100644 --- a/docs/en/user_guides/1_config.md +++ b/docs/en/user_guides/config.md @@ -1,4 +1,4 @@ -# Tutorial 1: Learn about Configs +# Learn about Configs We use python files as configs, incorporate modular and inheritance design into our config system, which is convenient to conduct various experiments. You can find all the provided configs under `$MMAction2/configs`. 
If you wish to inspect the config file, @@ -6,12 +6,13 @@ you may run `python tools/analysis_tools/print_config.py /PATH/TO/CONFIG` to see -- [Modify config through script arguments](#modify-config-through-script-arguments) -- [Config File Structure](#config-file-structure) -- [Config File Naming Convention](#config-file-naming-convention) - - [Config System for Action Recognition](#config-system-for-action-recognition) - - [Config System for Spatio-Temporal Action Detection](#config-system-for-spatio-temporal-action-detection) - - [Config System for Action localization](#config-system-for-action-localization) +- [Learn about Configs](#learn-about-configs) + - [Modify config through script arguments](#modify-config-through-script-arguments) + - [Config File Structure](#config-file-structure) + - [Config File Naming Convention](#config-file-naming-convention) + - [Config System for Action Recognition](#config-system-for-action-recognition) + - [Config System for Spatio-Temporal Action Detection](#config-system-for-spatio-temporal-action-detection) + - [Config System for Action localization](#config-system-for-action-localization) diff --git a/docs/en/user_guides/prepare_dataset.md b/docs/en/user_guides/prepare_dataset.md new file mode 100644 index 0000000000..cd4225aaa0 --- /dev/null +++ b/docs/en/user_guides/prepare_dataset.md @@ -0,0 +1,263 @@ +# Prepare Dataset + +MMAction2 supports many existing datasets. In this chapter, we will lead you to prepare datasets for MMAction2. + +- [Prepare Dataset](#prepare-dataset) + - [Notes on Video Data Format](#notes-on-video-data-format) + - [Use built-in datasets](#use-built-in-datasets) + - [Use a custom dataset](#use-a-custom-dataset) + - [Action Recognition](#action-recognition) + - [Skeleton-based Action Recognition](#skeleton-based-action-recognition) + - [Spatio-temporal Action Detection](#spatio-temporal-action-detection) + - [Temporal Action Localization](#temporal-action-localization) + - [Use mixed datasets for training](#use-mixed-datasets-for-training) + - [Repeat dataset](#repeat-dataset) + - [Browse dataset](#browse-dataset) + +## Notes on Video Data Format + +MMAction2 supports two types of data formats: raw frames and video. The former is widely used in previous projects such as [TSN](https://github.com/yjxiong/temporal-segment-networks). +This is fast when SSD is available but fails to scale to the fast-growing datasets. +(For example, the newest edition of [Kinetics](https://www.deepmind.com/open-source/kinetics) has 650K videos and the total frames will take up several TBs.) +The latter saves much space but has to do the computation intensive video decoding at execution time. +To make video decoding faster, we support several efficient video loading libraries, such as [decord](https://github.com/zhreshold/decord), [PyAV](https://github.com/PyAV-Org/PyAV), etc. + +## Use built-in datasets + +MMAction2 already supports many datasets, we provide shell scripts for data preparation under the path `$MMACTION2/tools/data/`, please refer to [supported datasets](../supported_datasets.md) for details to prepare specific datasets. 
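+
+For example, preparing UCF101 usually boils down to running the scripts shipped under `tools/data/ucf101/`. A minimal sketch, assuming the script names keep their current form; check the dataset's README for the authoritative steps:
+
+```shell
+# Download annotations and videos, then generate the file lists
+# (sketch; see tools/data/ucf101/README.md for details).
+bash tools/data/ucf101/download_annotations.sh
+bash tools/data/ucf101/download_videos.sh
+bash tools/data/ucf101/generate_videos_filelist.sh
+```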
+ +## Use a custom dataset + +The simplest way is to convert your dataset to existing dataset formats: + +- `RawFrameDataset` and `VideoDataset` for [Action Recognition](#action-recognition) +- `PoseDataset` for [Skeleton-based Action Recognition](#skeleton-based-action-recognition) +- `AVADataset` for [Spatio-temporal Action Detection](#spatio-temporal-action-detection) +- `ActivityNetDataset` for [Temporal Action Localization](#temporal-action-localization) + +After the data pre-processing, the users need to further modify the config files to use the dataset. +Here is an example of using a custom dataset in rawframe format. + +In `configs/task/method/my_custom_config.py`: + +```python +... +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'path/to/your/root' +data_root_val = 'path/to/your/root_val' +ann_file_train = 'data/custom/custom_train_list.txt' +ann_file_val = 'data/custom/custom_val_list.txt' +ann_file_test = 'data/custom/custom_val_list.txt' +... +data = dict( + videos_per_gpu=32, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + ...), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + ...), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + ...)) +... +``` + +### Action Recognition + +There are two kinds of annotation files for action recognition. + +- rawframe annotaiton for `RawFrameDataset` + + The annotation of a rawframe dataset is a text file with multiple lines, + and each line indicates `frame_directory` (relative path) of a video, + `total_frames` of a video and the `label` of a video, which are split by a whitespace. + + Here is an example. + + ``` + some/directory-1 163 1 + some/directory-2 122 1 + some/directory-3 258 2 + some/directory-4 234 2 + some/directory-5 295 3 + some/directory-6 121 3 + ``` + +- video annotation for `VideoDataset` + + The annotation of a video dataset is a text file with multiple lines, + and each line indicates a sample video with the `filepath` (relative path) and `label`, + which are split by a whitespace. + + Here is an example. + + ``` + some/path/000.mp4 1 + some/path/001.mp4 1 + some/path/002.mp4 2 + some/path/003.mp4 2 + some/path/004.mp4 3 + some/path/005.mp4 3 + ``` + +### Skeleton-based Action Recognition + +The task recognizes the action class based on the skeleton sequence (time sequence of keypoints). We provide some methods to build your custom skeleton dataset. + +- Build from RGB video data + + You need to extract keypoints data from video and convert it to a supported format, we provide a [tutorial](/configs/skeleton/posec3d/custom_dataset_training.md) with detailed instructions. + +- Build from existing keypoint data + + Assuming that you already have keypoint data in coco formats, you can gather them into a pickle file. + + Each pickle file corresponds to an action recognition dataset. The content of a pickle file is a dictionary with two fields: `split` and `annotations` + + 1. Split: The value of the `split` field is a dictionary: the keys are the split names, while the values are lists of video identifiers that belong to the specific clip. + 2. Annotations: The value of the `annotations` field is a list of skeleton annotations, each skeleton annotation is a dictionary, containing the following fields: + - `frame_dir` (str): The identifier of the corresponding video. + - `total_frames` (int): The number of frames in this video. + - `img_shape` (tuple\[int\]): The shape of a video frame, a tuple with two elements, in the format of `(height, width)`. 
Only required for 2D skeletons. + - `original_shape` (tuple\[int\]): Same as `img_shape`. + - `label` (int): The action label. + - `keypoint` (np.ndarray, with shape `[M x T x V x C]`): The keypoint annotation. + - M: number of persons; + - T: number of frames (same as `total_frames`); + - V: number of keypoints (25 for NTURGB+D 3D skeleton, 17 for CoCo, 18 for OpenPose, etc. ); + - C: number of dimensions for keypoint coordinates (C=2 for 2D keypoint, C=3 for 3D keypoint). + - `keypoint_score` (np.ndarray, with shape `[M x T x V]`): The confidence score of keypoints. Only required for 2D skeletons. + + Here is an example: + + ``` + { + "split": + { + 'xsub_train': + ['S001C001P001R001A001', ...], + 'xsub_val': + ['S001C001P003R001A001', ...], + ... + } + + "annotations: + [ + { + { + 'frame_dir': 'S001C001P001R001A001', + 'label': 0, + 'img_shape': (1080, 1920), + 'original_shape': (1080, 1920), + 'total_frames': 103, + 'keypoint': array([[[[1032. , 334.8], ...]]]) + 'keypoint_score': array([[[0.934 , 0.9766, ...]]]) + }, + { + 'frame_dir': 'S001C001P003R001A001', + ... + }, + ... + + } + ] + } + ``` + + Support other keypoint formats needs further modification, please refer to [customize dataset](../advanced_guides/customize_dataset.md). + +### Spatio-temporal Action Detection + +MMAction2 supports the task based on `AVADataset`. The annotation contains groundtruth bbox and proposal bbox. + +- groundtruth bbox + groundtruth bbox is a csv file with multiple lines, and each line is a detection sample of one frame, with following formats: + + video_identifier, time_stamp, lt_x, lt_y, rb_x, rb_y, label, entity_id + each field means: + `video_identifier` : The identifier of the corresponding video + `time_stamp`: The time stamp of current frame + `lt_x`: The normalized x-coordinate of the left top point of bounding box + `lt_y`: The normalized y-coordinate of the left top point of bounding box + `rb_y`: The normalized x-coordinate of the right bottom point of bounding box + `rb_y`: The normalized y-coordinate of the right bottom point of bounding box + `label`: The action label + `entity_id`: a unique integer allowing this box to be linked to other boxes depicting the same person in adjacent frames of this video + + Here is an example. + + ``` + _-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,12,0 + _-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,74,0 + ... + ``` + +- proposal bbox + proposal bbox is a pickle file generated by a person detector, and usually needs to be fine-tuned on the target dataset. The pickle file contains a dict with below data structure: + + `{'video_identifier,time_stamp': bbox_info}` + + video_identifier (str): The identifier of the corresponding video + time_stamp (int): The time stamp of current frame + bbox_info (np.ndarray, with shape `[n, 5]`): Detected bbox, \ \ \ \ \. x1, x2, y1, y2 are normalized with respect to frame size, which are between 0.0-1.0. + +### Temporal Action Localization + +We support Temporal Action Localization based on `ActivityNetDataset`. The annotation of ActivityNet dataset is a json file. Each key is a video name and the corresponding value is the meta data and annotation for the video. + +Here is an example. + +``` +{ + "video1": { + "duration_second": 211.53, + "duration_frame": 6337, + "annotations": [ + { + "segment": [ + 30.025882995319815, + 205.2318595943838 + ], + "label": "Rock climbing" + } + ], + "feature_frame": 6336, + "fps": 30.0, + "rfps": 29.9579255898 + }, + "video2": {... + } + ... 
+} +``` + +## Use mixed datasets for training + +MMAction2 also supports to mix dataset for training. Currently it supports to repeat dataset. + +### Repeat dataset + +We use `RepeatDataset` as wrapper to repeat the dataset. For example, suppose the original dataset as `Dataset_A`, +to repeat it, the config looks like the following + +```python +dataset_A_train = dict( + type='RepeatDataset', + times=N, + dataset=dict( # This is the original config of Dataset_A + type='Dataset_A', + ... + pipeline=train_pipeline + ) + ) +``` + +## Browse dataset + +coming soon... diff --git a/docs/en/user_guides/4_train_test.md b/docs/en/user_guides/train_test.md similarity index 99% rename from docs/en/user_guides/4_train_test.md rename to docs/en/user_guides/train_test.md index a67448fde3..653fccdc34 100644 --- a/docs/en/user_guides/4_train_test.md +++ b/docs/en/user_guides/train_test.md @@ -1,4 +1,4 @@ -# Tutorial 4: Training and Test +# Training and Test ## Training diff --git a/docs/en/user_guides/visualization.md b/docs/en/user_guides/visualization.md deleted file mode 100644 index 2d4518bcdb..0000000000 --- a/docs/en/user_guides/visualization.md +++ /dev/null @@ -1,20 +0,0 @@ -# Visualization Tools - -## Visualize dataset - -You can use `tools/analysis_tools/browse_dataset.py` to visualize video datasets: - -```bash -python tools/analysis_tools/browse_dataset.py ${CONFIG_FILE} [ARGS] -``` - -| ARGS | Description | -| ------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `CONFIG_FILE` | The path to the config file. | -| `--output-dir OUTPUT_DIR` | If there is no display interface, you can save the visualization results to `OUTPUT_DIR`. Defaults to None | -| `--show-frames` | Display the frames of the video if you have the display interface. Defaults to False. | -| `--phase PHASE` | Phase of the dataset to visualize, accept `train`, `test` and `val`. Defaults to `train`. | -| `--show-number SHOW_NUMBER` | Number of images selected to visualize, must bigger than 0. Jf the number is bigger than length of dataset, show all the images in dataset. Defaults to "sys.maxsize", show all images in dataset | -| `--show-interval SHOW_INTERVAL` | The interval of show (s). Defaults to 2. | -| `--mode MODE` | Display mode: display original videos or transformed videos. `original` means show videos load from disk while `transformed` means to show videos after transformed. Defaults to `transformed`. | -| `--cfg-options CFG_OPTIONS` | Override some settings in the used config, the key-value pair in xxx=yyy format will be merged into the config file. If the value to be overwritten is a list, it should be of the form of either `key="[a,b]"` or `key=a,b`. The argument also allows nested list/tuple values, e.g. `key="[(a,b),(c,d)]"`. Note that the quotation marks are necessary and that no white space is allowed. | diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index 59e3e49b53..2b69d6d2af 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -16,7 +16,7 @@ You can switch between Chinese and English documents in the lower-left corner of user_guides/1_config.md user_guides/2_data_prepare.md user_guides/3_inference.md - user_guides/4_train_test.md + user_guides/train_test.md .. 
toctree:: :maxdepth: 1 diff --git a/docs/zh_cn/user_guides/3_inference.md b/docs/zh_cn/user_guides/3_inference.md index 99433263df..14374ef432 100644 --- a/docs/zh_cn/user_guides/3_inference.md +++ b/docs/zh_cn/user_guides/3_inference.md @@ -3,7 +3,7 @@ MMAction2 在 [Model Zoo](../modelzoo.md) 中提供预训练的视频理解模型。 本教程将展示**如何使用现有模型对给定视频进行推理**。 -至于如何在标准数据集上测试现有模型,请参阅这该[指南](./4_train_test.md#test) +至于如何在标准数据集上测试现有模型,请参阅这该[指南](./train_test.md#test) ## 给定视频的推理 diff --git a/src/pytorch-sphinx-theme b/src/pytorch-sphinx-theme new file mode 160000 index 0000000000..6f42dcf38c --- /dev/null +++ b/src/pytorch-sphinx-theme @@ -0,0 +1 @@ +Subproject commit 6f42dcf38c529653bdf3347f551cb037a1a0f1cf diff --git a/tools/visualizations/browse_dataset.py b/tools/visualizations/browse_dataset.py index e6cf9b82c4..6fb720521e 100644 --- a/tools/visualizations/browse_dataset.py +++ b/tools/visualizations/browse_dataset.py @@ -21,13 +21,9 @@ def parse_args(): parser = argparse.ArgumentParser(description='Browse a dataset') parser.add_argument('config', help='train config file path') - parser.add_argument('--label', default=None, type=str, help='label file') parser.add_argument( - '--output-dir', - '-o', - default=None, - type=str, - help='If there is no display interface, you can save it.') + 'output_dir', default=None, type=str, help='output directory') + parser.add_argument('--label', default=None, type=str, help='label file') parser.add_argument( '--phase', '-p', diff --git a/tools/visualizations/vis_scheduler.py b/tools/visualizations/vis_scheduler.py index 6e1b744862..17daa34e6b 100644 --- a/tools/visualizations/vis_scheduler.py +++ b/tools/visualizations/vis_scheduler.py @@ -16,58 +16,7 @@ from mmengine.runner import Runner from mmengine.visualization import Visualizer from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn - - -class SimpleModel(BaseModel): - """simple model that do nothing in train_step.""" - - def __init__(self): - super(SimpleModel, self).__init__() - self.data_preprocessor = nn.Identity() - self.conv = nn.Conv2d(1, 1, 1) - - def forward(self, inputs, data_samples, mode='tensor'): - pass - - def train_step(self, data, optim_wrapper): - pass - - -class ParamRecordHook(Hook): - - def __init__(self, by_epoch): - super().__init__() - self.by_epoch = by_epoch - self.lr_list = [] - self.momentum_list = [] - self.task_id = 0 - self.progress = Progress(BarColumn(), MofNCompleteColumn(), - TextColumn('{task.description}')) - - def before_train(self, runner): - if self.by_epoch: - total = runner.train_loop.max_epochs - self.task_id = self.progress.add_task( - 'epochs', start=True, total=total) - else: - total = runner.train_loop.max_iters - self.task_id = self.progress.add_task( - 'iters', start=True, total=total) - self.progress.start() - - def after_train_epoch(self, runner): - if self.by_epoch: - self.progress.update(self.task_id, advance=1) - - def after_train_iter(self, runner, batch_idx, data_batch, outputs): - if not self.by_epoch: - self.progress.update(self.task_id, advance=1) - self.lr_list.append(runner.optim_wrapper.get_lr()['lr'][0]) - self.momentum_list.append( - runner.optim_wrapper.get_momentum()['momentum'][0]) - - def after_train(self, runner): - self.progress.stop() +from torch.utils.data import DataLoader def parse_args(): @@ -130,6 +79,58 @@ def parse_args(): return args +class SimpleModel(BaseModel): + """simple model that do nothing in train_step.""" + + def __init__(self): + super(SimpleModel, self).__init__() + self.data_preprocessor = nn.Identity() + 
self.conv = nn.Conv2d(1, 1, 1) + + def forward(self, inputs, data_samples, mode='tensor'): + pass + + def train_step(self, data, optim_wrapper): + pass + + +class ParamRecordHook(Hook): + + def __init__(self, by_epoch): + super().__init__() + self.by_epoch = by_epoch + self.lr_list = [] + self.momentum_list = [] + self.task_id = 0 + self.progress = Progress(BarColumn(), MofNCompleteColumn(), + TextColumn('{task.description}')) + + def before_train(self, runner): + if self.by_epoch: + total = runner.train_loop.max_epochs + self.task_id = self.progress.add_task( + 'epochs', start=True, total=total) + else: + total = runner.train_loop.max_iters + self.task_id = self.progress.add_task( + 'iters', start=True, total=total) + self.progress.start() + + def after_train_epoch(self, runner): + if self.by_epoch: + self.progress.update(self.task_id, advance=1) + + def after_train_iter(self, runner, batch_idx, data_batch, outputs): + if not self.by_epoch: + self.progress.update(self.task_id, advance=1) + self.lr_list.append(runner.optim_wrapper.get_lr()['lr'][0]) + self.momentum_list.append( + runner.optim_wrapper.get_momentum()['momentum'][0]) + + def after_train(self, runner): + self.progress.stop() + + def plot_curve(lr_list, args, param_name, iters_per_epoch, by_epoch=True): """Plot learning rate vs iter graph.""" try: @@ -186,6 +187,7 @@ def simulate_train(data_loader, cfg, by_epoch): param_scheduler=cfg.param_scheduler, default_scope=cfg.default_scope, default_hooks=default_hooks, + auto_scale_lr=cfg.get('auto_scale_lr'), visualizer=MagicMock(spec=Visualizer), custom_hooks=cfg.get('custom_hooks', None)) @@ -231,14 +233,13 @@ def main(): from mmaction.registry import DATASETS dataset_size = len(DATASETS.build(cfg.train_dataloader.dataset)) print(f'dataset is {dataset_size}') - # dataset_size = len(build_dataset(cfg.train_dataloader.dataset)) else: dataset_size = args.dataset_size or batch_size - class FakeDataloader(list): - dataset = MagicMock(metainfo=None) - - data_loader = FakeDataloader(range(dataset_size // batch_size)) + data_loader = DataLoader(range(dataset_size), batch_size) + assert len(data_loader) > 0, \ + 'Please decrease batchsize to make sure that ' \ + 'a epoch at least have one iteration!' 
dataset_info = ( f'\nDataset infos:' f'\n - Dataset size: {dataset_size}' From 1d261c93b94bd49c00643b57bead5431853a0074 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 6 Apr 2023 18:41:29 +0800 Subject: [PATCH 33/36] [Feat] support training uniformer (#2221) --- configs/recognition/uniformerv2/README.md | 66 +- .../k710_channel_map/label_map_k710.txt | 710 ++++++++++++++++++ .../k710_channel_map/map_k400.json | 1 + .../k710_channel_map/map_k600.json | 1 + .../k710_channel_map/map_k700.json | 1 + configs/recognition/uniformerv2/metafile.yml | 97 ++- ...etics710-kinetics-k400-pre_u8_mitv1-rgb.py | 110 ++- ...clip-kinetics710-pre_u8_kinetics400-rgb.py | 122 ++- ...clip-kinetics710-pre_u8_kinetics600-rgb.py | 122 ++- ...clip-kinetics710-pre_u8_kinetics700-rgb.py | 122 ++- ...base-p16-res224_clip_u8_kinetics400-rgb.py | 163 ++++ ...base-p16-res224_clip_u8_kinetics700-rgb.py | 163 ++++ mmaction/models/backbones/uniformer.py | 8 +- mmaction/models/backbones/uniformerv2.py | 35 +- mmaction/models/heads/__init__.py | 3 +- mmaction/models/heads/uniformer_head.py | 98 +++ tests/models/backbones/test_uniformerv2.py | 2 + tests/models/utils/test_gradcam.py | 4 +- tests/utils/test_misc.py | 4 + tests/visualization/test_action_visualizer.py | 4 + tests/visualization/test_video_backend.py | 4 + 21 files changed, 1727 insertions(+), 113 deletions(-) create mode 100644 configs/recognition/uniformerv2/k710_channel_map/label_map_k710.txt create mode 100644 configs/recognition/uniformerv2/k710_channel_map/map_k400.json create mode 100644 configs/recognition/uniformerv2/k710_channel_map/map_k600.json create mode 100644 configs/recognition/uniformerv2/k710_channel_map/map_k700.json create mode 100644 configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics400-rgb.py create mode 100644 configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics700-rgb.py create mode 100644 mmaction/models/heads/uniformer_head.py diff --git a/configs/recognition/uniformerv2/README.md b/configs/recognition/uniformerv2/README.md index 73855f13f0..d6e57c7bf9 100644 --- a/configs/recognition/uniformerv2/README.md +++ b/configs/recognition/uniformerv2/README.md @@ -20,51 +20,53 @@ Learning discriminative spatiotemporal representation is the key problem of vide ### Kinetics-400 -| uniform sampling | resolution | backbone | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | -| :--------------: | :------------: | :------------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :-----------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| 8 | short-side 320 | UniFormerV2-B/16 | 85.8 | 97.1 | 85.6 | 97.0 | 85.8 | 97.1 | 4 clips x 3 crop | 0.1T | 115M | 
[config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-203d6aac.pth) | -| 8 | short-side 320 | UniFormerV2-L/14 | 88.7 | 98.1 | 88.8 | 98.1 | 88.7 | 98.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-972ea063.pth) | -| 16 | short-side 320 | UniFormerV2-L/14 | 89.0 | 98.2 | 89.1 | 98.2 | 89.0 | 98.2 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb_20221219-6dc86d05.pth) | -| 32 | short-side 320 | UniFormerV2-L/14 | 89.3 | 98.2 | 89.3 | 98.2 | 89.4 | 98.2 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-56a46f64.pth) | -| 32 | short-side 320 | UniFormerV2-L/14@336 | 89.5 | 98.4 | 89.7 | 98.3 | 89.5 | 98.4 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-1dd7650f.pth) | +| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :--------------: | :------------: | :--------------------: | :--------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | short-side 320 | UniFormerV2-B/16 | clip | 84.3 | 96.4 | 84.4 | 96.3 | - | - | 4 clips x 3 crop | 0.1T | 115M | 
[config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb_20230313-e29fc968.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.log) | +| 8 | short-side 320 | UniFormerV2-B/16 | clip-kinetics710 | 85.8 | 97.1 | 85.6 | 97.0 | - | - | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb_20230313-75be0806.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.log) | +| 8 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 88.7 | 98.1 | 88.8 | 98.1 | 88.7 | 98.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-972ea063.pth) | - | +| 16 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 89.0 | 98.2 | 89.1 | 98.2 | 89.0 | 98.2 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb_20221219-6dc86d05.pth) | - | +| 32 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 89.3 | 98.2 | 89.3 | 98.2 | 89.4 | 98.2 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-56a46f64.pth) | - | +| 32 | short-side 320 | UniFormerV2-L/14@336\* | clip-kinetics710 | 89.5 | 98.4 | 89.7 | 98.3 | 89.5 | 98.4 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-1dd7650f.pth) | - | ### Kinetics-600 -| uniform sampling | resolution | backbone | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | -| :--------------: | :--------: | :------------------: | :------: | :------: | 
:---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :-----------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| 8 | Raw | UniFormerV2-B/16 | 86.4 | 97.3 | 86.1 | 97.2 | 85.5 | 97.0 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-c62c4da4.pth) | -| 8 | Raw | UniFormerV2-L/14 | 89.0 | 98.3 | 89.0 | 98.2 | 87.5 | 98.0 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-cf88e4c2.pth) | -| 16 | Raw | UniFormerV2-L/14 | 89.4 | 98.3 | 89.4 | 98.3 | 87.8 | 98.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb_20221219-38ff0e3e.pth) | -| 32 | Raw | UniFormerV2-L/14 | 89.2 | 98.3 | 89.5 | 98.3 | 87.7 | 98.1 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-d450d071.pth) | -| 32 | Raw | UniFormerV2-L/14@336 | 89.8 | 98.5 | 89.9 | 98.5 | 88.8 | 98.3 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-f984f5d2.pth) | +| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :--------------: | :--------: | :--------------------: | :--------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | 
:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710 | 86.4 | 97.3 | 86.1 | 97.2 | - | - | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb_20230313-544f06f0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.0 | 98.3 | 89.0 | 98.2 | 87.5 | 98.0 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-cf88e4c2.pth) | - | +| 16 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.4 | 98.3 | 89.4 | 98.3 | 87.8 | 98.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb_20221219-38ff0e3e.pth) | - | +| 32 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.2 | 98.3 | 89.5 | 98.3 | 87.7 | 98.1 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-d450d071.pth) | - | +| 32 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710 | 89.8 | 98.5 | 89.9 | 98.5 | 88.8 | 98.3 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-f984f5d2.pth) | - | ### Kinetics-700 -| uniform sampling | resolution | backbone | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | -| :--------------: | :--------: | :------------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | 
:---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :-----------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| 8 | Raw | UniFormerV2-B/16 | 76.3 | 92.9 | 76.3 | 92.7 | 75.1 | 92.5 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-8a7c4ac4.pth) | -| 8 | Raw | UniFormerV2-L/14 | 80.8 | 95.2 | 80.8 | 95.4 | 79.4 | 94.8 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-bfb9f401.pth) | -| 16 | Raw | UniFormerV2-L/14 | 81.2 | 95.6 | 81.2 | 95.6 | 79.2 | 95.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb_20221219-745209d2.pth) | -| 32 | Raw | UniFormerV2-L/14 | 81.4 | 95.7 | 81.5 | 95.7 | 79.8 | 95.3 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-eebe7056.pth) | -| 32 | Raw | UniFormerV2-L/14@336 | 82.1 | 96.0 | 82.1 | 96.1 | 80.6 | 95.6 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-95cf9046.pth) | +| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :--------------: | :--------: | :--------------------: | :--------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | 
:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16 | clip | 75.9 | 92.9 | 75.8 | 92.8 | - | - | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb_20230313-f02e48ad.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.log) | +| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710 | 76.3 | 92.9 | 76.3 | 92.7 | - | - | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb_20230313-69070837.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 80.8 | 95.2 | 80.8 | 95.4 | 79.4 | 94.8 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-bfb9f401.pth) | - | +| 16 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 81.2 | 95.6 | 81.2 | 95.6 | 79.2 | 95.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb_20221219-745209d2.pth) | - | +| 32 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 81.4 | 95.7 | 81.5 | 95.7 | 79.8 | 95.3 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-eebe7056.pth) | - | +| 32 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710 | 82.1 | 96.0 | 82.1 | 96.1 | 80.6 | 95.6 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py) | 
[ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-95cf9046.pth) | - | ### MiTv1 -| uniform sampling | resolution | backbone | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | testing protocol | FLOPs | params | config | ckpt | -| :--------------: | :--------: | :------------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :--------------: | :---: | :----: | :------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| 8 | Raw | UniFormerV2-B/16 | 42.7 | 71.6 | 42.6 | 71.7 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-fddbc786.pth) | -| 8 | Raw | UniFormerV2-L/14 | 47.0 | 76.1 | 47.0 | 76.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-882c0598.pth) | -| 8 | Raw | UniFormerV2-L/14@336 | 47.7 | 76.8 | 47.8 | 76.0 | 4 clips x 3 crop | 1.6T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-9020986e.pth) | +| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | testing protocol | FLOPs | params | config | ckpt | log | +| :--------------: | :--------: | :--------------------: | :--------------------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :--------------: | :---: | :----: | :------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | 
:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710-kinetics400 | 42.3 | 71.5 | 42.6 | 71.7 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb_20230313-a6f4a567.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.log) | +| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710-kinetics400 | 47.0 | 76.1 | 47.0 | 76.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-882c0598.pth) | - | +| 8 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710-kinetics400 | 47.7 | 76.8 | 47.8 | 76.0 | 4 clips x 3 crop | 1.6T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-9020986e.pth) | - | ### Kinetics-710 -| uniform sampling | resolution | backbone | config | ckpt | -| :--------------: | :--------: | :------------------: | :----------------------------------------------------------------------------: | :--------------------------------------------------------------------------: | -| 8 | Raw | UniFormerV2-B/16 | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth) | -| 8 | Raw | UniFormerV2-L/14 | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb_20221219-bfaae587.pth) | -| 8 | Raw | UniFormerV2-L/14@336 | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb_20221219-55878cdc.pth) | +| uniform sampling | resolution | backbone | pretrain | config | ckpt | +| :--------------: | :--------: | :--------------------: | :------: | :-----------------------------------------------------------------------: | :---------------------------------------------------------------------: | +| 8 | Raw | UniFormerV2-B/16\* | clip | 
[config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth) | +| 8 | Raw | UniFormerV2-L/14\* | clip | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb_20221219-bfaae587.pth) | +| 8 | Raw | UniFormerV2-L/14@336\* | clip | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb_20221219-55878cdc.pth) | -The models are ported from the repo [UniFormerV2](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) and tested on our data. Currently, we only support the testing of UniFormerV2 models, training will be available soon. +The models with * are ported from the repo [UniFormerV2](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) and tested on our data. Due to computational limitations, we only provide a reliable training config for the base model (i.e., UniFormerV2-B/16). 1. The values in columns named after "reference" are the results of the original repo. 2. The values in `top1/5 acc` are tested on the same data list as the original repo, and the label map is provided by [UniFormerV2](https://drive.google.com/drive/folders/17VB-XdF3Kfr9ORmnGyXCxTMs86n0L4QL). diff --git a/configs/recognition/uniformerv2/k710_channel_map/label_map_k710.txt b/configs/recognition/uniformerv2/k710_channel_map/label_map_k710.txt new file mode 100644 index 0000000000..150f3447b4 --- /dev/null +++ b/configs/recognition/uniformerv2/k710_channel_map/label_map_k710.txt @@ -0,0 +1,710 @@ +riding a bike 0 +marching 1 +dodgeball 2 +playing cymbals 3 +checking tires 4 +roller skating 5 +tasting beer 6 +clapping 7 +drawing 8 +juggling fire 9 +bobsledding 10 +petting animal (not cat) 11 +spray painting 12 +training dog 13 +eating watermelon 14 +building cabinet 15 +applauding 16 +playing harp 17 +inflating balloons 18 +sled dog racing 19 +wrestling 20 +pole vault 21 +hurling (sport) 22 +riding scooter 23 +shearing sheep 24 +sweeping floor 25 +eating carrots 26 +skateboarding 27 +dunking basketball 28 +disc golfing 29 +eating spaghetti 30 +playing flute 31 +riding mechanical bull 32 +making sushi 33 +trapezing 34 +picking apples 35 +stretching leg 36 +playing ukulele 37 +tying necktie 38 +skydiving 39 +playing cello 40 +jumping into pool 41 +shooting goal (soccer) 42 +trimming trees 43 +bookbinding 44 +ski jumping 45 +walking the dog 46 +riding unicycle 47 +shaving head 48 +hopscotch 49 +playing piano 50 +parasailing 51 +bartending 52 +kicking field goal 53 +finger snapping 54 +dining 55 +yawning 56 +peeling potatoes 57 +canoeing or kayaking 58 +front raises 59 +laughing 60 +dancing macarena 61 +digging 62 +reading newspaper 63 +hitting baseball 64 +clay pottery making 65 +exercising with an exercise ball 66 +playing saxophone 67 +shooting basketball 68 +washing hair 69 +lunge 70 +brushing hair 71 +curling hair 72 +kitesurfing 73 +tapping guitar 74 +bending back 75 +skipping rope 76 +situp 77 +folding paper 78 +cracking neck 79 +assembling computer 80 +cleaning gutters 81 +blowing out candles 82 +shaking hands
83 +dancing gangnam style 84 +windsurfing 85 +tap dancing 86 +skiing mono 87 +bandaging 88 +push up 89 +doing nails 90 +punching person (boxing) 91 +bouncing on trampoline 92 +scrambling eggs 93 +singing 94 +cleaning floor 95 +krumping 96 +drumming fingers 97 +snowmobiling 98 +gymnastics tumbling 99 +headbanging 100 +catching or throwing frisbee 101 +riding elephant 102 +bee keeping 103 +feeding birds 104 +snatch weight lifting 105 +mowing lawn 106 +fixing hair 107 +playing trumpet 108 +flying kite 109 +crossing river 110 +swinging legs 111 +sanding floor 112 +belly dancing 113 +sneezing 114 +clean and jerk 115 +side kick 116 +filling eyebrows 117 +shuffling cards 118 +recording music 119 +cartwheeling 120 +feeding fish 121 +folding clothes 122 +water skiing 123 +tobogganing 124 +blowing leaves 125 +smoking 126 +unboxing 127 +tai chi 128 +waxing legs 129 +riding camel 130 +slapping 131 +tossing salad 132 +capoeira 133 +playing cards 134 +playing organ 135 +playing violin 136 +playing drums 137 +tapping pen 138 +vault 139 +shoveling snow 140 +playing tennis 141 +getting a tattoo 142 +making a sandwich 143 +making tea 144 +grinding meat 145 +squat 146 +eating doughnuts 147 +ice fishing 148 +snowkiting 149 +kicking soccer ball 150 +playing controller 151 +giving or receiving award 152 +welding 153 +throwing discus 154 +throwing axe 155 +ripping paper 156 +swimming butterfly stroke 157 +air drumming 158 +blowing nose 159 +hockey stop 160 +taking a shower 161 +bench pressing 162 +planting trees 163 +pumping fist 164 +climbing tree 165 +tickling 166 +high kick 167 +waiting in line 168 +slacklining 169 +tango dancing 170 +hurdling 171 +carrying baby 172 +celebrating 173 +sharpening knives 174 +passing American football (in game) 175 +headbutting 176 +playing recorder 177 +brush painting 178 +person collecting garbage 179 +robot dancing 180 +shredding paper 181 +pumping gas 182 +rock climbing 183 +hula hooping 184 +braiding hair 185 +opening present 186 +texting 187 +decorating the christmas tree 188 +answering questions 189 +playing keyboard 190 +writing 191 +bungee jumping 192 +smelling feet 193 +eating burger 194 +playing accordion 195 +making pizza 196 +playing volleyball 197 +tasting food 198 +pushing cart 199 +spinning poi 200 +cleaning windows 201 +arm wrestling 202 +changing oil 203 +swimming breast stroke 204 +tossing coin 205 +deadlifting 206 +hoverboarding 207 +cutting watermelon 208 +cheerleading 209 +snorkeling 210 +washing hands 211 +eating cake 212 +pull ups 213 +surfing water 214 +eating hotdog 215 +holding snake 216 +playing harmonica 217 +ironing 218 +cutting nails 219 +golf chipping 220 +shot put 221 +hugging (not baby) 222 +playing clarinet 223 +faceplanting 224 +trimming or shaving beard 225 +drinking shots 226 +riding mountain bike 227 +tying bow tie 228 +swinging on something 229 +skiing crosscountry 230 +unloading truck 231 +cleaning pool 232 +jogging 233 +ice climbing 234 +mopping floor 235 +making the bed 236 +diving cliff 237 +washing dishes 238 +grooming dog 239 +weaving basket 240 +frying vegetables 241 +stomping grapes 242 +moving furniture 243 +cooking sausages (not on barbeque) 244 +doing laundry 245 +dyeing hair 246 +knitting 247 +reading book 248 +baby waking up 249 +punching bag 250 +surfing crowd 251 +cooking chicken 252 +pushing car 253 +springboard diving 254 +swing dancing 255 +massaging legs 256 +beatboxing 257 +breading or breadcrumbing 258 +somersaulting 259 +brushing teeth 260 +stretching arm 261 +juggling balls 262 +massaging person's head 263 +eating 
ice cream 264 +extinguishing fire 265 +hammer throw 266 +whistling 267 +crawling baby 268 +using remote controller (not gaming) 269 +playing cricket 270 +opening bottle (not wine) 271 +playing xylophone 272 +motorcycling 273 +driving car 274 +exercising arm 275 +passing American football (not in game) 276 +playing kickball 277 +sticking tongue out 278 +flipping pancake 279 +catching fish 280 +eating chips 281 +shaking head 282 +sword fighting 283 +playing poker 284 +cooking on campfire 285 +doing aerobics 286 +paragliding 287 +using segway 288 +folding napkins 289 +playing bagpipes 290 +gargling 291 +skiing slalom 292 +strumming guitar 293 +javelin throw 294 +waxing back 295 +riding or walking with horse 296 +plastering 297 +long jump 298 +parkour 299 +wrapping present 300 +egg hunting 301 +archery 302 +cleaning toilet 303 +swimming backstroke 304 +snowboarding 305 +catching or throwing baseball 306 +massaging back 307 +blowing glass 308 +playing guitar 309 +playing chess 310 +golf driving 311 +presenting weather forecast 312 +rock scissors paper 313 +high jump 314 +baking cookies 315 +using computer 316 +washing feet 317 +arranging flowers 318 +playing bass guitar 319 +spraying 320 +cutting pineapple 321 +waxing chest 322 +auctioning 323 +jetskiing 324 +sipping cup 325 +busking 326 +playing monopoly 327 +salsa dancing 328 +waxing eyebrows 329 +watering plants 330 +zumba 331 +chopping wood 332 +pushing wheelchair 333 +carving pumpkin 334 +building shed 335 +making jewelry 336 +catching or throwing softball 337 +bending metal 338 +ice skating 339 +dancing charleston 340 +abseiling 341 +climbing a rope 342 +crying 343 +cleaning shoes 344 +dancing ballet 345 +driving tractor 346 +triple jump 347 +throwing ball 348 +getting a haircut 349 +running on treadmill 350 +climbing ladder 351 +blasting sand 352 +playing trombone 353 +drop kicking 354 +country line dancing 355 +changing wheel (not on bike) 356 +feeding goats 357 +tying knot (not on a tie) 358 +setting table 359 +shaving legs 360 +kissing 361 +riding mule 362 +counting money 363 +laying bricks 364 +barbequing 365 +news anchoring 366 +smoking hookah 367 +cooking egg 368 +peeling apples 369 +yoga 370 +sharpening pencil 371 +dribbling basketball 372 +petting cat 373 +playing ice hockey 374 +milking cow 375 +shining shoes 376 +juggling soccer ball 377 +scuba diving 378 +playing squash or racquetball 379 +drinking beer 380 +sign language interpreting 381 +playing basketball 382 +breakdancing 383 +testifying 384 +making snowman 385 +golf putting 386 +playing didgeridoo 387 +biking through snow 388 +sailing 389 +jumpstyle dancing 390 +water sliding 391 +grooming horse 392 +massaging feet 393 +playing paintball 394 +making a cake 395 +bowling 396 +contact juggling 397 +applying cream 398 +playing badminton 399 +poaching eggs 400 +playing nose flute 401 +entering church 402 +closing door 403 +helmet diving 404 +doing sudoku 405 +coughing 406 +seasoning food 407 +peeling banana 408 +eating nachos 409 +waxing armpits 410 +shouting 411 +silent disco 412 +polishing furniture 413 +taking photo 414 +dealing cards 415 +putting wallpaper on wall 416 +uncorking champagne 417 +curling eyelashes 418 +brushing floor 419 +pulling espresso shot 420 +playing american football 421 +grooming cat 422 +playing checkers 423 +moving child 424 +stacking cups 425 +squeezing orange 426 +opening coconuts 427 +rolling eyes 428 +picking blueberries 429 +playing road hockey 430 +carving wood with a knife 431 +slicing onion 432 +saluting 433 +letting go of balloon 434 
+breaking glass 435 +carrying weight 436 +mixing colours 437 +moving baby 438 +blending fruit 439 +pouring milk 440 +surveying 441 +making slime 442 +sieving 443 +walking with crutches 444 +flipping bottle 445 +playing billiards 446 +arresting 447 +listening with headphones 448 +spinning plates 449 +carving marble 450 +cutting cake 451 +shoot dance 452 +being excited 453 +petting horse 454 +splashing water 455 +filling cake 456 +stacking dice 457 +checking watch 458 +treating wood 459 +laying decking 460 +shooting off fireworks 461 +pouring wine 462 +pretending to be a statue 463 +steering car 464 +playing rounders 465 +looking in mirror 466 +jumping sofa 467 +lighting candle 468 +walking on stilts 469 +crocheting 470 +playing piccolo 471 +vacuuming car 472 +high fiving 473 +playing shuffleboard 474 +chasing 475 +pulling rope (game) 476 +being in zero gravity 477 +sanding wood 478 +decoupage 479 +using megaphone 480 +making latte art 481 +ski ballet 482 +playing oboe 483 +bouncing ball (not juggling) 484 +playing mahjong 485 +herding cattle 486 +swimming with sharks 487 +milking goat 488 +swimming with dolphins 489 +metal detecting 490 +playing slot machine 491 +polishing metal 492 +throwing tantrum 493 +lawn mower racing 494 +laying stone 495 +cutting orange 496 +skipping stone 497 +pouring beer 498 +making bubbles 499 +jaywalking 500 +leatherworking 501 +card stacking 502 +putting on eyeliner 503 +card throwing 504 +chewing gum 505 +falling off bike 506 +repairing puncture 507 +dumpster diving 508 +tiptoeing 509 +sleeping 510 +using circular saw 511 +cracking knuckles 512 +pinching 513 +chiseling wood 514 +playing rubiks cube 515 +weaving fabric 516 +fencing (sport) 517 +sword swallowing 518 +lighting fire 519 +vacuuming floor 520 +combing hair 521 +building lego 522 +playing pinball 523 +fly tying 524 +playing lute 525 +opening door 526 +waving hand 527 +rolling pastry 528 +chiseling stone 529 +threading needle 530 +playing dominoes 531 +opening wine bottle 532 +playing with trains 533 +steer roping 534 +playing field hockey 535 +separating eggs 536 +sewing 537 +talking on cell phone 538 +needle felting 539 +pushing wheelbarrow 540 +using a paint roller 541 +playing netball 542 +lifting hat 543 +massaging neck 544 +blowing bubble gum 545 +walking through snow 546 +docking boat 547 +clam digging 548 +marriage proposal 549 +packing 550 +sausage making 551 +licking 552 +scrapbooking 553 +flint knapping 554 +lock picking 555 +putting on lipstick 556 +sawing wood 557 +playing hand clapping games 558 +geocaching 559 +looking at phone 560 +making cheese 561 +poking bellybutton 562 +contorting 563 +fixing bicycle 564 +using a microscope 565 +using a wrench 566 +doing jigsaw puzzle 567 +making horseshoes 568 +cooking scallops 569 +square dancing 570 +getting a piercing 571 +playing ocarina 572 +making paper aeroplanes 573 +playing scrabble 574 +visiting the zoo 575 +crossing eyes 576 +jumping bicycle 577 +throwing water balloon 578 +bodysurfing 579 +pirouetting 580 +luge 581 +spelunking 582 +watching tv 583 +attending conference 584 +curling (sport) 585 +directing traffic 586 +swimming front crawl 587 +ice swimming 588 +battle rope training 589 +putting on mascara 590 +bouncing on bouncy castle 591 +smoking pipe 592 +pillow fight 593 +putting on sari 594 +calligraphy 595 +roasting pig 596 +cracking back 597 +shopping 598 +burping 599 +using bagging machine 600 +staring 601 +shucking oysters 602 +blowdrying hair 603 +smashing 604 +playing laser tag 605 +wading through mud 606 +rope pushdown 607 
+preparing salad 608 +making balloon shapes 609 +tagging graffiti 610 +adjusting glasses 611 +using a power drill 612 +trimming shrubs 613 +popping balloons 614 +playing pan pipes 615 +using puppets 616 +arguing 617 +backflip (human) 618 +riding snow blower 619 +hand washing clothes 620 +calculating 621 +gospel singing in church 622 +standing on hands 623 +tasting wine 624 +shaping bread dough 625 +wading through water 626 +falling off chair 627 +throwing snowballs 628 +building sandcastle 629 +land sailing 630 +tying shoe laces 631 +jumping jacks 632 +wood burning (art) 633 +putting on foundation 634 +putting on shoes 635 +cumbia 636 +archaeological excavation 637 +mountain climber (exercise) 638 +assembling bicycle 639 +head stand 640 +cutting apple 641 +shuffling feet 642 +bottling 643 +breathing fire 644 +using inhaler 645 +historical reenactment 646 +hugging baby 647 +mushroom foraging 648 +delivering mail 649 +laying tiles 650 +using atm 651 +chopping meat 652 +tightrope walking 653 +mosh pit dancing 654 +photobombing 655 +coloring in 656 +huddling 657 +playing gong 658 +laying concrete 659 +breaking boards 660 +acting in play 661 +base jumping 662 +tie dying 663 +using a sledge hammer 664 +playing ping pong 665 +photocopying 666 +winking 667 +waking up 668 +swinging baseball bat 669 +twiddling fingers 670 +playing polo 671 +longboarding 672 +ironing hair 673 +bathing dog 674 +moon walking 675 +playing marbles 676 +embroidering 677 +playing beer pong 678 +home roasting coffee 679 +gold panning 680 +karaoke 681 +changing gear in car 682 +raising eyebrows 683 +yarn spinning 684 +scrubbing face 685 +fidgeting 686 +planing wood 687 +cosplaying 688 +capsizing 689 +tackling 690 +shining flashlight 691 +dyeing eyebrows 692 +drooling 693 +alligator wrestling 694 +playing blackjack 695 +carving ice 696 +playing maracas 697 +opening refrigerator 698 +throwing knife 699 +putting in contact lenses 700 +passing soccer ball 701 +casting fishing line 702 +sucking lolly 703 +installing carpet 704 +bulldozing 705 +roasting marshmallows 706 +playing darts 707 +chopping vegetables 708 +bull fighting 709 diff --git a/configs/recognition/uniformerv2/k710_channel_map/map_k400.json b/configs/recognition/uniformerv2/k710_channel_map/map_k400.json new file mode 100644 index 0000000000..f97fa4d49f --- /dev/null +++ b/configs/recognition/uniformerv2/k710_channel_map/map_k400.json @@ -0,0 +1 @@ +[341, 158, 189, 16, 398, 302, 202, 318, 80, 323, 249, 315, 18, 88, 365, 52, 257, 103, 113, 162, 75, 338, 388, 352, 308, 125, 159, 82, 10, 44, 92, 396, 185, 258, 383, 178, 71, 260, 15, 335, 192, 326, 58, 133, 172, 120, 334, 280, 306, 101, 337, 173, 203, 356, 4, 209, 332, 7, 65, 115, 95, 81, 232, 344, 303, 201, 342, 351, 165, 397, 252, 368, 285, 244, 363, 355, 79, 268, 110, 343, 72, 219, 321, 208, 345, 340, 84, 61, 206, 188, 62, 55, 29, 237, 2, 286, 245, 90, 8, 372, 325, 380, 226, 274, 346, 354, 97, 28, 246, 194, 212, 26, 281, 147, 215, 264, 30, 14, 301, 275, 66, 265, 224, 104, 121, 357, 117, 54, 107, 279, 109, 122, 289, 78, 59, 241, 179, 291, 349, 142, 152, 220, 311, 386, 145, 239, 392, 99, 266, 100, 176, 314, 167, 64, 160, 216, 49, 207, 222, 184, 171, 22, 234, 148, 339, 218, 294, 324, 233, 262, 9, 377, 41, 390, 53, 150, 361, 73, 247, 96, 60, 364, 298, 70, 395, 143, 236, 336, 196, 385, 33, 144, 1, 307, 393, 256, 263, 375, 235, 273, 243, 106, 366, 271, 186, 287, 51, 299, 175, 276, 369, 57, 11, 373, 35, 163, 297, 195, 399, 290, 382, 319, 134, 40, 310, 223, 151, 270, 3, 387, 137, 31, 309, 217, 17, 374, 190, 277, 327, 135, 
394, 50, 284, 177, 67, 379, 141, 353, 108, 37, 136, 197, 272, 21, 312, 213, 164, 182, 250, 91, 89, 253, 199, 333, 248, 63, 119, 0, 130, 102, 32, 227, 362, 296, 23, 47, 156, 180, 183, 313, 5, 350, 389, 328, 112, 93, 378, 359, 83, 282, 174, 371, 48, 360, 24, 376, 68, 42, 221, 140, 181, 118, 116, 381, 94, 77, 27, 45, 87, 230, 292, 76, 39, 169, 131, 19, 126, 367, 105, 114, 193, 210, 305, 149, 98, 259, 200, 12, 320, 254, 146, 278, 242, 261, 36, 293, 251, 214, 25, 304, 204, 157, 255, 111, 229, 283, 128, 161, 170, 86, 74, 138, 6, 198, 384, 187, 155, 348, 154, 166, 124, 205, 132, 13, 34, 225, 43, 347, 228, 358, 38, 127, 231, 316, 269, 288, 139, 168, 46, 238, 317, 69, 211, 123, 391, 330, 295, 322, 329, 129, 240, 153, 267, 85, 300, 20, 191, 56, 370, 331] diff --git a/configs/recognition/uniformerv2/k710_channel_map/map_k600.json b/configs/recognition/uniformerv2/k710_channel_map/map_k600.json new file mode 100644 index 0000000000..f0d3b1b0e9 --- /dev/null +++ b/configs/recognition/uniformerv2/k710_channel_map/map_k600.json @@ -0,0 +1 @@ +[341, 661, 611, 158, 694, 189, 16, 398, 637, 302, 617, 202, 318, 639, 80, 584, 323, 618, 315, 88, 365, 52, 662, 674, 589, 257, 103, 113, 162, 75, 338, 388, 352, 603, 545, 308, 125, 159, 82, 10, 579, 44, 643, 591, 92, 396, 185, 258, 383, 660, 644, 178, 71, 260, 15, 522, 629, 335, 709, 705, 192, 599, 326, 621, 595, 58, 133, 689, 502, 504, 172, 120, 696, 334, 702, 280, 306, 101, 337, 173, 682, 203, 356, 4, 209, 505, 529, 514, 652, 708, 332, 548, 7, 65, 115, 81, 232, 344, 303, 201, 342, 351, 165, 656, 521, 397, 563, 368, 285, 244, 569, 688, 363, 355, 597, 512, 79, 268, 576, 110, 343, 636, 585, 72, 641, 219, 496, 321, 208, 345, 340, 84, 61, 206, 188, 649, 55, 586, 29, 237, 547, 2, 286, 567, 245, 90, 8, 372, 226, 274, 346, 693, 354, 97, 508, 28, 692, 246, 194, 212, 26, 281, 147, 215, 264, 30, 14, 301, 677, 66, 265, 224, 506, 627, 104, 121, 357, 517, 686, 54, 564, 107, 554, 279, 524, 109, 122, 289, 78, 59, 241, 559, 349, 571, 142, 152, 680, 220, 311, 386, 622, 145, 239, 392, 99, 266, 620, 640, 100, 176, 314, 167, 646, 64, 160, 216, 679, 49, 207, 657, 222, 647, 184, 171, 22, 234, 148, 339, 588, 18, 704, 218, 673, 294, 500, 324, 233, 262, 9, 377, 577, 41, 632, 390, 681, 53, 150, 361, 73, 247, 96, 630, 60, 494, 364, 659, 495, 650, 501, 552, 543, 519, 555, 298, 672, 560, 581, 70, 395, 143, 609, 499, 561, 568, 336, 573, 196, 385, 33, 144, 236, 1, 549, 307, 393, 256, 544, 263, 375, 675, 235, 654, 273, 638, 243, 106, 648, 539, 366, 271, 526, 186, 698, 532, 550, 287, 51, 299, 175, 276, 701, 369, 57, 179, 11, 373, 655, 666, 35, 593, 513, 580, 687, 163, 297, 195, 399, 290, 382, 319, 678, 695, 40, 310, 223, 151, 270, 3, 707, 387, 531, 137, 535, 31, 658, 309, 558, 217, 17, 374, 190, 277, 605, 525, 697, 676, 327, 542, 572, 135, 394, 615, 50, 523, 665, 284, 671, 177, 515, 67, 574, 379, 141, 353, 108, 37, 136, 197, 533, 272, 562, 21, 492, 614, 498, 608, 312, 213, 164, 182, 250, 91, 89, 253, 199, 540, 333, 700, 503, 634, 556, 590, 594, 635, 683, 248, 63, 119, 507, 0, 130, 102, 32, 362, 296, 23, 619, 47, 156, 706, 596, 180, 183, 313, 5, 528, 607, 350, 389, 328, 112, 551, 557, 93, 553, 685, 378, 536, 359, 537, 83, 282, 625, 174, 371, 48, 360, 24, 691, 376, 68, 42, 598, 221, 140, 602, 118, 642, 116, 381, 94, 325, 77, 27, 45, 230, 87, 292, 76, 497, 39, 169, 131, 19, 510, 604, 193, 126, 367, 592, 105, 114, 210, 305, 149, 98, 259, 582, 200, 12, 254, 570, 146, 623, 601, 534, 278, 242, 261, 36, 703, 251, 214, 25, 304, 204, 157, 587, 255, 669, 229, 283, 518, 690, 610, 128, 538, 170, 86, 74, 138, 
6, 198, 624, 384, 187, 530, 155, 348, 154, 699, 628, 493, 578, 166, 663, 653, 509, 124, 205, 13, 34, 225, 613, 43, 347, 670, 228, 358, 38, 631, 127, 231, 565, 541, 612, 664, 566, 651, 600, 511, 645, 616, 269, 288, 520, 575, 606, 626, 168, 668, 46, 546, 238, 317, 69, 211, 583, 123, 391, 330, 527, 295, 322, 329, 129, 240, 516, 153, 267, 85, 667, 633, 300, 20, 191, 684, 56, 370, 331] diff --git a/configs/recognition/uniformerv2/k710_channel_map/map_k700.json b/configs/recognition/uniformerv2/k710_channel_map/map_k700.json new file mode 100644 index 0000000000..784fa00f71 --- /dev/null +++ b/configs/recognition/uniformerv2/k710_channel_map/map_k700.json @@ -0,0 +1 @@ +[341, 661, 611, 158, 694, 189, 16, 398, 637, 302, 617, 202, 318, 447, 639, 80, 584, 323, 249, 618, 315, 88, 365, 52, 662, 674, 589, 257, 103, 453, 477, 113, 162, 75, 338, 388, 352, 439, 603, 545, 308, 125, 159, 82, 10, 579, 44, 643, 484, 591, 92, 396, 185, 258, 383, 660, 435, 644, 178, 419, 71, 260, 15, 522, 629, 335, 705, 192, 599, 326, 621, 595, 58, 133, 689, 502, 504, 172, 436, 120, 696, 450, 334, 431, 702, 280, 306, 101, 337, 173, 682, 203, 356, 475, 4, 458, 209, 505, 529, 514, 652, 332, 548, 7, 65, 115, 81, 232, 344, 303, 201, 342, 351, 165, 403, 656, 521, 397, 563, 252, 368, 285, 244, 569, 688, 406, 363, 355, 597, 512, 79, 268, 470, 576, 110, 343, 636, 585, 418, 72, 641, 451, 219, 496, 321, 208, 345, 340, 84, 61, 206, 415, 188, 479, 649, 62, 55, 586, 29, 237, 547, 2, 286, 567, 245, 90, 405, 8, 372, 226, 274, 346, 693, 354, 97, 508, 28, 692, 246, 194, 212, 26, 281, 147, 215, 264, 409, 30, 14, 301, 677, 402, 275, 66, 265, 224, 506, 627, 104, 121, 357, 517, 686, 456, 117, 54, 564, 107, 554, 445, 279, 524, 109, 122, 289, 78, 59, 241, 291, 559, 349, 571, 142, 152, 680, 220, 311, 386, 622, 145, 422, 239, 392, 99, 266, 620, 640, 100, 176, 404, 486, 473, 314, 167, 646, 64, 160, 216, 679, 49, 207, 657, 222, 647, 184, 171, 22, 234, 148, 339, 588, 18, 704, 218, 673, 294, 500, 324, 233, 262, 9, 377, 577, 41, 632, 467, 390, 681, 53, 150, 361, 73, 247, 96, 630, 60, 494, 364, 659, 460, 495, 650, 501, 434, 552, 543, 468, 519, 448, 555, 298, 672, 560, 466, 581, 70, 395, 143, 609, 499, 561, 568, 336, 481, 573, 196, 442, 385, 33, 144, 236, 1, 549, 307, 393, 256, 544, 263, 490, 375, 488, 437, 675, 235, 654, 273, 638, 438, 424, 243, 106, 648, 539, 366, 271, 427, 526, 186, 698, 532, 550, 287, 51, 299, 175, 276, 701, 369, 408, 57, 179, 11, 373, 454, 655, 666, 35, 429, 593, 513, 580, 687, 163, 297, 195, 421, 399, 290, 382, 319, 678, 446, 695, 134, 40, 423, 310, 223, 151, 270, 3, 707, 387, 531, 137, 535, 31, 658, 309, 558, 217, 17, 374, 190, 277, 605, 525, 485, 697, 676, 327, 542, 401, 483, 572, 135, 394, 615, 50, 471, 523, 665, 284, 671, 177, 430, 465, 515, 67, 574, 474, 491, 379, 141, 353, 108, 37, 136, 197, 533, 272, 400, 562, 21, 413, 492, 614, 498, 440, 462, 608, 312, 463, 213, 420, 476, 164, 182, 250, 91, 89, 253, 199, 540, 333, 700, 503, 634, 556, 590, 594, 635, 416, 683, 248, 63, 119, 507, 0, 130, 102, 32, 362, 296, 23, 619, 47, 156, 706, 596, 180, 183, 313, 5, 428, 528, 607, 350, 389, 328, 433, 112, 478, 551, 557, 93, 553, 685, 378, 407, 536, 359, 537, 83, 282, 625, 174, 371, 48, 360, 24, 691, 376, 452, 68, 42, 461, 598, 221, 411, 140, 181, 602, 118, 642, 116, 443, 381, 412, 94, 325, 77, 27, 482, 45, 230, 87, 292, 76, 497, 39, 169, 131, 19, 510, 432, 604, 193, 126, 367, 592, 105, 114, 210, 305, 149, 98, 259, 582, 449, 200, 455, 12, 320, 254, 570, 146, 426, 425, 457, 623, 601, 534, 464, 278, 242, 261, 36, 703, 251, 214, 441, 25, 304, 204, 
157, 587, 489, 487, 255, 669, 229, 283, 518, 690, 610, 128, 414, 538, 170, 86, 74, 138, 6, 198, 624, 384, 187, 530, 155, 348, 154, 699, 628, 493, 578, 166, 663, 653, 509, 124, 205, 132, 13, 34, 459, 225, 613, 43, 347, 670, 228, 358, 38, 631, 127, 417, 231, 565, 541, 612, 664, 566, 651, 600, 511, 645, 480, 616, 269, 288, 472, 520, 575, 606, 626, 168, 668, 469, 46, 546, 444, 238, 317, 69, 211, 583, 123, 391, 330, 527, 410, 295, 322, 329, 129, 240, 516, 153, 267, 85, 667, 633, 300, 20, 191, 684, 56, 370, 331] diff --git a/configs/recognition/uniformerv2/metafile.yml b/configs/recognition/uniformerv2/metafile.yml index acd35d3443..bf99abe094 100644 --- a/configs/recognition/uniformerv2/metafile.yml +++ b/configs/recognition/uniformerv2/metafile.yml @@ -6,26 +6,49 @@ Collections: Title: "UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer" Models: - - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb - Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py + - Name: uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py In Collection: UniFormer Metadata: Architecture: UniFormerV2-B/16 + Batch Size: 32 + Pretrained: CLIP-400M + Frame: 8 + Sampling method: Uniform + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs + Modality: RGB + Results: + - Dataset: Kinetics-400 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 84.3 + Top 5 Accuracy: 96.4 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb_20230313-e29fc968.pth + + - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py + In Collection: UniFormer + Metadata: + Architecture: UniFormerV2-B/16 + Batch Size: 32 Pretrained: Kinetics-710 - Resolution: short-side 320 Frame: 8 Sampling method: Uniform + Resolution: 224x224 + Training Data: Kinetics-400 + Training Resources: 8 GPUs Modality: RGB - Converted From: - Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md - Code: https://github.com/OpenGVLab/UniFormerV2 Results: - Dataset: Kinetics-400 Task: Action Recognition Metrics: Top 1 Accuracy: 85.8 Top 5 Accuracy: 97.1 - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-203d6aac.pth + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb_20230313-75be0806.pth - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb Config: 
configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py @@ -33,7 +56,7 @@ Models: Metadata: Architecture: UniFormerV2-L/14 Pretrained: Kinetics-710 - Resolution: short-side 320 + Resolution: 224x224 Frame: 8 Sampling method: Uniform Modality: RGB @@ -54,7 +77,7 @@ Models: Metadata: Architecture: UniFormerV2-L/14 Pretrained: Kinetics-710 - Resolution: short-side 320 + Resolution: 224x224 Frame: 16 Sampling method: Uniform Modality: RGB @@ -75,7 +98,7 @@ Models: Metadata: Architecture: UniFormerV2-L/14 Pretrained: Kinetics-710 - Resolution: short-side 320 + Resolution: 224x224 Frame: 32 Sampling method: Uniform Modality: RGB @@ -96,7 +119,7 @@ Models: Metadata: Architecture: UniFormerV2-L/14@336 Pretrained: Kinetics-710 - Resolution: short-side 320 + Resolution: 224x224 Frame: 32 Sampling method: Uniform Modality: RGB @@ -111,14 +134,15 @@ Models: Top 5 Accuracy: 98.4 Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-1dd7650f.pth - - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb - Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py + - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py In Collection: UniFormer Metadata: Architecture: UniFormerV2-B/16 Pretrained: Kinetics-710 Frame: 8 Sampling method: Uniform + Training Resources: 8 GPUs Modality: RGB Converted From: Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md @@ -129,7 +153,8 @@ Models: Metrics: Top 1 Accuracy: 86.4 Top 5 Accuracy: 97.3 - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-c62c4da4.pth + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb_20230313-544f06f0.pth - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py @@ -211,14 +236,15 @@ Models: Top 5 Accuracy: 98.5 Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-f984f5d2.pth - - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb - Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py + - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py In Collection: UniFormer Metadata: Architecture: UniFormerV2-B/16 - Pretrained: Kinetics-710 + Pretrained: CLIP-400M Frame: 8 Sampling method: Uniform + Training Resources: 8 GPUs Modality: RGB Converted From: Weights: 
https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md @@ -229,7 +255,30 @@ Models: Metrics: Top 1 Accuracy: 76.3 Top 5 Accuracy: 92.9 - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-8a7c4ac4.pth + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb_20230313-f02e48ad.pth + + - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py + In Collection: UniFormer + Metadata: + Architecture: UniFormerV2-B/16 + Pretrained: Kinetics-710 + Frame: 8 + Sampling method: Uniform + Training Resources: 8 GPUs + Modality: RGB + Converted From: + Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md + Code: https://github.com/OpenGVLab/UniFormerV2 + Results: + - Dataset: Kinetics-700 + Task: Action Recognition + Metrics: + Top 1 Accuracy: 75.9 + Top 5 Accuracy: 92.9 + Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb_20230313-69070837.pth - Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py @@ -353,14 +402,15 @@ Models: Code: https://github.com/OpenGVLab/UniFormerV2 Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb_20221219-55878cdc.pth - - Name: uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb - Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py + - Name: uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb + Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py In Collection: UniFormer Metadata: Architecture: UniFormerV2-B/16 Pretrained: Kinetics-710 + Kinetics-400 Frame: 8 Sampling method: Uniform + Training Resources: 16 GPUs Modality: RGB Converted From: Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md @@ -369,9 +419,10 @@ Models: - Dataset: Moments in Time V1 Task: Action Recognition Metrics: - Top 1 Accuracy: 42.7 - Top 5 Accuracy: 71.6 - Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-fddbc786.pth + Top 1 Accuracy: 42.3 + Top 5 Accuracy: 71.5 + Training Log: 
https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.log + Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb_20230313-a6f4a567.pth - Name: uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb Config: configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py index a4cae65831..a6e37c330a 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py @@ -23,7 +23,13 @@ n_head=12, mlp_factor=4., drop_path_rate=0., - mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-203d6aac.pth', # noqa: E501 + prefix='backbone.')), cls_head=dict( type='TimeSformerHead', dropout_ratio=0.5, @@ -38,11 +44,44 @@ # dataset settings dataset_type = 'VideoDataset' -data_root_val = 'data/mit_v1' -ann_file_test = 'data/mit_v1/val.csv' +data_root = 'data/mit/videos/training' +data_root_val = 'data/mit/videos/validation' +ann_file_train = 'data/mit/mit_train_list_videos.txt' +ann_file_val = 'data/mit/mit_val_list_videos.txt' +ann_file_test = 'data/mit/mit_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] test_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='UniformSample', clip_len=num_frames, num_clips=4, test_mode=True), @@ -53,8 +92,29 @@ dict(type='PackActionInputs') ] +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + 
persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) test_dataloader = dict( - batch_size=32, + batch_size=8, num_workers=8, persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=False), @@ -63,8 +123,44 @@ ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, - test_mode=True, - delimiter=' ')) + test_mode=True)) +val_evaluator = dict(type='AccMetric') test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') test_cfg = dict(type='TestLoop') + +base_lr = 2e-5 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=1 / 20, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + eta_min_ratio=1 / 20, + by_epoch=True, + begin=5, + end=24, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=512) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py index a3eddb0d04..4e47cabb84 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py @@ -23,13 +23,26 @@ n_head=12, mlp_factor=4., drop_path_rate=0., - mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='backbone.')), cls_head=dict( - type='TimeSformerHead', + type='UniFormerHead', dropout_ratio=0.5, num_classes=400, in_channels=768, - average_clips='prob'), + average_clips='prob', + channel_map= # noqa: E251 + 'configs/recognition/uniformerv2/k710_channel_map/map_k400.json', + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='cls_head.')), data_preprocessor=dict( type='ActionDataPreprocessor', mean=[114.75, 114.75, 114.75], @@ -38,11 +51,44 @@ # dataset settings dataset_type = 'VideoDataset' -data_root_val = 'data/k400' -ann_file_test = 'data/k400/val.csv' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 
'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] test_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='UniformSample', clip_len=num_frames, num_clips=4, test_mode=True), @@ -53,8 +99,29 @@ dict(type='PackActionInputs') ] +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) test_dataloader = dict( - batch_size=32, + batch_size=8, num_workers=8, persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=False), @@ -63,8 +130,45 @@ ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, - test_mode=True, - delimiter=',')) + test_mode=True)) +val_evaluator = dict(type='AccMetric') test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') test_cfg = dict(type='TestLoop') + +base_lr = 2e-6 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.5, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=4, + eta_min_ratio=0.5, + by_epoch=True, + begin=1, + end=5, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
+auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py index 4c91589dbb..a9f6f61413 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py @@ -23,13 +23,26 @@ n_head=12, mlp_factor=4., drop_path_rate=0., - mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='backbone.')), cls_head=dict( - type='TimeSformerHead', + type='UniFormerHead', dropout_ratio=0.5, num_classes=600, in_channels=768, - average_clips='prob'), + average_clips='prob', + channel_map= # noqa: E251 + 'configs/recognition/uniformerv2/k710_channel_map/map_k600.json', + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='cls_head.')), data_preprocessor=dict( type='ActionDataPreprocessor', mean=[114.75, 114.75, 114.75], @@ -38,11 +51,44 @@ # dataset settings dataset_type = 'VideoDataset' -data_root_val = 'data/k600' -ann_file_test = 'data/k600/val.csv' +data_root = 'data/kinetics600/videos_train' +data_root_val = 'data/kinetics600/videos_val' +ann_file_train = 'data/kinetics600/kinetics600_train_list_videos.txt' +ann_file_val = 'data/kinetics600/kinetics600_val_list_videos.txt' +ann_file_test = 'data/kinetics600/kinetics600_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] test_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='UniformSample', clip_len=num_frames, num_clips=4, test_mode=True), @@ -53,8 +99,29 @@ dict(type='PackActionInputs') ] +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + 
type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) test_dataloader = dict( - batch_size=32, + batch_size=8, num_workers=8, persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=False), @@ -63,8 +130,45 @@ ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, - test_mode=True, - delimiter=',')) + test_mode=True)) +val_evaluator = dict(type='AccMetric') test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') test_cfg = dict(type='TestLoop') + +base_lr = 2e-6 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.5, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=4, + eta_min_ratio=0.5, + by_epoch=True, + begin=1, + end=5, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py index 92494df5d7..5c59ad46f4 100644 --- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py @@ -23,13 +23,26 @@ n_head=12, mlp_factor=4., drop_path_rate=0., - mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='backbone.')), cls_head=dict( - type='TimeSformerHead', + type='UniFormerHead', dropout_ratio=0.5, num_classes=700, in_channels=768, - average_clips='prob'), + average_clips='prob', + channel_map= # noqa: E251 + 'configs/recognition/uniformerv2/k710_channel_map/map_k700.json', + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='cls_head.')), data_preprocessor=dict( type='ActionDataPreprocessor', mean=[114.75, 114.75, 114.75], @@ -38,11 +51,44 @@ # dataset settings dataset_type = 'VideoDataset' -data_root_val = 'data/k700' -ann_file_test = 'data/k700/val.csv' +data_root = 'data/kinetics700/videos_train' +data_root_val = 'data/kinetics700/videos_val' +ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt' +ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt' +ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + 
dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] test_pipeline = [ - dict(type='DecordInit'), + dict(type='DecordInit', **file_client_args), dict( type='UniformSample', clip_len=num_frames, num_clips=4, test_mode=True), @@ -53,8 +99,29 @@ dict(type='PackActionInputs') ] +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) test_dataloader = dict( - batch_size=32, + batch_size=8, num_workers=8, persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=False), @@ -63,8 +130,45 @@ ann_file=ann_file_test, data_prefix=dict(video=data_root_val), pipeline=test_pipeline, - test_mode=True, - delimiter=',')) + test_mode=True)) +val_evaluator = dict(type='AccMetric') test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') test_cfg = dict(type='TestLoop') + +base_lr = 2e-6 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.5, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=4, + eta_min_ratio=0.5, + by_epoch=True, + begin=1, + end=5, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). 
+auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics400-rgb.py new file mode 100644 index 0000000000..6e9c4f3908 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics400-rgb.py @@ -0,0 +1,163 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=True, + pretrained='ViT-B/16'), + cls_head=dict( + type='UniFormerHead', + dropout_ratio=0.5, + num_classes=400, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', 
shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1e-5 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=50, + eta_min_ratio=0.1, + by_epoch=True, + begin=5, + end=55, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics700-rgb.py new file mode 100644 index 0000000000..4a5b41d8c7 --- /dev/null +++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics700-rgb.py @@ -0,0 +1,163 @@ +_base_ = ['../../_base_/default_runtime.py'] + +# model settings +num_frames = 8 +model = dict( + type='Recognizer3D', + backbone=dict( + type='UniFormerV2', + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=True, + pretrained='ViT-B/16'), + cls_head=dict( + type='UniFormerHead', + dropout_ratio=0.5, + num_classes=700, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type='ActionDataPreprocessor', + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics700/videos_train' +data_root_val = 'data/kinetics700/videos_val' +ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt' +ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt' +ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict(type='UniformSample', clip_len=num_frames, num_clips=1), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 256)), + dict( + type='PytorchVideoWrapper', + op='RandAugment', + magnitude=7, + num_layers=4), + dict(type='RandomResizedCrop'), + dict(type='Resize', scale=(224, 224), keep_ratio=False), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +val_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=1, + test_mode=True), + dict(type='DecordDecode'), + 
dict(type='Resize', scale=(-1, 224)), + dict(type='CenterCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +test_pipeline = [ + dict(type='DecordInit', **file_client_args), + dict( + type='UniformSample', clip_len=num_frames, num_clips=4, + test_mode=True), + dict(type='DecordDecode'), + dict(type='Resize', scale=(-1, 224)), + dict(type='ThreeCrop', crop_size=224), + dict(type='FormatShape', input_format='NCTHW'), + dict(type='PackActionInputs') +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type='AccMetric') +test_evaluator = dict(type='AccMetric') +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +base_lr = 1e-5 +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type='LinearLR', + start_factor=0.1, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=50, + eta_min_ratio=0.1, + by_epoch=True, + begin=5, + end=55, + convert_to_iter_based=True) +] + +default_hooks = dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/mmaction/models/backbones/uniformer.py b/mmaction/models/backbones/uniformer.py index 97ac6184c1..5773313778 100644 --- a/mmaction/models/backbones/uniformer.py +++ b/mmaction/models/backbones/uniformer.py @@ -495,7 +495,7 @@ class UniFormer(BaseModule): attn_drop_rate (float): Attention dropout rate. Defaults to 0.0. drop_path_rate (float): Stochastic depth rates. Defaults to 0.0. - clip_pretrained (bool): Whether to load pretrained CLIP visual encoder. + pretrained2d (bool): Whether to load pretrained from 2D model. Defaults to True. pretrained (str): Name of pretrained model. Defaults to None. 
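(Editor's aside, not part of the patch.) The Kinetics-710-pretrained configs above all follow the same pattern: `UniFormerHead` points `init_cfg=dict(type='Pretrained', ..., prefix='cls_head.')` at the K710 checkpoint, and a `channel_map` JSON (e.g. `map_k400.json`) tells the head which of the 710 output channels correspond to the target label set. The sketch below illustrates only that channel-selection step; it is a minimal, hypothetical example that assumes the map is a plain JSON list of K710 class indices, and the helper name `select_head_channels` is invented for illustration.

```python
# Illustrative sketch only (not part of the patch): how a Kinetics-710
# classifier head can be narrowed to a target label set (e.g. Kinetics-400)
# by indexing its weights with a JSON channel map. Assumes the map file is
# simply a list of K710 class indices for the target dataset.
import json

import torch


def select_head_channels(state_dict: dict, channel_map: str) -> dict:
    """Keep only the output channels listed in the channel-map JSON."""
    with open(channel_map) as f:
        selected = json.load(f)  # e.g. 400 indices into the 710 classes
    # Advanced indexing on dim 0 keeps only the selected class rows.
    return {k: v[selected] for k, v in state_dict.items()}


# Toy K710 head: fc weights [710, 768] and bias [710].
k710_head = {
    'fc_cls.weight': torch.randn(710, 768),
    'fc_cls.bias': torch.zeros(710),
}
# k400_head = select_head_channels(k710_head, 'map_k400.json')
# k400_head['fc_cls.weight'].shape  -> torch.Size([400, 768])
```

Under this reading, the same K710 checkpoint can seed the K400, K600 and K700 heads in the configs above because only the classifier rows matching the target classes are kept.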
@@ -519,7 +519,7 @@ def __init__( drop_rate: float = 0., attn_drop_rate: float = 0., drop_path_rate: float = 0., - clip_pretrained: bool = True, + pretrained2d: bool = True, pretrained: Optional[str] = None, init_cfg: Optional[Union[Dict, List[Dict]]] = [ dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), @@ -529,7 +529,7 @@ def __init__( super().__init__(init_cfg=init_cfg) self.pretrained = pretrained - self.clip_pretrained = clip_pretrained + self.pretrained2d = pretrained2d self.patch_embed1 = SpeicalPatchEmbed( img_size=img_size, patch_size=4, @@ -641,7 +641,7 @@ def _load_pretrained(self, pretrained: str = None) -> None: def init_weights(self): """Initialize the weights in backbone.""" - if self.clip_pretrained: + if self.pretrained2d: logger = MMLogger.get_current_instance() logger.info(f'load model from: {self.pretrained}') self._load_pretrained(self.pretrained) diff --git a/mmaction/models/backbones/uniformerv2.py b/mmaction/models/backbones/uniformerv2.py index 64b0ba8faf..14571af5bd 100644 --- a/mmaction/models/backbones/uniformerv2.py +++ b/mmaction/models/backbones/uniformerv2.py @@ -548,23 +548,24 @@ def _load_pretrained(self, pretrained: str = None) -> None: pretrained (str): Model name of pretrained CLIP visual encoder. Defaults to None. """ - if pretrained is not None: - model_path = _MODELS[pretrained] - logger.info(f'Load CLIP pretrained model from {model_path}') - state_dict = _load_checkpoint(model_path, map_location='cpu') - state_dict_3d = self.state_dict() - for k in state_dict.keys(): - if k in state_dict_3d.keys( - ) and state_dict[k].shape != state_dict_3d[k].shape: - if len(state_dict_3d[k].shape) <= 2: - logger.info(f'Ignore: {k}') - continue - logger.info(f'Inflate: {k}, {state_dict[k].shape}' + - f' => {state_dict_3d[k].shape}') - time_dim = state_dict_3d[k].shape[2] - state_dict[k] = self._inflate_weight( - state_dict[k], time_dim) - self.load_state_dict(state_dict, strict=False) + assert pretrained is not None, \ + 'please specify clip pretraied checkpoint' + + model_path = _MODELS[pretrained] + logger.info(f'Load CLIP pretrained model from {model_path}') + state_dict = _load_checkpoint(model_path, map_location='cpu') + state_dict_3d = self.state_dict() + for k in state_dict.keys(): + if k in state_dict_3d.keys( + ) and state_dict[k].shape != state_dict_3d[k].shape: + if len(state_dict_3d[k].shape) <= 2: + logger.info(f'Ignore: {k}') + continue + logger.info(f'Inflate: {k}, {state_dict[k].shape}' + + f' => {state_dict_3d[k].shape}') + time_dim = state_dict_3d[k].shape[2] + state_dict[k] = self._inflate_weight(state_dict[k], time_dim) + self.load_state_dict(state_dict, strict=False) def init_weights(self): """Initialize the weights in backbone.""" diff --git a/mmaction/models/heads/__init__.py b/mmaction/models/heads/__init__.py index 4cc8d20a4d..5a1b74a9f8 100644 --- a/mmaction/models/heads/__init__.py +++ b/mmaction/models/heads/__init__.py @@ -12,10 +12,11 @@ from .tsm_head import TSMHead from .tsn_audio_head import TSNAudioHead from .tsn_head import TSNHead +from .uniformer_head import UniFormerHead from .x3d_head import X3DHead __all__ = [ 'BaseHead', 'GCNHead', 'I3DHead', 'MViTHead', 'OmniHead', 'SlowFastHead', 'TPNHead', 'TRNHead', 'TSMHead', 'TSNAudioHead', 'TSNHead', - 'TimeSformerHead', 'X3DHead', 'RGBPoseHead' + 'TimeSformerHead', 'UniFormerHead', 'RGBPoseHead', 'X3DHead' ] diff --git a/mmaction/models/heads/uniformer_head.py b/mmaction/models/heads/uniformer_head.py new file mode 100644 index 0000000000..e83b552b93 --- /dev/null +++ 
b/mmaction/models/heads/uniformer_head.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +from mmengine.fileio import load +from mmengine.logging import MMLogger +from mmengine.runner.checkpoint import _load_checkpoint_with_prefix +from torch import Tensor, nn + +from mmaction.registry import MODELS +from mmaction.utils import ConfigType +from .base import BaseHead + + +@MODELS.register_module() +class UniFormerHead(BaseHead): + """Classification head for UniFormer. supports loading pretrained + Kinetics-710 checkpoint to fine-tuning on other Kinetics dataset. + + A pytorch implement of: `UniFormerV2: Spatiotemporal + Learning by Arming Image ViTs with Video UniFormer + ` + + Args: + num_classes (int): Number of classes to be classified. + in_channels (int): Number of channels in input feature. + loss_cls (dict or ConfigDict): Config for building loss. + Defaults to `dict(type='CrossEntropyLoss')`. + dropout_ratio (float): Probability of dropout layer. + Defaults to : 0.0. + channel_map (str, optional): Channel map file to selecting + channels from pretrained head with extra channels. + Defaults to None. + init_cfg (dict or ConfigDict, optional): Config to control the + initialization. Defaults to + ``[ + dict(type='TruncNormal', layer='Linear', std=0.01) + ]``. + kwargs (dict, optional): Any keyword argument to be used to initialize + the head. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + loss_cls: ConfigType = dict(type='CrossEntropyLoss'), + dropout_ratio: float = 0.0, + channel_map: Optional[str] = None, + init_cfg: Optional[dict] = dict( + type='TruncNormal', layer='Linear', std=0.02), + **kwargs) -> None: + super().__init__( + num_classes, in_channels, loss_cls, init_cfg=init_cfg, **kwargs) + self.channel_map = channel_map + self.dropout_ratio = dropout_ratio + + if self.dropout_ratio != 0: + self.dropout = nn.Dropout(p=self.dropout_ratio) + else: + self.dropout = None + self.fc_cls = nn.Linear(self.in_channels, self.num_classes) + + def _select_channels(self, stact_dict): + selected_channels = load(self.channel_map) + for key in stact_dict: + stact_dict[key] = stact_dict[key][selected_channels] + + def init_weights(self) -> None: + """Initiate the parameters from scratch.""" + if self.init_cfg['type'] == 'Pretrained': + assert self.channel_map is not None, \ + 'load cls_head weights needs to specify the channel map file' + logger = MMLogger.get_current_instance() + pretrained = self.init_cfg['checkpoint'] + logger.info(f'load pretrained model from {pretrained}') + state_dict = _load_checkpoint_with_prefix( + 'cls_head.', pretrained, map_location='cpu') + self._select_channels(state_dict) + msg = self.load_state_dict(state_dict, strict=False) + logger.info(msg) + else: + super().init_weights() + + def forward(self, x: Tensor, **kwargs) -> Tensor: + """Defines the computation performed at every call. + + Args: + x (Tensor): The input data. + + Returns: + Tensor: The classification scores for input samples. 
+ """ + # [N, in_channels] + if self.dropout is not None: + x = self.dropout(x) + # [N, in_channels] + cls_score = self.fc_cls(x) + # [N, num_classes] + return cls_score diff --git a/tests/models/backbones/test_uniformerv2.py b/tests/models/backbones/test_uniformerv2.py index 3345892eb7..4858001c4d 100644 --- a/tests/models/backbones/test_uniformerv2.py +++ b/tests/models/backbones/test_uniformerv2.py @@ -28,6 +28,7 @@ def test_uniformerv2_backbone(): n_head=12, mlp_factor=4., drop_path_rate=0., + clip_pretrained=False, mlp_dropout=[0.5, 0.5, 0.5, 0.5]) model.init_weights() @@ -56,6 +57,7 @@ def test_uniformerv2_backbone(): n_head=12, mlp_factor=4., drop_path_rate=0., + clip_pretrained=False, mlp_dropout=[0.5, 0.5, 0.5, 0.5]) model.init_weights() diff --git a/tests/models/utils/test_gradcam.py b/tests/models/utils/test_gradcam.py index d1a39ef87c..4cc8b1b8b0 100644 --- a/tests/models/utils/test_gradcam.py +++ b/tests/models/utils/test_gradcam.py @@ -167,7 +167,7 @@ def test_csn(): recognizer = MODELS.build(config.model) recognizer.cfg = config - input_shape = (1, 1, 3, 32, 32, 32) + input_shape = (1, 1, 3, 32, 16, 16) target_layer_name = 'backbone/layer4/1/relu' _do_test_3D_models(recognizer, target_layer_name, input_shape) @@ -230,6 +230,6 @@ def test_x3d(): config.model['backbone']['pretrained'] = None recognizer = MODELS.build(config.model) recognizer.cfg = config - input_shape = (1, 1, 3, 13, 32, 32) + input_shape = (1, 1, 3, 13, 16, 16) target_layer_name = 'backbone/layer4/1/relu' _do_test_3D_models(recognizer, target_layer_name, input_shape) diff --git a/tests/utils/test_misc.py b/tests/utils/test_misc.py index e0886162a6..eeeba0d402 100644 --- a/tests/utils/test_misc.py +++ b/tests/utils/test_misc.py @@ -1,11 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. import os import os.path as osp +import platform from tempfile import TemporaryDirectory +import pytest + from mmaction.utils import frame_extract +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') def test_frame_extract(): data_prefix = osp.normpath(osp.join(osp.dirname(__file__), '../data')) video_path = osp.join(data_prefix, 'test.mp4') diff --git a/tests/visualization/test_action_visualizer.py b/tests/visualization/test_action_visualizer.py index 3c7a1db59d..c86b324af9 100644 --- a/tests/visualization/test_action_visualizer.py +++ b/tests/visualization/test_action_visualizer.py @@ -1,5 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. +import platform + import decord +import pytest import torch from mmengine.structures import LabelData @@ -7,6 +10,7 @@ from mmaction.visualization import ActionVisualizer +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') def test_visualizer(): video = decord.VideoReader('./demo/demo.mp4') video = video.get_batch(range(32)).asnumpy() diff --git a/tests/visualization/test_video_backend.py b/tests/visualization/test_video_backend.py index 0de82465ee..c5153d812d 100644 --- a/tests/visualization/test_video_backend.py +++ b/tests/visualization/test_video_backend.py @@ -1,11 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import os import os.path as osp +import platform import time from pathlib import Path from tempfile import TemporaryDirectory import decord +import pytest import torch from mmengine.structures import LabelData @@ -16,6 +18,7 @@ register_all_modules() +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') def test_local_visbackend(): video = decord.VideoReader('./demo/demo.mp4') video = video.get_batch(range(32)).asnumpy() @@ -37,6 +40,7 @@ def test_local_visbackend(): return +@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit') def test_tensorboard_visbackend(): video = decord.VideoReader('./demo/demo.mp4') video = video.get_batch(range(32)).asnumpy() From 5f78a1167461434cc3f68b9c37dc0fd4e9d5ca91 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 6 Apr 2023 20:46:44 +0800 Subject: [PATCH 34/36] [doc] fix docs conflicts (#2342) --- .circleci/test.yml | 4 +- .github/workflows/merge_stage_test.yml | 8 +- .github/workflows/pr_stage_test.yml | 6 +- README.md | 37 +++--- README_zh-CN.md | 16 +-- docker/Dockerfile | 2 +- docs/en/advanced_guides/dataflow.md | 2 + docs/en/advanced_guides/depoly.md | 3 + docs/en/api.rst | 140 ++++++++++++++++++++++ docs/en/conf.py | 1 + docs/en/get_started/faq.md | 2 +- docs/en/get_started/guide_to_framework.md | 2 +- docs/en/get_started/installation.md | 6 +- docs/en/get_started/overview.md | 38 +++--- docs/en/index.rst | 6 + docs/en/merge_docs.sh | 17 +-- docs/en/migration.md | 4 +- docs/en/switch_language.md | 2 +- docs/zh_cn/get_started.md | 6 +- docs/zh_cn/switch_language.md | 2 +- projects/ctrgcn/README.md | 2 +- projects/example_project/README.md | 2 +- projects/msg3d/README.md | 2 +- tests/models/utils/test_gradcam.py | 2 +- 24 files changed, 234 insertions(+), 78 deletions(-) create mode 100644 docs/en/api.rst diff --git a/.circleci/test.yml b/.circleci/test.yml index aafba494dd..8ead3de0a8 100644 --- a/.circleci/test.yml +++ b/.circleci/test.yml @@ -63,7 +63,7 @@ jobs: command: | pip install git+ssh://git@github.com/open-mmlab/mmengine.git@main pip install -U openmim - mim install 'mmcv >= 2.0.0rc1' + mim install 'mmcv >= 2.0.0' pip install git+ssh://git@github.com/open-mmlab/mmdetection.git@dev-3.x pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x pip install -r requirements.txt @@ -122,7 +122,7 @@ jobs: command: | docker exec mmaction pip install git+https://git@github.com/open-mmlab/mmengine.git@main docker exec mmaction pip install -U openmim - docker exec mmaction mim install 'mmcv >= 2.0.0rc1' + docker exec mmaction mim install 'mmcv >= 2.0.0' docker exec mmaction pip install git+https://git@github.com/open-mmlab/mmdetection.git@dev-3.x docker exec mmaction pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x docker exec mmaction pip install -r requirements.txt diff --git a/.github/workflows/merge_stage_test.yml b/.github/workflows/merge_stage_test.yml index cf1f2ed10c..60df0a1245 100644 --- a/.github/workflows/merge_stage_test.yml +++ b/.github/workflows/merge_stage_test.yml @@ -55,7 +55,7 @@ jobs: - name: Install MMCV run: | pip install -U openmim - mim install 'mmcv >= 2.0.0rc1' + mim install 'mmcv >= 2.0.0' - name: Install MMDet run: pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x - name: Install MMCls @@ -123,7 +123,7 @@ jobs: - name: Install MMCV run: | pip install -U openmim - mim install 'mmcv >= 2.0.0rc1' + mim install 'mmcv >= 2.0.0' - name: Install MMDet run: pip install 
git+https://github.com/open-mmlab/mmdetection.git@dev-3.x - name: Install MMCls @@ -187,7 +187,7 @@ jobs: run: | pip install git+https://github.com/open-mmlab/mmengine.git@main pip install -U openmim - mim install 'mmcv >= 2.0.0rc1' + mim install 'mmcv >= 2.0.0' pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x pip install -r requirements.txt @@ -225,7 +225,7 @@ jobs: run: | pip install git+https://github.com/open-mmlab/mmengine.git@main pip install -U openmim - mim install 'mmcv >= 2.0.0rc1' + mim install 'mmcv >= 2.0.0' pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x pip install -r requirements.txt diff --git a/.github/workflows/pr_stage_test.yml b/.github/workflows/pr_stage_test.yml index a0eb9d5d00..a8b5c4c7a2 100644 --- a/.github/workflows/pr_stage_test.yml +++ b/.github/workflows/pr_stage_test.yml @@ -46,7 +46,7 @@ jobs: - name: Install MMCV run: | pip install -U openmim - mim install 'mmcv >= 2.0.0rc1' + mim install 'mmcv >= 2.0.0' - name: Install MMDet run: pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x - name: Install MMCls @@ -114,7 +114,7 @@ jobs: run: | pip install git+https://github.com/open-mmlab/mmengine.git@main pip install -U openmim - mim install 'mmcv >= 2.0.0rc1' + mim install 'mmcv >= 2.0.0' pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x pip install -r requirements.txt @@ -161,7 +161,7 @@ jobs: run: | pip install git+https://github.com/open-mmlab/mmengine.git@main pip install -U openmim - mim install 'mmcv >= 2.0.0rc1' + mim install 'mmcv >= 2.0.0' pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x pip install -r requirements.txt diff --git a/README.md b/README.md index 25b703306c..f01bf8cd5f 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ -[![Documentation](https://readthedocs.org/projects/mmaction2/badge/?version=latest)](https://mmaction2.readthedocs.io/en/1.x/) +[![Documentation](https://readthedocs.org/projects/mmaction2/badge/?version=latest)](https://mmaction2.readthedocs.io/en/latest/) [![actions](https://github.com/open-mmlab/mmaction2/workflows/build/badge.svg)](https://github.com/open-mmlab/mmaction2/actions) [![codecov](https://codecov.io/gh/open-mmlab/mmaction2/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmaction2) [![PyPI](https://img.shields.io/pypi/v/mmaction2)](https://pypi.org/project/mmaction2/) @@ -25,10 +25,10 @@ [![Average time to resolve an issue](https://isitmaintained.com/badge/resolution/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues) [![Percentage of issues still open](https://isitmaintained.com/badge/open/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues) -[📘Documentation](https://mmaction2.readthedocs.io/en/1.x/) | -[🛠️Installation](https://mmaction2.readthedocs.io/en/1.x/get_started.html) | -[👀Model Zoo](https://mmaction2.readthedocs.io/en/1.x/modelzoo.html) | -[🆕Update News](https://mmaction2.readthedocs.io/en/1.x/notes/changelog.html) | +[📘Documentation](https://mmaction2.readthedocs.io/en/latest/) | +[🛠️Installation](https://mmaction2.readthedocs.io/en/latest/get_started.html) | +[👀Model Zoo](https://mmaction2.readthedocs.io/en/latest/modelzoo.html) | 
+[🆕Update News](https://mmaction2.readthedocs.io/en/latest/notes/changelog.html) | [🚀Ongoing Projects](https://github.com/open-mmlab/mmaction2/projects) | [🤔Reporting Issues](https://github.com/open-mmlab/mmaction2/issues/new/choose) @@ -58,21 +58,22 @@ English | [简体中文](/README_zh-CN.md) ## 📄 Table of Contents +- [📄 Table of Contents](#-table-of-contents) - [🥳 🚀 What's New](#--whats-new-) - [📖 Introduction](#-introduction-) - [🎁 Major Features](#-major-features-) -- [🛠️ Installation](#-installation-) +- [🛠️ Installation](#️-installation-) - [👀 Model Zoo](#-model-zoo-) - [👨‍🏫 Get Started](#-get-started-) - [🎫 License](#-license-) - [🖊️ Citation](#️-citation-) - [🙌 Contributing](#-contributing-) - [🤝 Acknowledgement](#-acknowledgement-) -- [🏗️ Projects in OpenMMLab](#-projects-in-openmmlab-) +- [🏗️ Projects in OpenMMLab](#️-projects-in-openmmlab-) ## 🥳 🚀 What's New [🔝](#-table-of-contents) -**The default branch has been switched to `1.x` from `master`, and we encourage users to migrate to the latest version with more supported models, stronger pre-training checkpoints and simpler coding. Please refer to [Migration Guide](https://mmaction2.readthedocs.io/en/1.x/migration.html) for more details.** +**The default branch has been switched to `1.x` from `master`, and we encourage users to migrate to the latest version with more supported models, stronger pre-training checkpoints and simpler coding. Please refer to [Migration Guide](https://mmaction2.readthedocs.io/en/latest/migration.html) for more details.** **Release (2023.02.10)**: v1.0.0rc3 with the following new features: @@ -112,7 +113,7 @@ It is a part of the [OpenMMLab](http://openmmlab.com/) project. MMAction2 depends on [PyTorch](https://pytorch.org/), [MMCV](https://github.com/open-mmlab/mmcv), [MMEngine](https://github.com/open-mmlab/mmengine), [MMDetection](https://github.com/open-mmlab/mmdetection) (optional, for spatial-temporal detection tasks) and [MMPose](https://github.com/open-mmlab/mmpose) (optional, for skeleton based tasks). -Please refer to [install.md](https://mmaction2.readthedocs.io/en/1.x/get_started.html) for detailed instructions. +Please refer to [install.md](https://mmaction2.readthedocs.io/en/latest/get_started.html) for detailed instructions.
Quick instructions @@ -122,9 +123,9 @@ conda create --name openmmlab python=3.8 -y conda activate open-mmlab conda install pytorch torchvision -c pytorch # This command will automatically install the latest version PyTorch and cudatoolkit, please check whether they match your environment. pip install -U openmim -mim install mmengine 'mmcv>=2.0.0rc1' -mim install "mmdet>=3.0.0rc5" # optional -mim install "mmpose>=1.0.0rc0" # optional +mim install mmengine 'mmcv>=2.0.0' +mim install "mmdet>=3.0.0" # optional +mim install "mmpose>=1.0.0" # optional git clone https://github.com/open-mmlab/mmaction2.git cd mmaction2 git checkout 1.x @@ -135,7 +136,7 @@ pip3 install -e . ## 👀 Model Zoo [🔝](#-table-of-contents) -Results and models are available in the [model zoo](https://mmaction2.readthedocs.io/en/1.x/modelzoo.html). +Results and models are available in the [model zoo](https://mmaction2.readthedocs.io/en/latest/modelzoo.html).
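(Editor's aside, not part of the patch.) After running the quick install commands above, a short import check is a convenient way to confirm that the editable install and the updated `mmcv`/`mmengine` pins were picked up. This is only a sketch and assumes the usual `__version__` attributes exposed by the OpenMMLab packages.

```python
# Quick post-install sanity check (illustrative only, not from the patch).
# Assumes the standard __version__ attributes of the OpenMMLab packages.
import mmaction
import mmcv
import mmengine

print('mmaction2:', mmaction.__version__)   # 1.x series on this branch
print('mmcv     :', mmcv.__version__)       # should satisfy >= 2.0.0
print('mmengine :', mmengine.__version__)
```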
@@ -284,11 +285,11 @@ Results and models are available in the [model zoo](https://mmaction2.readthedoc For tutorials, we provide the following user guides for basic usage: -- [Migration from MMAction2 0.X](https://mmaction2.readthedocs.io/en/1.x/migration.html) -- [Learn about Configs](https://mmaction2.readthedocs.io/en/1.x/user_guides/1_config.html#) -- [Prepare Datasets](https://mmaction2.readthedocs.io/en/1.x/user_guides/2_data_prepare.html) -- [Inference with Existing Models](https://mmaction2.readthedocs.io/en/1.x/user_guides/3_inference.html) -- [Training and Testing](https://mmaction2.readthedocs.io/en/1.x/user_guides/4_train_test.html) +- [Migration from MMAction2 0.X](https://mmaction2.readthedocs.io/en/latest/migration.html) +- [Learn about Configs](https://mmaction2.readthedocs.io/en/latest/user_guides/1_config.html#) +- [Prepare Datasets](https://mmaction2.readthedocs.io/en/latest/user_guides/2_data_prepare.html) +- [Inference with Existing Models](https://mmaction2.readthedocs.io/en/latest/user_guides/3_inference.html) +- [Training and Testing](https://mmaction2.readthedocs.io/en/latest/user_guides/4_train_test.html)
Research works built on MMAction2 by users from community diff --git a/README_zh-CN.md b/README_zh-CN.md index c2ffb09702..5b03609b9d 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -17,7 +17,7 @@ -[![Documentation](https://readthedocs.org/projects/mmaction2/badge/?version=latest)](https://mmaction2.readthedocs.io/en/1.x/) +[![Documentation](https://readthedocs.org/projects/mmaction2/badge/?version=latest)](https://mmaction2.readthedocs.io/en/latest/) [![actions](https://github.com/open-mmlab/mmaction2/workflows/build/badge.svg)](https://github.com/open-mmlab/mmaction2/actions) [![codecov](https://codecov.io/gh/open-mmlab/mmaction2/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmaction2) [![PyPI](https://img.shields.io/pypi/v/mmaction2)](https://pypi.org/project/mmaction2/) @@ -25,9 +25,9 @@ [![Average time to resolve an issue](https://isitmaintained.com/badge/resolution/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues) [![Percentage of issues still open](https://isitmaintained.com/badge/open/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues) -[📘文档](https://mmaction2.readthedocs.io/zh_CN//1.x/) | -[🛠️安装指南](https://mmaction2.readthedocs.io/zh_CN//1.x/get_started.html) | -[👀模型库](https://mmaction2.readthedocs.io/zh_CN//1.x/modelzoo.html) | +[📘文档](https://mmaction2.readthedocs.io/zh_CN/1.x/) | +[🛠️安装指南](https://mmaction2.readthedocs.io/zh_CN/1.x/get_started.html) | +[👀模型库](https://mmaction2.readthedocs.io/zh_CN/1.x/modelzoo.html) | [🆕更新](https://mmaction2.readthedocs.io/zh_CN/1.x/notes/changelog.html) | [🚀进行中项目](https://github.com/open-mmlab/mmaction2/projects) | [🤔问题反馈](https://github.com/open-mmlab/mmaction2/issues/new/choose) @@ -107,9 +107,9 @@ conda create --name openmmlab python=3.8 -y conda activate open-mmlab conda install pytorch torchvision -c pytorch # 以上命令将自动安装最新版本的 PyTorch 和 cudatoolkit,请检查它们是否和你的环境匹配 pip install -U openmim -mim install mmengine 'mmcv>=2.0.0rc1' -mim install "mmdet>=3.0.0rc5" # 可选 -mim install "mmpose>=1.0.0rc0" # 可选 +mim install mmengine 'mmcv>=2.0.0' +mim install "mmdet>=3.0.0" # 可选 +mim install "mmpose>=1.0.0" # 可选 git clone https://github.com/open-mmlab/mmaction2.git cd mmaction2 git checkout 1.x @@ -256,7 +256,7 @@ MMAction2 将跟进学界的最新进展,并支持更多算法和框架。如 ## 数据集准备 -请参考 [数据准备](https://mmaction2.readthedocs.io/en/1.x/user_guides/2_data_prepare.html) 了解数据集准备概况。所有支持的数据集都列于 [数据集清单](https://mmaction2.readthedocs.io/zh_CN/latest/supported_datasets.html) 中。 +请参考 [数据准备](https://mmaction2.readthedocs.io/en/latest/user_guides/2_data_prepare.html) 了解数据集准备概况。所有支持的数据集都列于 [数据集清单](https://mmaction2.readthedocs.io/zh_CN/latest/supported_datasets.html) 中。 ## FAQ diff --git a/docker/Dockerfile b/docker/Dockerfile index 45c82cfcb7..6622f147ea 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -17,7 +17,7 @@ RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 lib # Install MMCV RUN pip install openmim -RUN mim install mmengine "mmcv>=2.0rc1" +RUN mim install mmengine "mmcv>=2.0.0" # Install MMAction2 RUN conda clean --all diff --git a/docs/en/advanced_guides/dataflow.md b/docs/en/advanced_guides/dataflow.md index 0cc136162a..5723cc1557 100644 --- a/docs/en/advanced_guides/dataflow.md +++ b/docs/en/advanced_guides/dataflow.md @@ -1 +1,3 @@ # Dataflow in MMAction2 + +coming soon... 
diff --git a/docs/en/advanced_guides/depoly.md b/docs/en/advanced_guides/depoly.md index e69de29bb2..58e9f58ea4 100644 --- a/docs/en/advanced_guides/depoly.md +++ b/docs/en/advanced_guides/depoly.md @@ -0,0 +1,3 @@ +# How to deploy MMAction2 models + +coming soon... diff --git a/docs/en/api.rst b/docs/en/api.rst new file mode 100644 index 0000000000..4431c7734b --- /dev/null +++ b/docs/en/api.rst @@ -0,0 +1,140 @@ +mmaction.apis +-------------- +.. automodule:: mmaction.apis + :members: + +mmaction.datasets +-------------- + +datasets +^^^^^^^^^^ +.. automodule:: mmaction.datasets + :members: + +transforms +^^^^^^^^^^^^ +.. automodule:: mmaction.datasets.transforms + :members: + +mmaction.engine +-------------- + +hooks +^^^^^^^^^^ +.. automodule:: mmaction.engine.hooks + :members: + +optimizers +^^^^^^^^^^^^^^^ +.. automodule:: mmaction.engine.optimizers + :members: + +runner +^^^^^^^^^^ +.. automodule:: mmaction.engine.runner + :members: + + +mmaction.evaluation +-------------------- + +functional +^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.evaluation.functional + :members: + +metrics +^^^^^^^^^^ +.. automodule:: mmaction.evaluation.metrics + :members: + + +mmaction.models +-------------- + +backbones +^^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.backbones + :members: + +common +^^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.common + :members: + +data_preprocessors +^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.data_preprocessors + :members: + +heads +^^^^^^^^^^^^^^^ +.. automodule:: mmaction.models.heads + :members: + +localizers +^^^^^^^^^^ +.. automodule:: mmaction.models.localizers + :members: + + +losses +^^^^^^^^^^ +.. automodule:: mmaction.models.losses + :members: + +necks +^^^^^^^^^^^^ +.. automodule:: mmaction.models.necks + :members: + +roi_heads +^^^^^^^^^^^^^ +.. automodule:: mmaction.models.roi_heads + :members: + +recognizers +^^^^^^^^^^^^^ +.. automodule:: mmaction.models.seg_heads + :members: + +task_modules +^^^^^^^^^^^^^ +.. automodule:: mmaction.models.task_modules + :members: + + +utils +^^^^^^^^^^ +.. automodule:: mmaction.models.utils + :members: + + +mmaction.structures +-------------------- + +structures +^^^^^^^^^^^^^^^^^ +.. automodule:: mmaction.structures + :members: + +bbox +^^^^^^^^^^ +.. automodule:: mmaction.structures.bbox + :members: + + +mmaction.testing +---------------- +.. automodule:: mmaction.testing + :members: + +mmaction.visualization +-------------------- +.. automodule:: mmaction.visualization + :members: + +mmaction.utils +-------------- +.. automodule:: mmaction.utils + :members: diff --git a/docs/en/conf.py b/docs/en/conf.py index 6ff7f10029..ba54a05953 100644 --- a/docs/en/conf.py +++ b/docs/en/conf.py @@ -124,6 +124,7 @@ def get_version(): html_css_files = ['css/readthedocs.css'] myst_enable_extensions = ['colon_fence'] +myst_heading_anchors = 3 def builder_inited_handler(app): diff --git a/docs/en/get_started/faq.md b/docs/en/get_started/faq.md index 2cbe7787b3..7ef9cdd53e 100644 --- a/docs/en/get_started/faq.md +++ b/docs/en/get_started/faq.md @@ -30,7 +30,7 @@ If the contents here do not cover your issue, please create an issue using the [ - **"Why I got the error message 'Please install XXCODEBASE to use XXX' even if I have already installed XXCODEBASE?"** - You got that error message because our project failed to import a function or a class from XXCODEBASE. You can try to run the corresponding line to see what happens. 
One possible reason is, for some codebases in OpenMMLAB, you need to install mmcv and mmengine before you install them. You could follow this [tutorial](https://mmaction2.readthedocs.io/en/1.x/get_started.html#installation) to install them. + You got that error message because our project failed to import a function or a class from XXCODEBASE. You can try to run the corresponding line to see what happens. One possible reason is, for some codebases in OpenMMLAB, you need to install mmcv and mmengine before you install them. You could follow this [tutorial](https://mmaction2.readthedocs.io/en/latest/get_started.html#installation) to install them. ## Data diff --git a/docs/en/get_started/guide_to_framework.md b/docs/en/get_started/guide_to_framework.md index 68f8bdfd41..ab66ba196f 100644 --- a/docs/en/get_started/guide_to_framework.md +++ b/docs/en/get_started/guide_to_framework.md @@ -738,7 +738,7 @@ for epoch in range(max_epochs): ## Step6: Train and Test with MMEngine (Recommended) -For more details on training and testing, you can refer to [MMAction2 Tutorial](https://mmaction2.readthedocs.io/en/1.x/user_guides/4_train_test.html). For more information on `Runner`, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html). +For more details on training and testing, you can refer to [MMAction2 Tutorial](https://mmaction2.readthedocs.io/en/latest/user_guides/4_train_test.html). For more information on `Runner`, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html). ```python from mmengine.runner import Runner diff --git a/docs/en/get_started/installation.md b/docs/en/get_started/installation.md index 9d48be6030..294bf17443 100644 --- a/docs/en/get_started/installation.md +++ b/docs/en/get_started/installation.md @@ -45,7 +45,7 @@ We recommend that users follow our best practices to install MMAction2. However, ```shell pip install -U openmim -mim install mmengine 'mmcv>=2.0.0rc1' +mim install mmengine 'mmcv>=2.0.0' ``` **Step 2.** Install MMAction2. @@ -80,7 +80,7 @@ git checkout dev-1.x Just install with pip. ```shell -pip install "mmaction2>=1.0rc0" +pip install "mmaction2>=1.0.0" ``` ## Verify the installation @@ -167,7 +167,7 @@ This requires manually specifying a find-url based on PyTorch version and its CU For example, the following command install mmcv built for PyTorch 1.10.x and CUDA 11.3. ```shell -pip install 'mmcv>=2.0.0rc1' -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html +pip install 'mmcv>=2.0.0' -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html ``` ### Install on CPU-only platforms diff --git a/docs/en/get_started/overview.md b/docs/en/get_started/overview.md index 4857991711..0ddc07b275 100644 --- a/docs/en/get_started/overview.md +++ b/docs/en/get_started/overview.md @@ -32,66 +32,66 @@ We have prepared a wealth of documents to meet your various needs:
For the basic usage of MMAction2 -- [Installation](docs/en/get_started/installation.md) -- [Quick Run](docs/en/get_started/quick_run.md) -- [Inference](docs/en/user_guides/Inference.md) +- [Installation](installation.md) +- [Quick Run](quick_run.md) +- [Inference](../user_guides/Inference.md)
For training on supported dataset -- [learn about configs](docs/en/user_guides/config.md) -- [prepare dataset](docs/en/get_started/prepare_dataset.md) -- [Training and testing](docs/en/user_guides/train_test.md) +- [learn about configs](../user_guides/config.md) +- [prepare dataset](../user_guides/prepare_dataset.md) +- [Training and testing](../user_guides/train_test.md)
For looking for some common issues -- [FAQs](docs/en/get_started/faq.md) -- [Useful tools](docs/en/useful_tools.md) +- [FAQs](faq.md) +- [Useful tools](../useful_tools.md)
For a general understanding about MMAction2 -- [20-minute tour to MMAction2](docs/en/get_started/20-minute_tour.md) -- [Data flow in MMAction2](docs/en/advanced_guides/dataflow.md) +- [20-minute tour to MMAction2](guide_to_framework.md) +- [Data flow in MMAction2](../advanced_guides/dataflow.md)
For advanced usage about custom training -- [Customize models](docs/en/advanced_guides/customize_models.md) -- [Customize datasets](docs/en/advanced_guides/customize_dataset.md) -- [Customize data transformation and augmentation](docs/en/advanced_guides/customize_pipeline.md) -- [Customize optimizer and scheduler](docs/en/advanced_guides/customize_optimizer.md) -- [Customize logging](docs/en/advanced_guides/customize_logging.md) +- [Customize models](../advanced_guides/customize_models.md) +- [Customize datasets](../advanced_guides/customize_dataset.md) +- [Customize data transformation and augmentation](../advanced_guides/customize_pipeline.md) +- [Customize optimizer and scheduler](../advanced_guides/customize_optimizer.md) +- [Customize logging](../advanced_guides/customize_logging.md)
For supported model zoo and dataset zoo -- [Model Zoo](model_zoo/modelzoo.md) -- [Dataset Zoo](datasetzoo.md) +- [Model Zoo](../model_zoo/modelzoo.md) +- [Dataset Zoo](../datasetzoo.md)
For migration from MMAction2 0.x -- [Migration](migration.md) +- [Migration](../migration.md)
For researchers and developers who are willing to contribute to MMAction2 -- [Contribution Guide](get_started/contribution_guide.md) +- [Contribution Guide](contribution_guide.md)
diff --git a/docs/en/index.rst b/docs/en/index.rst index 73a4590f00..ed4062534e 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -46,6 +46,12 @@ You can switch between Chinese and English documents in the lower-left corner of migration.md +.. toctree:: + :maxdepth: 1 + :caption: API Reference + + api.rst + .. toctree:: :maxdepth: 1 :caption: Model Zoo diff --git a/docs/en/merge_docs.sh b/docs/en/merge_docs.sh index 5a3c86b7ac..0d3c90ef0e 100644 --- a/docs/en/merge_docs.sh +++ b/docs/en/merge_docs.sh @@ -2,16 +2,18 @@ # gather models mkdir -p model_zoo -cat ../../configs/localization/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Localization Models' | sed 's/](\/docs\/en/](../g' |sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/localization_models.md -cat ../../configs/recognition/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Recognition Models' | sed 's/](\/docs\/en/](../g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/recognition_models.md -cat ../../configs/recognition_audio/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed 's/](\/docs\/en/](../g' | sed "s/getting_started.html##t/getting_started.html#t/g" >> model_zoo/recognition_models.md -cat ../../configs/detection/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Spatio Temporal Action Detection Models' | sed 's/](\/docs\/en/](../g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/detection_models.md -cat ../../configs/skeleton/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Skeleton-based Action Recognition Models' | sed 's/](\/docs\/en/](../g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/skeleton_models.md +cat ../../configs/localization/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Localization Models' | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/latest/=g' |sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/localization_models.md +cat ../../configs/recognition/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Recognition Models' | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/latest/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/recognition_models.md +cat ../../configs/recognition_audio/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/latest/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" >> model_zoo/recognition_models.md +cat ../../configs/detection/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Spatio Temporal Action Detection Models' | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/latest/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/detection_models.md +cat ../../configs/skeleton/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Skeleton-based Action Recognition Models' | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/latest/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/skeleton_models.md # gather projects # TODO: generate table of contents for project zoo cat ../../projects/README.md > projectzoo.md -cat ../../projects/*/README.md >> projectzoo.md +cat 
../../projects/example_project/README.md >> projectzoo.md +cat ../../projects/ctrgcn/README.md >> projectzoo.md +cat ../../projects/msg3d/README.md >> projectzoo.md # gather datasets cat supported_datasets.md > datasetzoo.md @@ -38,8 +40,9 @@ sed -i 's/(\/tools\/data\/skeleton\/README.md/(#skeleton-dataset/g' datasetzoo.m cat prepare_data.md >> datasetzoo.md + sed -i 's=](/=](https://github.com/open-mmlab/mmaction2/tree/latest/=g' *.md + sed -i 's/](\/docs\/en\//](g' datasetzoo.md sed -i 's/](\/docs\/en\//](g' changelog.md - sed -i 's/](\/docs\/en\//](..g' ./get_stated/*.md sed -i 's/](\/docs\/en\//](..g' ./tutorials/*.md diff --git a/docs/en/migration.md b/docs/en/migration.md index 2917455f80..ea2ecac06c 100644 --- a/docs/en/migration.md +++ b/docs/en/migration.md @@ -4,10 +4,10 @@ MMAction2 1.x introduced major refactorings and modifications including some BC- ## New dependencies -MMAction2 1.x depends on the following packages. You are recommended to prepare a new clean environment and install them according to [install tutorial](./get_started.md) +MMAction2 1.x depends on the following packages. You are recommended to prepare a new clean environment and install them according to [install tutorial](./get_started/installation.md) 1. [MMEngine](https://github.com/open-mmlab/mmengine): MMEngine is a foundational library for training deep learning model introduced in OpenMMLab 2.0 architecture. -2. [MMCV](https://github.com/open-mmlab/mmcv): MMCV is a foundational library for computer vision. MMAction2 1.x requires `mmcv>=2.0.0rc0` which is more compact and efficient than `mmcv-full==1.x`. +2. [MMCV](https://github.com/open-mmlab/mmcv): MMCV is a foundational library for computer vision. MMAction2 1.x requires `mmcv>=2.0.0` which is more compact and efficient than `mmcv-full==2.0.0`. ## Configuration files diff --git a/docs/en/switch_language.md b/docs/en/switch_language.md index 0009eafa9e..80cf0dc571 100644 --- a/docs/en/switch_language.md +++ b/docs/en/switch_language.md @@ -1,3 +1,3 @@ -## English +## English ## 简体中文 diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started.md index b98358d166..51742edc72 100644 --- a/docs/zh_cn/get_started.md +++ b/docs/zh_cn/get_started.md @@ -46,7 +46,7 @@ conda install pytorch torchvision cpuonly -c pytorch ```shell pip install -U openmim -mim install mmengine 'mmcv>=2.0.0rc1' +mim install mmengine 'mmcv>=2.0.0' ``` **第二步** 安装 MMAction2。 @@ -80,7 +80,7 @@ git checkout dev-1.x 直接使用 pip 安装即可。 ```shell -pip install "mmaction2>=1.0rc0" +pip install "mmaction2>=1.0.0" ``` ## 验证安装 @@ -158,7 +158,7 @@ MMCV 包含 C++ 和 CUDA 扩展,因此其对 PyTorch 的依赖比较复杂。 例如,以下命令安装为 PyTorch 1.10.x 和 CUDA 11.3 构建的 mmcv。 ```shell -pip install 'mmcv>=2.0.0rc1' -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html +pip install 'mmcv>=2.0.0' -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html ``` ### 在 CPU 环境中安装 diff --git a/docs/zh_cn/switch_language.md b/docs/zh_cn/switch_language.md index 0009eafa9e..80cf0dc571 100644 --- a/docs/zh_cn/switch_language.md +++ b/docs/zh_cn/switch_language.md @@ -1,3 +1,3 @@ -## English +## English ## 简体中文 diff --git a/projects/ctrgcn/README.md b/projects/ctrgcn/README.md index 809af449f5..b62bee8d86 100644 --- a/projects/ctrgcn/README.md +++ b/projects/ctrgcn/README.md @@ -20,7 +20,7 @@ Graph convolutional networks (GCNs) have been widely used and achieved remarkabl ### Setup Environment -Please refer to [Get Started](https://mmaction2.readthedocs.io/en/1.x/get_started.html) to install MMAction2. 
+Please refer to [Get Started](https://mmaction2.readthedocs.io/en/latest/get_started.html) to install MMAction2. At first, add the current folder to `PYTHONPATH`, so that Python can find your code. Run command in the current directory to add it. diff --git a/projects/example_project/README.md b/projects/example_project/README.md index ef74fe9cbe..30cadfb8ed 100644 --- a/projects/example_project/README.md +++ b/projects/example_project/README.md @@ -8,7 +8,7 @@ according to your project. ### Setup Environment -Please refer to [Get Started](https://mmaction2.readthedocs.io/en/1.x/get_started.html) to install MMAction2. +Please refer to [Get Started](https://mmaction2.readthedocs.io/en/latest/get_started.html) to install MMAction2. At first, add the current folder to `PYTHONPATH`, so that Python can find your code. Run command in the current directory to add it. diff --git a/projects/msg3d/README.md b/projects/msg3d/README.md index 7c784f90aa..a46c800acc 100644 --- a/projects/msg3d/README.md +++ b/projects/msg3d/README.md @@ -20,7 +20,7 @@ Spatial-temporal graphs have been widely used by skeleton-based action recogniti ### Setup Environment -Please refer to [Get Started](https://mmaction2.readthedocs.io/en/1.x/get_started.html) to install MMAction2. +Please refer to [Get Started](https://mmaction2.readthedocs.io/en/latest/get_started.html) to install MMAction2. At first, add the current folder to `PYTHONPATH`, so that Python can find your code. Run command in the current directory to add it. diff --git a/tests/models/utils/test_gradcam.py b/tests/models/utils/test_gradcam.py index 4cc8b1b8b0..5fc4173be0 100644 --- a/tests/models/utils/test_gradcam.py +++ b/tests/models/utils/test_gradcam.py @@ -119,7 +119,7 @@ def test_r2plus1d(): recognizer = MODELS.build(config.model) recognizer.cfg = config - input_shape = (1, 3, 3, 8, 32, 32) + input_shape = (1, 3, 3, 8, 16, 16) target_layer_name = 'backbone/layer4/1/relu' _do_test_3D_models(recognizer, target_layer_name, input_shape) From 85e3492cdea209b673911948ebe932edf841f369 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Thu, 6 Apr 2023 21:56:38 +0800 Subject: [PATCH 35/36] Bump version to 1.0.0 (#2343) --- .github/ISSUE_TEMPLATE/1-bug-report.yml | 100 ++++++++++++++++++ .github/ISSUE_TEMPLATE/2-feature-request.yml | 33 ++++++ .github/ISSUE_TEMPLATE/3-documentation.yml | 23 ++++ .github/ISSUE_TEMPLATE/config.yml | 13 ++- .github/ISSUE_TEMPLATE/error-report.md | 56 ---------- .github/ISSUE_TEMPLATE/feature_request.md | 33 ------ .github/ISSUE_TEMPLATE/general_questions.md | 14 --- .../reimplementation_questions.md | 70 ------------ README.md | 9 +- README_zh-CN.md | 9 +- docs/en/conf.py | 2 +- docs/en/notes/changelog.md | 44 ++++++++ mmaction/version.py | 2 +- 13 files changed, 220 insertions(+), 188 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/1-bug-report.yml create mode 100644 .github/ISSUE_TEMPLATE/2-feature-request.yml create mode 100644 .github/ISSUE_TEMPLATE/3-documentation.yml delete mode 100644 .github/ISSUE_TEMPLATE/error-report.md delete mode 100644 .github/ISSUE_TEMPLATE/feature_request.md delete mode 100644 .github/ISSUE_TEMPLATE/general_questions.md delete mode 100644 .github/ISSUE_TEMPLATE/reimplementation_questions.md diff --git a/.github/ISSUE_TEMPLATE/1-bug-report.yml b/.github/ISSUE_TEMPLATE/1-bug-report.yml new file mode 100644 index 0000000000..809a23e3c9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/1-bug-report.yml @@ -0,0 +1,100 @@ +name: "🐞 Bug Report" +description: "Create a report 
to help us reproduce and fix the bug" +labels: Bug +title: "[Bug] " + +body: + - type: markdown + attributes: + value: | + If you have already identified the reason, we strongly appreciate you creating a new PR to fix it [here](https://github.com/open-mmlab/mmaction2/pulls)! + If this issue is about installing MMCV, please file an issue at [MMCV](https://github.com/open-mmlab/mmcv/issues/new/choose). + If you need our help, please fill in as much of the following form as you're able. + + **The less clear the description, the longer it will take to solve it.** + + - type: dropdown + id: version + attributes: + label: Branch + description: Which branch/version are you using? + options: + - master branch (0.x version, such as `v0.10.0`, or `dev` branch) + - 1.x branch (1.x version, such as `v1.0.0rc2`, or `dev-1.x` branch) + validations: + required: true + + - type: checkboxes + attributes: + label: Prerequisite + description: Please check the following items before creating a new issue. + options: + - label: I have searched [Issues](https://github.com/open-mmlab/mmaction2/issues) and [Discussions](https://github.com/open-mmlab/mmaction2/discussions) but cannot get the expected help. + required: true + - label: I have read the [documentation](https://mmaction2.readthedocs.io/en/latest/) but cannot get the expected help. + required: true + - label: The bug has not been fixed in the [latest version](https://github.com/open-mmlab/mmaction2). + required: true + + - type: textarea + attributes: + label: Environment + description: | + Please run `python mmaction2/utils/collect_env.py` to collect necessary environment information and copy-paste it here. + You may add additional information that may be helpful for locating the problem, such as + - How you installed PyTorch \[e.g., pip, conda, source\] + - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.) + validations: + required: true + + - type: textarea + id: description + validations: + required: true + attributes: + label: Describe the bug + description: | + Please provide a clear and concise description of what the bug is. + Preferably a simple and minimal code snippet is provided below, so that we can reproduce the error by running the code. + placeholder: | + A clear and concise description of what the bug is. + + - type: textarea + attributes: + label: Reproduces the problem - code sample + description: | + Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet. + Did you make any modifications on the code or config? Are you clear about what you have modified? + placeholder: | + ```python + # Sample code to reproduce the problem + ``` + + - type: textarea + attributes: + label: Reproduces the problem - command or script + description: | + What command or script did you run? + placeholder: | + ```shell + The command or script you run. + ``` + + - type: textarea + attributes: + label: Reproduces the problem - error message + description: | + Please provide the error message or logs you got, with the full traceback. + placeholder: | + ``` + The error message or logs you got, with the full traceback. + ``` + + - type: textarea + attributes: + label: Additional information + description: Tell us anything else you think we should know. + placeholder: | + 1. What's your expected result? + 2. What dataset did you use? + 3. What do you think might be the reason? 
diff --git a/.github/ISSUE_TEMPLATE/2-feature-request.yml b/.github/ISSUE_TEMPLATE/2-feature-request.yml new file mode 100644 index 0000000000..c32c477133 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/2-feature-request.yml @@ -0,0 +1,33 @@ +name: 🚀 Feature Request +description: Suggest an idea for this project +labels: [Feature] +title: "[Feature] " + +body: + - type: markdown + attributes: + value: | + We strongly appreciate you creating a PR to implement this feature [here](https://github.com/open-mmlab/mmaction2/pulls)! + If you need our help, please fill in as much of the following form as you're able. + + **The less clear the description, the longer it will take to solve it.** + + - type: textarea + attributes: + label: What is the problem this feature will solve? + placeholder: | + E.g., It is inconvenient when \[....\]. + validations: + required: true + + - type: textarea + attributes: + label: What is the feature? + validations: + required: true + + - type: textarea + attributes: + label: What alternatives have you considered? + description: | + Add any other context or screenshots about the feature request here. diff --git a/.github/ISSUE_TEMPLATE/3-documentation.yml b/.github/ISSUE_TEMPLATE/3-documentation.yml new file mode 100644 index 0000000000..f47353edd4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/3-documentation.yml @@ -0,0 +1,23 @@ +name: 📚 Documentation Issue +description: Report an issue related to https://mmaction2.readthedocs.io/en/latest/. +labels: "Documentation" +title: "[Docs] " + +body: +- type: textarea + attributes: + label: The doc issue + description: > + A clear and concise description of what content in https://mmaction2.readthedocs.io/en/latest/ is an issue. + validations: + required: true + +- type: textarea + attributes: + label: Suggest a potential alternative/fix + description: > + Tell us how we could improve the documentation in this regard. +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index a772220430..d41e7bd45f 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,9 +1,12 @@ blank_issues_enabled: false contact_links: - - name: Common Issues - url: https://mmaction2.readthedocs.io/en/latest/faq.html - about: Check if your issue already has solutions - - name: MMAction2 Documentation - url: https://mmaction2.readthedocs.io/en/latest/ + - name: 📚 MMAction2 Documentation (官方文档) + url: https://mmaction2.readthedocs.io/en/latest about: Check if your question is answered in docs + - name: 💬 Forum (寻求帮助) + url: https://github.com/open-mmlab/mmaction2/discussions + about: Ask general usage questions and discuss with other MMAction2 community members + - name: 🌐 Explore OpenMMLab (官网) + url: https://openmmlab.com/ + about: Get to know more about OpenMMLab diff --git a/.github/ISSUE_TEMPLATE/error-report.md b/.github/ISSUE_TEMPLATE/error-report.md deleted file mode 100644 index 60206eaba2..0000000000 --- a/.github/ISSUE_TEMPLATE/error-report.md +++ /dev/null @@ -1,56 +0,0 @@ ---- -name: Error report -about: Create a report to help us improve -title: '' -labels: '' -assignees: '' ---- - -Thanks for your error report and we appreciate it a lot. -If you feel we have helped you, give us a STAR! :satisfied: - -**Checklist** - -1. I have searched related issues but cannot get the expected help. -2. The bug has not been fixed in the latest version. - -**Describe the bug** - -A clear and concise description of what the bug is.
- -**Reproduction** - -- What command or script did you run? - -``` -A placeholder for the command. -``` - -- What config did you run? - -``` -A placeholder for the config. -``` - -- Did you make any modifications on the code or config? Did you understand what you have modified? -- What dataset did you use? - -**Environment** - -1. Please run `PYTHONPATH=${PWD}:$PYTHONPATH python mmaction/utils/collect_env.py` to collect necessary environment information and paste it here. -2. You may add addition that may be helpful for locating the problem, such as - -- How you installed PyTorch \[e.g., pip, conda, source\] -- Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.) - -**Error traceback** - -If applicable, paste the error traceback here. - -``` -A placeholder for traceback. -``` - -**Bug fix** - -If you have already identified the reason, you can provide the information here. If you are willing to create a PR to fix it, please also leave a comment here and that would be much appreciated! diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md deleted file mode 100644 index 81ce7f60be..0000000000 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ /dev/null @@ -1,33 +0,0 @@ ---- -name: Feature request -about: Suggest an idea for this project -title: '' -labels: '' -assignees: '' ---- - -Thanks for your feature request and we will review and plan for it when necessary. -If you feel we have helped you, give us a STAR! :satisfied: - -**Steps** - -1. Check if the feature has been requested in the [meta issue](https://github.com/open-mmlab/mmaction2/issues/19), and if so, click thumb up button. -2. Post the feature request in the [meta issue](https://github.com/open-mmlab/mmaction2/issues/19), if it is new. - -**Describe the feature** - -**Motivation** - -A clear and concise description of the motivation of the feature. - -1. Ex1. It is inconvenient when \[....\]. -2. Ex2. There is a recent paper \[....\], which is very helpful for \[....\]. - -**Related resources** - -If there is an official code released or third-party implementations, please also provide the information here, which would be very helpful. - -**Additional context** - -Add any other context or screenshots about the feature request here. -If you would like to implement the feature and create a PR, please leave a comment here and that would be much appreciated. diff --git a/.github/ISSUE_TEMPLATE/general_questions.md b/.github/ISSUE_TEMPLATE/general_questions.md deleted file mode 100644 index 5aa583cb1c..0000000000 --- a/.github/ISSUE_TEMPLATE/general_questions.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -name: General questions -about: Ask general questions to get help -title: '' -labels: '' -assignees: '' ---- - -Before raising a question, you may need to check the following listed items. - -**Checklist** - -1. I have searched related issues but cannot get the expected help. -2. I have read the [FAQ documentation](https://mmaction2.readthedocs.io/en/latest/faq.html) but cannot get the expected help. 
diff --git a/.github/ISSUE_TEMPLATE/reimplementation_questions.md b/.github/ISSUE_TEMPLATE/reimplementation_questions.md deleted file mode 100644 index 683e5d6fa4..0000000000 --- a/.github/ISSUE_TEMPLATE/reimplementation_questions.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -name: Reimplementation Questions -about: Ask about questions during model reimplementation -title: '' -labels: reimplementation -assignees: '' ---- - -If you feel we have helped you, give us a STAR! :satisfied: - -**Notice** - -There are several common situations in the reimplementation issues as below - -1. Reimplement a model in the model zoo using the provided configs. -2. Reimplement a model in the model zoo on other dataset (e.g., custom datasets). -3. Reimplement a custom model but all the components are implemented in MMAction2. -4. Reimplement a custom model with new modules implemented by yourself. - -There are several things to do for different cases as below. - -- For case 1 & 3, please follow the steps in the following sections thus we could help to quick identify the issue. -- For case 2 & 4, please understand that we are not able to do much help here because we usually do not know the full code and the users should be responsible to the code they write. -- One suggestion for case 2 & 4 is that the users should first check whether the bug lies in the self-implemented code or the original code. For example, users can first make sure that the same model runs well on supported datasets. If you still need help, please describe what you have done and what you obtain in the issue, and follow the steps in the following sections and try as clear as possible so that we can better help you. - -**Checklist** - -1. I have searched related issues but cannot get the expected help. -2. The issue has not been fixed in the latest version. - -**Describe the issue** - -A clear and concise description of what the problem you meet and what have you done. - -**Reproduction** - -- What command or script did you run? - -``` -A placeholder for the command. -``` - -- What config dir you run? - -``` -A placeholder for the config. -``` - -- Did you make any modifications on the code or config? Did you understand what you have modified? -- What dataset did you use? - -**Environment** - -1. Please run `PYTHONPATH=${PWD}:$PYTHONPATH python mmaction/utils/collect_env.py` to collect necessary environment information and paste it here. -2. You may add addition that may be helpful for locating the problem, such as - -- How you installed PyTorch \[e.g., pip, conda, source\] -- Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.) - -**Results** - -If applicable, paste the related results here, e.g., what you expect and what you get. - -``` -A placeholder for results comparison -``` - -**Issue fix** - -If you have already identified the reason, you can provide the information here. If you are willing to create a PR to fix it, please also leave a comment here and that would be much appreciated! diff --git a/README.md b/README.md index f01bf8cd5f..064d4526f5 100644 --- a/README.md +++ b/README.md @@ -75,11 +75,12 @@ English | [简体中文](/README_zh-CN.md) **The default branch has been switched to `1.x` from `master`, and we encourage users to migrate to the latest version with more supported models, stronger pre-training checkpoints and simpler coding. 
Please refer to [Migration Guide](https://mmaction2.readthedocs.io/en/latest/migration.html) for more details.** -**Release (2023.02.10)**: v1.0.0rc3 with the following new features: +**Release (2023.04.06)**: v1.0.0 with the following new features: -- Support Action Recognition model UniFormer V1(ICLR'2022), UniFormer V2(Arxiv'2022). -- Support training MViT V2(CVPR'2022), and MaskFeat(CVPR'2022) fine-tuning. -- Add a new handy interface for inference MMAction2 models ([demo](https://github.com/open-mmlab/mmaction2/blob/dev-1.x/demo/README.md#inferencer)) +- Support RGB-PoseC3D(CVPR'2022). +- Support training UniFormer V2(Arxiv'2022). +- Support MSG3D(CVPR'2020) and CTRGCN(CVPR'2021) in projects. +- Refactor and provide more user-friendly documentation. ## 📖 Introduction [🔝](#-table-of-contents) diff --git a/README_zh-CN.md b/README_zh-CN.md index 5b03609b9d..493d2da15e 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -91,11 +91,12 @@ MMAction2 是一款基于 PyTorch 的视频理解开源工具箱,是 [OpenMMLa ## 更新记录 -**v1.0.0rc3 版本 (2023.02.10)**: +**v1.0.0 版本 (2023.04.06)**: -- 支持动作识别模型 UniFormer V1(ICLR'2022),UniFormer V2(Arxiv'2022) -- 支持训练 MViT V2(CVPR'2022)和 MaskFeat(CVPR'2022)微调 -- 为 MMAction2 模型提供统一的推理接口实现视频分析任务的快速预测 ([demo](https://github.com/open-mmlab/mmaction2/blob/dev-1.x/demo/README.md#inferencer)) +- 支持骨骼动作识别模型 RGB-PoseC3D (CVPR'2022) . +- 在 Projects 中支持 MSG3D(CVPR'2020) 和 CTRGCN(CVPR'2021). +- 支持训练 UniFormer V2(Arxiv'2022). +- 重构升级用户文档 ## 安装 diff --git a/docs/en/conf.py b/docs/en/conf.py index ba54a05953..6623d99b45 100644 --- a/docs/en/conf.py +++ b/docs/en/conf.py @@ -17,7 +17,7 @@ import pytorch_sphinx_theme -sys.path.insert(0, os.path.abspath('..')) +sys.path.insert(0, os.path.abspath('../..')) # -- Project information ----------------------------------------------------- diff --git a/docs/en/notes/changelog.md b/docs/en/notes/changelog.md index bbf5b9ffbc..b4d785bc8a 100644 --- a/docs/en/notes/changelog.md +++ b/docs/en/notes/changelog.md @@ -1,5 +1,49 @@ # Changelog +## 1.0.0 (4/6/2023) + +**Highlights** + +- Support RGB-PoseC3D(CVPR'2022). +- Support training UniFormer V2(Arxiv'2022). +- Support MSG3D(CVPR'2020) and CTRGCN(CVPR'2021) in projects. +- Refactor and provide more user-friendly documentation. + +**New Features** + +- Support RGB-PoseC3D ([2182](https://github.com/open-mmlab/mmaction2/pull/2182)) +- Support training UniFormer V2 ([2221](https://github.com/open-mmlab/mmaction2/pull/2221)) +- Support MSG3D and CTRGCN in projects. 
([2269](https://github.com/open-mmlab/mmaction2/pull/2269), [2291](https://github.com/open-mmlab/mmaction2/pull/2291)) + +**Improvements** + +- Use MMEngine to calculate FLOPs ([2300](https://github.com/open-mmlab/mmaction2/pull/2300)) +- Speed up LFB training ([2294](https://github.com/open-mmlab/mmaction2/pull/2294)) +- Support multiprocessing on AVA evaluation ([2146](https://github.com/open-mmlab/mmaction2/pull/2146)) +- Add a demo for exporting spatial-temporal detection model to ONNX ([2225](https://github.com/open-mmlab/mmaction2/pull/2225)) +- Update spatial-temporal detection related folders ([2262](https://github.com/open-mmlab/mmaction2/pull/2262)) + +**Bug Fixes** + +- Fix flip config of TSM for sth v1/v2 dataset ([#2247](https://github.com/open-mmlab/mmaction2/pull/2247)) +- Fix circle ci ([2336](https://github.com/open-mmlab/mmaction2/pull/2336), [2334](https://github.com/open-mmlab/mmaction2/pull/2334)) +- Fix accepting an unexpected argument local-rank in PyTorch 2.0 ([2320](https://github.com/open-mmlab/mmaction2/pull/2320)) +- Fix TSM config link ([2315](https://github.com/open-mmlab/mmaction2/pull/2315)) +- Fix numpy version requirement in CI ([2284](https://github.com/open-mmlab/mmaction2/pull/2284)) +- Fix NTU pose extraction script ([2246](https://github.com/open-mmlab/mmaction2/pull/2246)) +- Fix TSM-MobileNet V2 ([2332](https://github.com/open-mmlab/mmaction2/pull/2332)) +- Fix command bugs in localization tasks' README ([2244](https://github.com/open-mmlab/mmaction2/pull/2244)) +- Fix duplicate name in DecordInit and SampleAVAFrame ([2251](https://github.com/open-mmlab/mmaction2/pull/2251)) +- Fix channel order when showing video ([2308](https://github.com/open-mmlab/mmaction2/pull/2308)) +- Specify map_location to cpu when using \_load_checkpoint ([2252](https://github.com/open-mmlab/mmaction2/pull/2254)) + +**Documentation** + +- Refactor and provide more user-friendly documentation ([2341](https://github.com/open-mmlab/mmaction2/pull/2341), [2312](https://github.com/open-mmlab/mmaction2/pull/2312), [2325](https://github.com/open-mmlab/mmaction2/pull/2325)) +- Add README_zh-CN ([2252](https://github.com/open-mmlab/mmaction2/pull/2252)) +- Add social networking links ([2294](https://github.com/open-mmlab/mmaction2/pull/2294)) +- Fix sthv2 dataset annotations preparation document ([2248](https://github.com/open-mmlab/mmaction2/pull/2248)) + ## 1.0.0rc3 (2/10/2023) **Highlights** diff --git a/mmaction/version.py b/mmaction/version.py index 5a0a756926..76d189b4d2 100644 --- a/mmaction/version.py +++ b/mmaction/version.py @@ -1,6 +1,6 @@ # Copyright (c) Open-MMLab. All rights reserved. 
-__version__ = '1.0.0rc3' +__version__ = '1.0.0' def parse_version_info(version_str: str): From 7b77f29aab133f2e7c5dbc52129f1875a5263247 Mon Sep 17 00:00:00 2001 From: cir7 <33249023+cir7@users.noreply.github.com> Date: Fri, 7 Apr 2023 10:29:12 +0800 Subject: [PATCH 36/36] [CI] fix circle CI (#2351) --- .circleci/test.yml | 36 ++++++++----------- .../datasets/transforms/pose_transforms.py | 2 +- 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/.circleci/test.yml b/.circleci/test.yml index 8ead3de0a8..efa9342303 100644 --- a/.circleci/test.yml +++ b/.circleci/test.yml @@ -98,10 +98,10 @@ jobs: type: string cuda: type: enum - enum: ["10.1", "10.2", "11.1"] + enum: ["11.0"] cudnn: type: integer - default: 7 + default: 8 machine: image: ubuntu-2004-cuda-11.4:202110-01 # docker_layer_caching: true @@ -114,9 +114,15 @@ jobs: docker build .circleci/docker -t mmaction:gpu --build-arg PYTORCH=<< parameters.torch >> --build-arg CUDA=<< parameters.cuda >> --build-arg CUDNN=<< parameters.cudnn >> docker run --gpus all -t -d -v /home/circleci/project:/mmaction -w /mmaction --name mmaction mmaction:gpu docker exec mmaction apt-get update + docker exec mmaction pip install "numpy==1.23" docker exec mmaction apt-get upgrade -y docker exec mmaction apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libturbojpeg pkg-config docker exec mmaction apt-get install -y libavdevice-dev libavfilter-dev libopus-dev libvpx-dev libsrtp2-dev libsndfile1 + - run: + name: Install PytorchVideo and timm + command: | + docker exec mmaction pip install timm + docker exec mmaction python -m pip install pytorchvideo - run: name: Install mmaction dependencies command: | @@ -126,21 +132,6 @@ jobs: docker exec mmaction pip install git+https://git@github.com/open-mmlab/mmdetection.git@dev-3.x docker exec mmaction pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x docker exec mmaction pip install -r requirements.txt - - when: - condition: - equal: [ "1.8.1", << parameters.torch >> ] - steps: - - run: docker exec mmaction pip install timm - - when: - condition: - equal: [ "1.6.0", << parameters.torch >> ] - steps: - - run: docker exec mmaction pip install timm==0.6.7 - - when: - condition: - equal: [ "10.2", << parameters.cuda >> ] - steps: - - run: docker exec mmaction python -m pip install pytorchvideo - run: name: Build and install command: | @@ -159,7 +150,7 @@ workflows: branches: ignore: - dev-1.x - - 1.x + - main pr_stage_test: when: not: @@ -171,7 +162,7 @@ workflows: branches: ignore: - dev-1.x - - 1.x + - main - build_cpu: name: minimum_version_cpu torch: 1.6.0 @@ -195,7 +186,7 @@ workflows: torch: 1.8.1 # Use double quotation mark to explicitly specify its type # as string instead of number - cuda: "10.2" + cuda: "11.0" requires: - hold merge_stage_test: @@ -205,11 +196,12 @@ workflows: jobs: - build_cuda: name: minimum_version_gpu - torch: 1.6.0 + torch: 1.7.1 # Use double quotation mark to explicitly specify its type # as string instead of number - cuda: "10.1" + cuda: "11.0" filters: branches: only: - dev-1.x + - main diff --git a/mmaction/datasets/transforms/pose_transforms.py b/mmaction/datasets/transforms/pose_transforms.py index 76e09dacd8..0abb987551 100644 --- a/mmaction/datasets/transforms/pose_transforms.py +++ b/mmaction/datasets/transforms/pose_transforms.py @@ -1348,7 +1348,7 @@ def transform(self, results: Dict) -> Dict: else: inds = self._get_train_clips(num_frames, clip_len) inds = np.mod(inds, num_frames) - 
results[f'{modality}_inds'] = inds.astype(np.int) + results[f'{modality}_inds'] = inds.astype(np.int32) modalities.append(modality) results['clip_len'] = self.clip_len results['frame_interval'] = None
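The last hunk swaps `inds.astype(np.int)` for `inds.astype(np.int32)`. `np.int` was only an alias for Python's built-in `int`; NumPy deprecated it in 1.20 and removed it in 1.24, so the old cast raises `AttributeError` on current NumPy (presumably also the reason the CI job above pins `numpy==1.23`). Below is a self-contained sketch of the behaviour the fixed line relies on; the index values are made up for illustration and do not come from the patch.

```python
import numpy as np

# Illustrative frame indices, similar in shape to what the sampling transform builds.
num_frames, clip_len = 48, 16
inds = np.mod(np.arange(clip_len) * 3, num_frames)

# `np.int` was removed in NumPy 1.24, so `inds.astype(np.int)` now raises
# AttributeError.  An explicit fixed-width dtype behaves the same on every
# NumPy version, which is what the patched line switches to.
clip_inds = inds.astype(np.int32)

print(clip_inds.dtype)  # int32
print(clip_inds[:5])    # [ 0  3  6  9 12]
```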