diff --git a/configs/_base_/models/tin_r50.py b/configs/_base_/models/tin_r50.py index 4a0dbbf6a0..cc27704f07 100644 --- a/configs/_base_/models/tin_r50.py +++ b/configs/_base_/models/tin_r50.py @@ -1,7 +1,9 @@ # model settings preprocess_cfg = dict( - mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], format_shape='NCHW') + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + format_shape='NCHW') model = dict( type='Recognizer2D', diff --git a/configs/recognition/tin/README.md b/configs/recognition/tin/README.md index 1e3db9145b..a250388fc3 100644 --- a/configs/recognition/tin/README.md +++ b/configs/recognition/tin/README.md @@ -34,7 +34,7 @@ For a long time, the vision community tries to learn the spatio-temporal represe | frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | inference time(video/s) | gpu_mem(M) | config | ckpt | log | | :---------------------: | :------------: | :--: | :------: | :-------------: | :------: | :------: | :--------------: | :---------------------: | :--------: | :-----------------------: | :---------------------: | :---------------------: | -| 1x1x8 | short-side 256 | 8x4 | ResNet50 | TSM-Kinetics400 | 71.77 | 90.36 | 8 clips x 1 crop | x | 6185 | [config](/configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb_20220913-7f10d0c0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.log) | +| 1x1x8 | short-side 256 | 8x4 | ResNet50 | TSM-Kinetics400 | 71.86 | 90.44 | 8 clips x 1 crop | x | 6185 | [config](/configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb_20220913-7f10d0c0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.log) | Here, we use `finetune` to indicate that we use [TSM model](https://download.openmmlab.com/mmaction/v1.0/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb_20220831-64d69186.pth) trained on Kinetics-400 to finetune the TIN model on Kinetics-400. 
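The `tin_r50.py` change above swaps the generic `[127.5, 127.5, 127.5]` normalization for the standard ImageNet RGB statistics. A minimal sketch of what that per-channel normalization does to an `NCHW` batch (plain PyTorch, not the actual mmaction `ActionDataPreprocessor` internals):

```python
import torch

# ImageNet RGB statistics from the updated preprocess_cfg
mean = torch.tensor([123.675, 116.28, 103.53]).view(1, 3, 1, 1)
std = torch.tensor([58.395, 57.12, 57.375]).view(1, 3, 1, 1)

def normalize_nchw(frames: torch.Tensor) -> torch.Tensor:
    """Normalize a float NCHW batch of 0-255 RGB frames per channel."""
    return (frames - mean) / std

# toy example: a batch of two 224x224 RGB frames
batch = torch.rand(2, 3, 224, 224) * 255.0
out = normalize_nchw(batch)
print(out.shape, out.mean().item())
```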
diff --git a/configs/recognition/tin/metafile.yml b/configs/recognition/tin/metafile.yml index 6f69c73fda..7954bd90b3 100644 --- a/configs/recognition/tin/metafile.yml +++ b/configs/recognition/tin/metafile.yml @@ -66,8 +66,8 @@ Models: Results: - Dataset: Kinetics-400 Metrics: - Top 1 Accuracy: 71.77 - Top 5 Accuracy: 90.36 + Top 1 Accuracy: 71.86 + Top 5 Accuracy: 90.44 Task: Action Recognition Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb.log Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tin/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb/tin_kinetics400-pretrained-tsm-r50_1x1x8-50e_kinetics400-rgb_20220913-7f10d0c0.pth diff --git a/dataset-index.yml b/dataset-index.yml index 57f2f4f62a..2637f88d4b 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -1,39 +1,40 @@ +openxlab: true kinetics400: - dataset: Kinetics-400 + dataset: OpenMMLab/Kinetics-400 download_root: data data_root: data/kinetics400 - script: tools/data/kinetics/k400_preprocess.sh + script: tools/data/kinetics/preprocess_k400.sh kinetics600: - dataset: Kinetics600 + dataset: OpenMMLab/Kinetics600 download_root: data data_root: data/kinetics600 - script: tools/data/kinetics/k600_preprocess.sh + script: tools/data/kinetics/preprocess_k600.sh kinetics700: - dataset: Kinetics_700 + dataset: OpenMMLab/Kinetics_700 download_root: data data_root: data/kinetics700 - script: tools/data/kinetics/k700_preprocess.sh + script: tools/data/kinetics/preprocess_k700.sh sthv2: - dataset: sthv2 + dataset: OpenDataLab/sthv2 download_root: data data_root: data/sthv2 script: tools/data/sthv2/preprocess.sh ucf-101: - dataset: UCF101 + dataset: OpenDataLab/UCF101 download_root: data data_root: data/ucf101 finegym: - dataset: FineGym + dataset: OpenDataLab/FineGym download_root: data data_root: data/gym diving48: - dataset: diving48 + dataset: OpenDataLab/diving48 download_root: data data_root: data/diving48 script: tools/data/diving48/preprocess.sh diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py new file mode 100644 index 0000000000..1dfd9f976c --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py @@ -0,0 +1,178 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
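The `dataset-index.yml` rewrite above prefixes each dataset with its OpenXLab / OpenDataLab namespace and renames the Kinetics preprocess scripts. A hedged sketch of how such an index could be read with PyYAML; the `load_dataset_entry` helper is purely illustrative and is not the actual MIM/mmaction download tooling:

```python
import yaml  # PyYAML

def load_dataset_entry(index_path: str, name: str) -> dict:
    """Return the download/preprocess settings for one dataset key."""
    with open(index_path) as f:
        index = yaml.safe_load(f)
    entry = index[name]
    return {
        'dataset': entry['dataset'],              # e.g. 'OpenMMLab/Kinetics-400'
        'download_root': entry['download_root'],  # where archives are fetched
        'data_root': entry['data_root'],          # where extracted data lives
        'script': entry.get('script'),            # optional preprocess script
    }

if __name__ == '__main__':
    cfg = load_dataset_entry('dataset-index.yml', 'kinetics400')
    print(cfg)
```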
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-203d6aac.pth', # noqa: E501 + prefix='backbone.')), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=339, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/mit/videos/training' +data_root_val = 'data/mit/videos/validation' +ann_file_train = 'data/mit/mit_train_list_videos.txt' +ann_file_val = 'data/mit/mit_val_list_videos.txt' +ann_file_test = 'data/mit/mit_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict( + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + 
sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=24, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +base_lr = 2e-5 +optim_wrapper = dict( + optimizer=dict( + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1 / 20, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min_ratio=1 / 20, + by_epoch=True, + begin=5, + end=24, + convert_to_iter_based=True) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=512) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py new file mode 100644 index 0000000000..5b57aacfc6 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
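The Moments in Time config above warms the learning rate up linearly from `base_lr / 20` over the first 5 epochs and then decays it with a cosine schedule down to the same floor by epoch 24. A rough plain-PyTorch approximation of that shape (per-epoch stepping here, whereas the config converts both schedulers to iteration-based stepping; PyTorch's `CosineAnnealingLR` takes an absolute `eta_min`, not `eta_min_ratio`):

```python
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR

base_lr = 2e-5
model = torch.nn.Linear(8, 8)
optimizer = AdamW(model.parameters(), lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05)

# 5 warmup epochs from base_lr/20 to base_lr, then cosine decay to base_lr/20 over 19 epochs
warmup = LinearLR(optimizer, start_factor=1 / 20, total_iters=5)
cosine = CosineAnnealingLR(optimizer, T_max=19, eta_min=base_lr / 20)
scheduler = SequentialLR(optimizer, schedulers=[warmup, cosine], milestones=[5])

for epoch in range(24):
    optimizer.step()      # placeholder for one training epoch
    scheduler.step()
    print(epoch, scheduler.get_last_lr()[0])
```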
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='backbone.')), + cls_head=dict( + type=UniFormerHead, + dropout_ratio=0.5, + num_classes=400, + in_channels=768, + average_clips='prob', + channel_map= # noqa: E251 + 'configs/recognition/uniformerv2/k710_channel_map/map_k400.json', + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='cls_head.')), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict( + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + 
dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +base_lr = 2e-6 +optim_wrapper = dict( + optimizer=dict( + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.5, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=4, + eta_min_ratio=0.5, + by_epoch=True, + begin=1, + end=5, + convert_to_iter_based=True) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py new file mode 100644 index 0000000000..4616065b4c --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
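The Kinetics-400 config above initializes `UniFormerHead` from a Kinetics-710 checkpoint and passes `channel_map='.../map_k400.json'`. A hedged sketch of the idea, assuming the JSON holds, for each K400 class, its index in the 710-way classifier (an assumption about the mapping file, not the actual `UniFormerHead` code):

```python
import json
import torch

def remap_classifier(fc_weight: torch.Tensor, fc_bias: torch.Tensor,
                     channel_map_path: str):
    """Select the K400 rows of a 710-way classifier via a JSON index list."""
    with open(channel_map_path) as f:
        indices = torch.tensor(json.load(f), dtype=torch.long)
    return fc_weight[indices], fc_bias[indices]

# toy 710-way head and a fake identity-like map (the real one is map_k400.json)
w, b = torch.randn(710, 768), torch.randn(710)
with open('/tmp/map_k400.json', 'w') as f:
    json.dump(list(range(400)), f)

w400, b400 = remap_classifier(w, b, '/tmp/map_k400.json')
print(w400.shape, b400.shape)  # torch.Size([400, 768]) torch.Size([400])
```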
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='backbone.')), + cls_head=dict( + type=UniFormerHead, + dropout_ratio=0.5, + num_classes=600, + in_channels=768, + average_clips='prob', + channel_map= # noqa: E251 + 'configs/recognition/uniformerv2/k710_channel_map/map_k600.json', + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='cls_head.')), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics600/videos_train' +data_root_val = 'data/kinetics600/videos_val' +ann_file_train = 'data/kinetics600/kinetics600_train_list_videos.txt' +ann_file_val = 'data/kinetics600/kinetics600_val_list_videos.txt' +ann_file_test = 'data/kinetics600/kinetics600_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict( + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + 
dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +base_lr = 2e-6 +optim_wrapper = dict( + optimizer=dict( + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.5, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=4, + eta_min_ratio=0.5, + by_epoch=True, + begin=1, + end=5, + convert_to_iter_based=True) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py new file mode 100644 index 0000000000..32e1bc72e9 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
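Like the other fine-tuning configs, the Kinetics-600 config above enables `auto_scale_lr` with `base_batch_size=256`. The flag implies the usual linear scaling rule; the real scaling happens inside MMEngine's Runner, so the helper below is only the arithmetic:

```python
def scale_lr(base_lr: float, base_batch_size: int,
             num_gpus: int, samples_per_gpu: int) -> float:
    """Linear LR scaling: lr grows in proportion to the effective batch size."""
    effective_batch = num_gpus * samples_per_gpu
    return base_lr * effective_batch / base_batch_size

# e.g. doubling the GPU count doubles the effective batch size and the LR
print(scale_lr(2e-6, 256, 8, 32))   # 2e-06 (reference setup)
print(scale_lr(2e-6, 256, 16, 32))  # 4e-06
```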
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=False, + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='backbone.')), + cls_head=dict( + type=UniFormerHead, + dropout_ratio=0.5, + num_classes=700, + in_channels=768, + average_clips='prob', + channel_map= # noqa: E251 + 'configs/recognition/uniformerv2/k710_channel_map/map_k700.json', + init_cfg=dict( + type='Pretrained', + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501 + prefix='cls_head.')), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics700/videos_train' +data_root_val = 'data/kinetics700/videos_val' +ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt' +ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt' +ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict( + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + 
dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=5, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +base_lr = 2e-6 +optim_wrapper = dict( + optimizer=dict( + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.5, + by_epoch=True, + begin=0, + end=1, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=4, + eta_min_ratio=0.5, + by_epoch=True, + begin=1, + end=5, + convert_to_iter_based=True) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py new file mode 100644 index 0000000000..84e6f6729f --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
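The test pipelines in these configs sample 4 uniform clips and apply `ThreeCrop`, and the heads set `average_clips='prob'`, i.e. per-view softmax scores are averaged into one video-level prediction. A minimal plain-PyTorch sketch of that averaging (not the actual mmaction head code):

```python
import torch
import torch.nn.functional as F

def average_clip_probs(clip_logits: torch.Tensor) -> torch.Tensor:
    """Average per-view class probabilities into a video-level score.

    clip_logits: (num_clips * num_crops, num_classes) raw logits, e.g.
    4 clips x 3 crops = 12 views per video under the test pipeline above.
    """
    probs = F.softmax(clip_logits, dim=-1)   # 'prob' averaging: softmax first
    return probs.mean(dim=0)                 # (num_classes,)

logits = torch.randn(12, 700)                # 12 views, 700 Kinetics-700 classes
video_score = average_clip_probs(logits)
print(video_score.shape, video_score.sum().item())  # sums to ~1.0
```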
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=710, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py new file mode 100644 index 0000000000..6db31b373e --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py @@ -0,0 +1,175 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
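The pipelines in these configs pick frames with `UniformSample(clip_len=8/16/32)`, which spreads the sampled indices evenly over the whole video rather than taking a dense contiguous clip. A rough sketch of that behaviour (centre-of-segment in test mode, jittered offsets in training); this approximates the idea and is not the exact `UniformSample` implementation:

```python
import numpy as np

def uniform_sample(total_frames, clip_len, test_mode=False, seed=None):
    """Spread `clip_len` frame indices evenly over a video of `total_frames` frames."""
    rng = np.random.default_rng(seed)
    edges = np.linspace(0, total_frames, clip_len + 1)
    if test_mode:
        offsets = (edges[:-1] + edges[1:]) / 2.0          # segment centres
    else:
        offsets = edges[:-1] + rng.random(clip_len) * (edges[1:] - edges[:-1])
    return np.clip(offsets.astype(np.int64), 0, total_frames - 1)

print(uniform_sample(300, 8, test_mode=True))  # [ 18  56  93 131 168 206 243 281]
```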
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=True, + pretrained='ViT-B/16'), + cls_head=dict( + type=UniFormerHead, + dropout_ratio=0.5, + num_classes=400, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_train' +data_root_val = 'data/kinetics400/videos_val' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict( + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + 
batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +base_lr = 1e-5 +optim_wrapper = dict( + optimizer=dict( + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.1, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=50, + eta_min_ratio=0.1, + by_epoch=True, + begin=5, + end=55, + convert_to_iter_based=True) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py new file mode 100644 index 0000000000..6b8cb00c13 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py @@ -0,0 +1,175 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
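The optimizer wrappers above set `paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0)`, i.e. normalization parameters and biases are excluded from weight decay. A hedged plain-PyTorch sketch of building equivalent AdamW parameter groups (an approximation of what MMEngine's `paramwise_cfg` does, not its implementation):

```python
import torch
from torch import nn
from torch.optim import AdamW

def build_param_groups(model: nn.Module, weight_decay: float = 0.05):
    """Two groups: decayed weights vs. un-decayed norm parameters and biases."""
    decay, no_decay = [], []
    norm_types = (nn.LayerNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)
    for module in model.modules():
        for name, param in module.named_parameters(recurse=False):
            if not param.requires_grad:
                continue
            if isinstance(module, norm_types) or name.endswith('bias'):
                no_decay.append(param)
            else:
                decay.append(param)
    return [
        {'params': decay, 'weight_decay': weight_decay},
        {'params': no_decay, 'weight_decay': 0.0},
    ]

model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.Linear(8, 4))
optimizer = AdamW(build_param_groups(model), lr=1e-5, betas=(0.9, 0.999))
print([len(g['params']) for g in optimizer.param_groups])  # [2, 4]
```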
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=True, + pretrained='ViT-B/16'), + cls_head=dict( + type=UniFormerHead, + dropout_ratio=0.5, + num_classes=700, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root = 'data/kinetics700/videos_train' +data_root_val = 'data/kinetics700/videos_val' +ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt' +ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt' +ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt' + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict( + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_train, + data_prefix=dict(video=data_root), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_val, + data_prefix=dict(video=data_root_val), + pipeline=val_pipeline, + test_mode=True)) +test_dataloader = dict( + 
batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True)) + +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +base_lr = 1e-5 +optim_wrapper = dict( + optimizer=dict( + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.1, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=50, + eta_min_ratio=0.1, + by_epoch=True, + begin=5, + end=55, + convert_to_iter_based=True) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py new file mode 100644 index 0000000000..72527a79be --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics710-rgb.py @@ -0,0 +1,220 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
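All of these configs clip gradients with `clip_grad=dict(max_norm=20, norm_type=2)`. In plain PyTorch the equivalent step is a single `clip_grad_norm_` call between `backward()` and `optimizer.step()`; a minimal sketch on a dummy model:

```python
import torch
from torch import nn
from torch.optim import AdamW

model = nn.Linear(16, 4)
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.05)
criterion = nn.CrossEntropyLoss()

x = torch.randn(8, 16)
target = torch.randint(0, 4, (8,))

optimizer.zero_grad()
loss = criterion(model(x), target)
loss.backward()
# clip_grad=dict(max_norm=20, norm_type=2) in the config maps to this call
total_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm=20, norm_type=2)
optimizer.step()
print(float(loss), float(total_norm))
```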
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import ConcatDataset, DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, ConcatDataset, DecordDecode, + DecordInit, Flip, FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=16, + width=768, + layers=12, + heads=12, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[8, 9, 10, 11], + n_layers=4, + n_dim=768, + n_head=12, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5], + clip_pretrained=True, + pretrained='ViT-B/16'), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=710, + in_channels=768, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +file_client_args = dict(io_backend='disk') +train_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 256)), + dict( + type=PytorchVideoWrapper, op='RandAugment', magnitude=7, num_layers=4), + dict(type=RandomResizedCrop), + dict(type=Resize, scale=(224, 224), keep_ratio=False), + dict(type=Flip, flip_ratio=0.5), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +val_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=1, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=CenterCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_pipeline = [ + dict(type=DecordInit, **file_client_args), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +# dataset settings +k400_data_root = 'data/kinetics400/videos_train' +k600_data_root = 'data/kinetics600/videos' +k700_data_root = 'data/kinetics700/videos' +k400_data_root_val = 'data/kinetics400/videos_val' +k600_data_root_val = k600_data_root +k700_data_root_val = k700_data_root + +k400_ann_file_train = 'data/kinetics710/k400_train_list_videos.txt' +k600_ann_file_train = 'data/kinetics710/k600_train_list_videos.txt' +k700_ann_file_train = 'data/kinetics710/k700_train_list_videos.txt' + +k400_ann_file_val = 'data/kinetics710/k400_val_list_videos.txt' +k600_ann_file_val = 'data/kinetics710/k600_val_list_videos.txt' +k700_ann_file_val = 'data/kinetics710/k700_val_list_videos.txt' + +k400_trainset = dict( + type=VideoDataset, + ann_file=k400_ann_file_train, + data_prefix=dict(video=k400_data_root), + pipeline=train_pipeline) +k600_trainset = dict( + type=VideoDataset, + 
ann_file=k600_ann_file_train, + data_prefix=dict(video=k600_data_root), + pipeline=train_pipeline) +k700_trainset = dict( + type=VideoDataset, + ann_file=k700_ann_file_train, + data_prefix=dict(video=k700_data_root), + pipeline=train_pipeline) + +k400_valset = dict( + type=VideoDataset, + ann_file=k400_ann_file_val, + data_prefix=dict(video=k400_data_root_val), + pipeline=val_pipeline, + test_mode=True) +k600_valset = dict( + type=VideoDataset, + ann_file=k600_ann_file_val, + data_prefix=dict(video=k600_data_root_val), + pipeline=val_pipeline, + test_mode=True) +k700_valset = dict( + type=VideoDataset, + ann_file=k700_ann_file_val, + data_prefix=dict(video=k700_data_root_val), + pipeline=val_pipeline, + test_mode=True) + +k400_testset = k400_valset.copy() +k600_testset = k600_valset.copy() +k700_testset = k700_valset.copy() +k400_testset['pipeline'] = test_pipeline +k600_testset['pipeline'] = test_pipeline +k700_testset['pipeline'] = test_pipeline + +k710_trainset = dict( + type=ConcatDataset, datasets=[k400_trainset, k600_trainset, k700_trainset]) +k710_valset = dict( + type=ConcatDataset, datasets=[k400_valset, k600_valset, k700_valset]) +k710_testset = dict( + type=ConcatDataset, + datasets=[k400_testset, k600_testset, k700_testset], +) + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=k710_trainset) +val_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=k710_valset) +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=k710_testset) + +val_evaluator = dict(type=AccMetric) +test_evaluator = dict(type=AccMetric) +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=55, val_begin=1, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +base_lr = 1e-5 +optim_wrapper = dict( + optimizer=dict( + type=AdamW, lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0), + clip_grad=dict(max_norm=20, norm_type=2)) + +param_scheduler = [ + dict( + type=LinearLR, + start_factor=0.5, + by_epoch=True, + begin=0, + end=5, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + T_max=50, + eta_min_ratio=0.5, + by_epoch=True, + begin=5, + end=55, + convert_to_iter_based=True) +] + +default_hooks.update( + dict( + checkpoint=dict(interval=3, max_keep_ckpts=5), + logger=dict(interval=100))) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (8 samples per GPU). +auto_scale_lr = dict(enable=True, base_batch_size=256) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py new file mode 100644 index 0000000000..8c53a18dee --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
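The Kinetics-710 config above trains on the concatenation of the K400, K600 and K700 subsets. A hedged sketch of the same idea with `torch.utils.data.ConcatDataset` and dummy stand-ins for `VideoDataset` (mmengine's `ConcatDataset` additionally merges dataset metainfo, which is omitted here; the labels are assumed to be pre-remapped to the shared 710-class space by the k710 annotation lists):

```python
from torch.utils.data import ConcatDataset, Dataset

class DummyVideoDataset(Dataset):
    """Stand-in for VideoDataset: each item is (fake clip id, label)."""
    def __init__(self, num_samples: int):
        self.num_samples = num_samples

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        return f'clip_{idx}', idx % 10

k400 = DummyVideoDataset(100)   # labels already remapped to K710 ids
k600 = DummyVideoDataset(80)
k700 = DummyVideoDataset(120)
k710 = ConcatDataset([k400, k600, k700])
print(len(k710), k710[0], k710[150])   # 300 samples drawn across all three
```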
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 16 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=400, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k400' +ann_file_test = 'data/k400/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py new file mode 100644 index 0000000000..84d1b295ef --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
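The large-model test configs read `val.csv` annotation lists with `delimiter=','`. A hedged sketch of parsing such a list, assuming each line is `relative/path.mp4,label` (this format and the `parse_video_annotations` helper are assumptions for illustration, not the actual `VideoDataset` parser):

```python
def parse_video_annotations(ann_file: str, delimiter: str = ','):
    """Yield (relative video path, integer label) pairs from a list file."""
    samples = []
    with open(ann_file) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            path, label = line.rsplit(delimiter, 1)
            samples.append((path, int(label)))
    return samples

# toy annotation file in the assumed "path,label" format
with open('/tmp/val.csv', 'w') as f:
    f.write('abseiling/xxx.mp4,0\nzumba/yyy.mp4,399\n')
print(parse_video_annotations('/tmp/val.csv'))
```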
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 16 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=600, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k600' +ann_file_test = 'data/k600/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py new file mode 100644 index 0000000000..b94bb75abf --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
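These test pipelines resize the short side to 224 and then apply `ThreeCrop(crop_size=224)`, producing three square views per clip along the longer side. An illustrative tensor-level version of that augmentation (not the actual mmaction transform):

```python
import torch

def three_crop(frames: torch.Tensor, crop_size: int) -> torch.Tensor:
    """Crop left/centre/right (or top/middle/bottom) squares from NCTHW frames.

    Assumes the short side already equals `crop_size`, as after
    Resize(scale=(-1, 224)) in the pipeline above.
    """
    h, w = frames.shape[-2:]
    crops = []
    if w >= h:  # landscape: slide along the width
        for x in (0, (w - crop_size) // 2, w - crop_size):
            crops.append(frames[..., :, x:x + crop_size])
    else:       # portrait: slide along the height
        for y in (0, (h - crop_size) // 2, h - crop_size):
            crops.append(frames[..., y:y + crop_size, :])
    return torch.stack(crops)  # (3, N, C, T, crop, crop)

clips = torch.rand(1, 3, 16, 224, 398)   # one clip, short side already 224
print(three_crop(clips, 224).shape)       # torch.Size([3, 1, 3, 16, 224, 224])
```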
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 16 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=700, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k700' +ann_file_test = 'data/k700/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py new file mode 100644 index 0000000000..f1b8def59a --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 32 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=400, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k400' +ann_file_test = 'data/k400/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py new file mode 100644 index 0000000000..c6e16ef759 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 32 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=600, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k600' +ann_file_test = 'data/k600/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py new file mode 100644 index 0000000000..e715fca14f --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 32 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=700, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k700' +ann_file_test = 'data/k700/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=16, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py new file mode 100644 index 0000000000..6391e01825 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=400, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k400' +ann_file_test = 'data/k400/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py new file mode 100644 index 0000000000..dec1a65b6b --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=600, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k600' +ann_file_test = 'data/k600/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py new file mode 100644 index 0000000000..8bc6cb4407 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=700, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k700' +ann_file_test = 'data/k700/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py new file mode 100644 index 0000000000..c85b802da4 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=710, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py new file mode 100644 index 0000000000..373fe9f3bf --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 32 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=400, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k400' +ann_file_test = 'data/k400/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=2, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 336)), + dict(type=ThreeCrop, crop_size=336), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=4, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py new file mode 100644 index 0000000000..3f1964d2c7 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 32 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=600, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k600' +ann_file_test = 'data/k600/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=2, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 336)), + dict(type=ThreeCrop, crop_size=336), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=4, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py new file mode 100644 index 0000000000..0ef24778f9 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 32 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=700, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/k700' +ann_file_test = 'data/k700/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=2, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 336)), + dict(type=ThreeCrop, crop_size=336), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=4, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=',')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py new file mode 100644 index 0000000000..798a215bd1 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 32 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=710, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py new file mode 100644 index 0000000000..2687bec030 --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=224, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=339, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/mit_v1' +ann_file_test = 'data/mit_v1/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 224)), + dict(type=ThreeCrop, crop_size=224), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=32, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=' ')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py new file mode 100644 index 0000000000..bddc27e89a --- /dev/null +++ b/mmaction/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.config import read_base + +with read_base(): + from ..._base_.default_runtime import * + +from mmengine.dataset import DefaultSampler +from mmengine.optim import CosineAnnealingLR, LinearLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import AdamW + +from mmaction.datasets import (CenterCrop, DecordDecode, DecordInit, Flip, + FormatShape, PackActionInputs, + PytorchVideoWrapper, RandomResizedCrop, Resize, + ThreeCrop, UniformSample, VideoDataset) +from mmaction.evaluation import AccMetric +from mmaction.models import (ActionDataPreprocessor, Recognizer3D, + TimeSformerHead, UniFormerHead, UniFormerV2) + +# model settings +num_frames = 8 +model = dict( + type=Recognizer3D, + backbone=dict( + type=UniFormerV2, + input_resolution=336, + patch_size=14, + width=1024, + layers=24, + heads=16, + t_size=num_frames, + dw_reduction=1.5, + backbone_drop_path_rate=0., + temporal_downsample=False, + no_lmhra=True, + double_lmhra=True, + return_list=[20, 21, 22, 23], + n_layers=4, + n_dim=1024, + n_head=16, + mlp_factor=4., + drop_path_rate=0., + mlp_dropout=[0.5, 0.5, 0.5, 0.5]), + cls_head=dict( + type=TimeSformerHead, + dropout_ratio=0.5, + num_classes=339, + in_channels=1024, + average_clips='prob'), + data_preprocessor=dict( + type=ActionDataPreprocessor, + mean=[114.75, 114.75, 114.75], + std=[57.375, 57.375, 57.375], + format_shape='NCTHW')) + +# dataset settings +dataset_type = 'VideoDataset' +data_root_val = 'data/mit_v1' +ann_file_test = 'data/mit_v1/val.csv' + +test_pipeline = [ + dict(type=DecordInit), + dict(type=UniformSample, clip_len=num_frames, num_clips=4, test_mode=True), + dict(type=DecordDecode), + dict(type=Resize, scale=(-1, 336)), + dict(type=ThreeCrop, crop_size=336), + dict(type=FormatShape, input_format='NCTHW'), + dict(type=PackActionInputs) +] + +test_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=VideoDataset, + ann_file=ann_file_test, + data_prefix=dict(video=data_root_val), + pipeline=test_pipeline, + test_mode=True, + delimiter=' ')) + +test_evaluator = dict(type=AccMetric) +test_cfg = dict(type=TestLoop) diff --git a/mmaction/models/backbones/resnet_tin.py b/mmaction/models/backbones/resnet_tin.py index b8ff3659f0..0958546926 100644 --- a/mmaction/models/backbones/resnet_tin.py +++ b/mmaction/models/backbones/resnet_tin.py @@ -325,6 +325,9 @@ def init_structure(self): if len(self.non_local_cfg) != 0: self.make_non_local() + def _get_wrap_prefix(self): + return ['.net2'] + def make_temporal_interlace(self): """Make temporal interlace for some layers.""" num_segment_list = [self.num_segments] * 4 @@ -365,6 +368,3 @@ def make_block_interlace(stage, num_segments, shift_div): self.shift_div) self.layer4 = make_block_interlace(self.layer4, num_segment_list[3], self.shift_div) - - def init_weights(self): - pass diff --git a/mmaction/models/backbones/resnet_tsm.py b/mmaction/models/backbones/resnet_tsm.py index 0079c96cb7..a2dcaf2939 100644 --- a/mmaction/models/backbones/resnet_tsm.py +++ b/mmaction/models/backbones/resnet_tsm.py @@ -305,6 +305,9 @@ def make_non_local(self): self.num_segments, self.non_local_cfg) + def _get_wrap_prefix(self): + return ['.net', '.block'] + def load_original_weights(self, logger): """Load weights from original checkpoint, which required converting keys.""" @@ -317,7 +320,7 @@ def load_original_weights(self, logger): for name, module in self.named_modules(): # convert torchvision keys 
ori_name = name - for wrap_prefix in ['.net', '.block']: + for wrap_prefix in self._get_wrap_prefix(): if wrap_prefix in ori_name: ori_name = ori_name.replace(wrap_prefix, '') wrapped_layers_map[ori_name] = name @@ -352,6 +355,7 @@ def load_original_weights(self, logger): if layer_name in wrapped_layers_map: wrapped_name = param_name.replace( layer_name, wrapped_layers_map[layer_name]) + logger.debug(f'wrapped_name {wrapped_name}') state_dict_torchvision[ wrapped_name] = state_dict_torchvision.pop(param_name) diff --git a/projects/gesture_recognition/README.md b/projects/gesture_recognition/README.md index 47ca04e472..f3d097c869 100644 --- a/projects/gesture_recognition/README.md +++ b/projects/gesture_recognition/README.md @@ -14,7 +14,7 @@ Hand detection results on OneHand10K validation dataset | Config | Input Size | bbox mAP | bbox mAP 50 | bbox mAP 75 | ckpt | log | | :------------------------------------------------------ | :--------: | :------: | :---------: | :---------: | :---------------------------------------------------: | :--------------------------------------------------: | -| [rtmpose_nano](/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py) | 320x320 | 0.8100 | 0.9870 | 0.9190 | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320_20230524-f6ffed6a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.log) | +| [rtmdet_nano](/projects/gesture_recognition/configs/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.py) | 320x320 | 0.8100 | 0.9870 | 0.9190 | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320_20230524-f6ffed6a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/gesture_recognition/rtmdet-nano_8xb32-300e_multi-dataset-hand-320x320.log) | ## Pose estimation stage diff --git a/tools/data/diving48/README.md b/tools/data/diving48/README.md index 02d19d5a62..0a2b68707f 100644 --- a/tools/data/diving48/README.md +++ b/tools/data/diving48/README.md @@ -21,10 +21,10 @@ For basic dataset information, you can refer to the official dataset [website](h ````{group-tab} Download by MIM MIM supports downloading from OpenDataLab and preprocessing Diving48 dataset with one command line.
```Bash -# install OpenDataLab CLI tools -pip install -U opendatalab -# log in OpenDataLab -odl login +# install OpenXlab CLI tools +pip install -U openxlab +# log in OpenXLab +openxlab login # download and preprocess by MIM mim download mmaction2 --dataset diving48 ``` diff --git a/tools/data/diving48/README_zh-CN.md b/tools/data/diving48/README_zh-CN.md index 825344039e..f6ebca9f2c 100644 --- a/tools/data/diving48/README_zh-CN.md +++ b/tools/data/diving48/README_zh-CN.md @@ -21,10 +21,10 @@ ````{group-tab} 使用 MIM 下载 # MIM 支持下载 Diving48 数据集。用户可以通过一行命令,从 OpenDataLab 进行下载,并进行预处理。 ```Bash -# 安装 OpenDataLab CLI 工具 -pip install -U opendatalab -# 登录 OpenDataLab -odl login +# 安装 OpenXLab CLI 工具 +pip install -U openxlab +# 登录 OpenXLab +openxlab login # 通过 MIM 进行数据集下载,预处理。注意这将花费较长时间 mim download mmaction2 --dataset diving48 ``` diff --git a/tools/data/diving48/preprocess.sh b/tools/data/diving48/preprocess.sh index 10d9e42044..ddd9a1de48 100644 --- a/tools/data/diving48/preprocess.sh +++ b/tools/data/diving48/preprocess.sh @@ -3,6 +3,6 @@ DOWNLOAD_DIR=$1 DATA_ROOT=$2 -cat $DOWNLOAD_DIR/diving48/raw/*.tar.gz.* | tar -xvz -C $(dirname $DATA_ROOT) +cat $DOWNLOAD_DIR/OpenDataLab___diving48/raw/*.tar.gz.* | tar -xvz -C $(dirname $DATA_ROOT) tar -xvf $DATA_ROOT/diving48.tar -C $(dirname $DATA_ROOT) rm $DATA_ROOT/diving48.tar diff --git a/tools/data/kinetics/README.md b/tools/data/kinetics/README.md index 257c650eba..bd96a5f364 100644 --- a/tools/data/kinetics/README.md +++ b/tools/data/kinetics/README.md @@ -38,10 +38,10 @@ All experiments on Kinetics in MMAction2 are based on this version, we recommend MIM supports downloading from OpenDataLab and preprocessing Kinetics-400/600/700 dataset with one command line. ```Bash -# install OpenDataLab CLI tools -pip install -U opendatalab -# log in OpenDataLab -odl login +# install OpenXlab CLI tools +pip install -U openxlab +# log in OpenXLab +openxlab login # download and preprocess Kinetics-400 by MIM. Note that this might take a long time. mim download mmaction2 --dataset kinetics400 # download and preprocess Kinetics-600 by MIM. Note that this might take a long time. 
diff --git a/tools/data/kinetics/README_zh-CN.md b/tools/data/kinetics/README_zh-CN.md index a0d2e858e1..6047617985 100644 --- a/tools/data/kinetics/README_zh-CN.md +++ b/tools/data/kinetics/README_zh-CN.md @@ -36,10 +36,10 @@ MMAction2 代码仓库中提供的 Kinetics 实验性能,都是基于这个版 # MIM 支持下载 Kinetics-400/600/700 数据集。用户可以通过一行命令,从 OpenDataLab 进行下载,并进行预处理。 ```Bash -# 安装 OpenDataLab CLI 工具 -pip install -U opendatalab -# 登录 OpenDataLab -odl login +# 安装 OpenXLab CLI 工具 +pip install -U openxlab +# 登录 OpenXLab +openxlab login # 通过 MIM 进行 Kinetics-400 数据集下载,预处理。注意这将花费较长时间 mim download mmaction2 --dataset kinetics400 # 通过 MIM 进行 Kinetics-600 数据集下载,预处理。注意这将花费较长时间 diff --git a/tools/data/kinetics/preprocess_k400.sh b/tools/data/kinetics/preprocess_k400.sh index 9f07885095..d48c166375 100644 --- a/tools/data/kinetics/preprocess_k400.sh +++ b/tools/data/kinetics/preprocess_k400.sh @@ -5,5 +5,5 @@ set -x DOWNLOAD_DIR=$1 DATA_ROOT=$2 -cat $DOWNLOAD_DIR/Kinetics-400/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT) +cat $DOWNLOAD_DIR/OpenMMLab___Kinetics-400/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT) mv $(dirname $DATA_ROOT)/Kinetics-400 $DATA_ROOT diff --git a/tools/data/kinetics/preprocess_k600.sh b/tools/data/kinetics/preprocess_k600.sh index 438297a620..e6264909cb 100644 --- a/tools/data/kinetics/preprocess_k600.sh +++ b/tools/data/kinetics/preprocess_k600.sh @@ -5,5 +5,5 @@ set -x DOWNLOAD_DIR=$1 DATA_ROOT=$2 -cat $DOWNLOAD_DIR/Kinetics600/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT) +cat $DOWNLOAD_DIR/OpenMMLab___Kinetics600/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT) mv $(dirname $DATA_ROOT)/Kinetics600 $DATA_ROOT diff --git a/tools/data/kinetics/preprocess_k700.sh b/tools/data/kinetics/preprocess_k700.sh index 930bf8577a..8e48cdcd47 100644 --- a/tools/data/kinetics/preprocess_k700.sh +++ b/tools/data/kinetics/preprocess_k700.sh @@ -5,5 +5,5 @@ set -x DOWNLOAD_DIR=$1 DATA_ROOT=$2 -cat $DOWNLOAD_DIR/Kinetics_700/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT) +cat $DOWNLOAD_DIR/OpenMMLab___Kinetics_700/raw/*.tar.gz* | tar -xvz -C $(dirname $DATA_ROOT) mv $(dirname $DATA_ROOT)/Kinetics_700 $DATA_ROOT diff --git a/tools/data/kinetics710/README.md b/tools/data/kinetics710/README.md index 76a239fbf5..4dd52b393f 100644 --- a/tools/data/kinetics710/README.md +++ b/tools/data/kinetics710/README.md @@ -25,10 +25,10 @@ Before we start, please make sure that the directory is located at `$MMACTION2`. Kinetics-710 is a video benchmark based on Kinetics-400/600/700, which merges the training set of these Kinetics datasets, and deletes the repeated videos according to Youtube IDs. MMAction2 provides an annotation file based on the Kinetics-400/600/700 on [OpenDataLab](https://opendatalab.com/). So we suggest you download Kinetics-400/600/700 first from OpenDataLab by [MIM](https://github.com/open-mmlab/mim). ```shell -# install OpenDataLab CLI tools -pip install -U opendatalab -# log in OpenDataLab -odl login +# install OpenXlab CLI tools +pip install -U openxlab +# log in OpenXLab +openxlab login # download Kinetics-400/600/700, note that this might take a long time. 
mim download mmaction2 --dataset kinetics400 mim download mmaction2 --dataset kinetics600 diff --git a/tools/data/kinetics710/README_zh-CN.md b/tools/data/kinetics710/README_zh-CN.md index b428be43df..650a05c7d5 100644 --- a/tools/data/kinetics710/README_zh-CN.md +++ b/tools/data/kinetics710/README_zh-CN.md @@ -23,10 +23,10 @@ Kinetics-710 是基于 Kinetics-400/600/700 的视频数据集,它合并了这些 Kinetics 数据集的训练集,并根据 Youtube ID 删除了重复的视频。MMAction2 提供了一个基于 Kinetics-400/600/700 的 OpenDataLab 版本的标注文件,你可以通过 [MIM](https://github.com/open-mmlab/mim) 从 OpenDataLab 下载。 ```shell -# 安装 OpenDataLab CLI 工具 -pip install -U opendatalab -# 登录 OpenDataLab -odl login +# 安装 OpenXLab CLI 工具 +pip install -U openxlab +# 登录 OpenXLab +openxlab login # 下载 Kinetics-400/600/700,注意这可能需要很长时间。 mim download mmaction2 --dataset kinetics400 mim download mmaction2 --dataset kinetics600 diff --git a/tools/data/sthv2/README.md b/tools/data/sthv2/README.md index 5e05e6ff90..5c6ee953b4 100644 --- a/tools/data/sthv2/README.md +++ b/tools/data/sthv2/README.md @@ -22,10 +22,10 @@ For basic dataset information, you can refer to the dataset [website](https://de ````{group-tab} Download by MIM MIM supports downloading from OpenDataLab and preprocessing Something-Something V2 dataset with one command line. ```Bash -# install OpenDataLab CLI tools -pip install -U opendatalab -# log in OpenDataLab -odl login +# install OpenXlab CLI tools +pip install -U openxlab +# log in OpenXLab +openxlab login # download and preprocess by MIM mim download mmaction2 --dataset sthv2 ``` diff --git a/tools/data/sthv2/README_zh-CN.md b/tools/data/sthv2/README_zh-CN.md index 0bc5baf3fc..83647eb735 100644 --- a/tools/data/sthv2/README_zh-CN.md +++ b/tools/data/sthv2/README_zh-CN.md @@ -22,10 +22,10 @@ ````{group-tab} 使用 MIM 下载 # MIM 支持下载 Something-Something V2 数据集。用户可以通过一行命令,从 OpenDataLab 进行下载,并进行预处理。 ```Bash -# 安装 OpenDataLab CLI 工具 -pip install -U opendatalab -# 登录 OpenDataLab -odl login +# 安装 OpenXLab CLI 工具 +pip install -U openxlab +# 登录 OpenXLab +openxlab login # 通过 MIM 进行数据集下载,预处理。注意这将花费较长时间 mim download mmaction2 --dataset sthv2 ``` diff --git a/tools/data/sthv2/preprocss.sh b/tools/data/sthv2/preprocss.sh index 440a3d42ba..39b574d317 100644 --- a/tools/data/sthv2/preprocss.sh +++ b/tools/data/sthv2/preprocss.sh @@ -3,6 +3,6 @@ DOWNLOAD_DIR=$1 DATA_ROOT=$2 -cat $DOWNLOAD_DIR/sthv2/raw/*.tar.gz | tar -xvz -C $(dirname $DATA_ROOT) +cat $DOWNLOAD_DIR/OpenDataLab___sthv2/raw/*.tar.gz | tar -xvz -C $(dirname $DATA_ROOT) tar -xvf $DATA_ROOT/sthv2.tar -C $(dirname $DATA_ROOT) rm $DATA_ROOT/sthv2.tar
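The `resnet_tsm.py`/`resnet_tin.py` hunks earlier in this patch replace the hard-coded `['.net', '.block']` prefix list in `load_original_weights` with a `_get_wrap_prefix()` hook, which `ResNetTIN` overrides to return `['.net2']` (its empty `init_weights` override is dropped at the same time, so it falls back to the parent's loading path). A minimal standalone sketch of the pattern, with simplified class names rather than the actual mmaction modules:

```python
# Sketch of the _get_wrap_prefix() hook: the base class strips its wrapper
# prefixes from module names so they match the original checkpoint keys, and
# a subclass only overrides the prefix list, not the remapping logic.
class TSMLikeBackbone:
    def _get_wrap_prefix(self):
        return ['.net', '.block']

    def to_original_name(self, name: str) -> str:
        for wrap_prefix in self._get_wrap_prefix():
            if wrap_prefix in name:
                name = name.replace(wrap_prefix, '')
        return name


class TINLikeBackbone(TSMLikeBackbone):
    def _get_wrap_prefix(self):
        # TIN wraps the original convs in a different attribute than TSM.
        return ['.net2']


assert TSMLikeBackbone().to_original_name('layer1.0.conv1.net') == 'layer1.0.conv1'
assert TINLikeBackbone().to_original_name('layer1.0.conv1.net2') == 'layer1.0.conv1'
```

With the hook in place, the checkpoint-key remapping in `load_original_weights` is shared, and a subclass only has to state which wrapper suffixes should be stripped from its parameter names.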