diff --git a/.dev_scripts/benchmark_regression/1-benchmark_valid.py b/.dev_scripts/benchmark_regression/1-benchmark_valid.py index e3cb1ac0057..3fbf16e707d 100644 --- a/.dev_scripts/benchmark_regression/1-benchmark_valid.py +++ b/.dev_scripts/benchmark_regression/1-benchmark_valid.py @@ -128,18 +128,19 @@ def inference(config_file, checkpoint, classes, args): if args.flops: from mmcv.cnn.utils import get_model_complexity_info - if hasattr(model, 'extract_feat'): - model.forward = model.extract_feat - flops, params = get_model_complexity_info( - model, - input_shape=(3, ) + resolution, - print_per_layer_stat=False, - as_strings=args.flops_str) - result['flops'] = flops if args.flops_str else int(flops) - result['params'] = params if args.flops_str else int(params) - else: - result['flops'] = '' - result['params'] = '' + with torch.no_grad(): + if hasattr(model, 'extract_feat'): + model.forward = model.extract_feat + flops, params = get_model_complexity_info( + model, + input_shape=(3, ) + resolution, + print_per_layer_stat=False, + as_strings=args.flops_str) + result['flops'] = flops if args.flops_str else int(flops) + result['params'] = params if args.flops_str else int(params) + else: + result['flops'] = '' + result['params'] = '' return result @@ -199,6 +200,9 @@ def main(args): summary_data = {} for model_name, model_info in models.items(): + if model_info.config is None: + continue + config = Path(model_info.config) assert config.exists(), f'{model_name}: {config} not found.' diff --git a/.dev_scripts/benchmark_regression/2-benchmark_test.py b/.dev_scripts/benchmark_regression/2-benchmark_test.py index bbf316153b0..9274a980711 100644 --- a/.dev_scripts/benchmark_regression/2-benchmark_test.py +++ b/.dev_scripts/benchmark_regression/2-benchmark_test.py @@ -163,6 +163,10 @@ def test(args): preview_script = '' for model_info in models.values(): + + if model_info.results is None: + continue + script_path = create_test_job_batch(commands, model_info, args, port, script_name) preview_script = script_path or preview_script @@ -288,6 +292,9 @@ def summary(args): summary_data = {} for model_name, model_info in models.items(): + if model_info.results is None: + continue + # Skip if not found result file. result_file = work_dir / model_name / 'result.pkl' if not result_file.exists(): diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5ccceeac0ba..ff209335ee0 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -9,9 +9,8 @@ on: - 'README.md' - 'README_zh-CN.md' - 'model-index.yml' - - 'configs/**.md' + - 'configs/**' - 'docs/**' - - 'docs_zh-CN/**' - 'demo/**' - '.dev_scripts/**' @@ -20,9 +19,8 @@ on: - 'README.md' - 'README_zh-CN.md' - 'model-index.yml' - - 'configs/**.md' + - 'configs/**' - 'docs/**' - - 'docs_zh-CN/**' - 'demo/**' - '.dev_scripts/**' diff --git a/README.md b/README.md index 50f2801f879..4c774b90870 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,13 @@ The master branch works with **PyTorch 1.5+**. ## What's new +v0.20.0 was released in 30/1/2022. + +Highlights of the new version: +- Support **K-fold cross-validation**. The tutorial will be released later. +- Support **HRNet**, **ConvNeXt**, **Twins** and **EfficientNet**. +- Support model conversion from PyTorch to **Core ML** by a tool. + v0.19.0 was released in 31/12/2021. Highlights of the new version: @@ -68,12 +75,6 @@ Highlights of the new version: - Support **DeiT** & **Conformer** backbone and checkpoints. 
- Provide a **CAM visualization** tool based on [pytorch-grad-cam](https://github.com/jacobgil/pytorch-grad-cam), and detailed [user guide](https://mmclassification.readthedocs.io/en/latest/tools/visualization.html#class-activation-map-visualization)! -v0.18.0 was released in 30/11/2021. - -Highlights of the new version: -- Support **MLP-Mixer** backbone and provide pre-trained checkpoints. -- Add a tool to **visualize the learning rate curve** of the training phase. Welcome to use with the [tutorial](https://mmclassification.readthedocs.io/en/latest/tools/visualization.html#learning-rate-schedule-visualization)! - Please refer to [changelog.md](docs/en/changelog.md) for more details and other release history. ## Installation @@ -123,9 +124,10 @@ Results and models are available in the [model zoo](https://mmclassification.rea - [x] [DeiT](https://github.com/open-mmlab/mmclassification/tree/master/configs/deit) - [x] [Conformer](https://github.com/open-mmlab/mmclassification/tree/master/configs/conformer) - [x] [T2T-ViT](https://github.com/open-mmlab/mmclassification/tree/master/configs/t2t_vit) -- [ ] EfficientNet -- [ ] Twins -- [ ] HRNet +- [x] [Twins](https://github.com/open-mmlab/mmclassification/tree/master/configs/twins) +- [x] [EfficientNet](https://github.com/open-mmlab/mmclassification/tree/master/configs/efficientnet) +- [x] [ConvNeXt](https://github.com/open-mmlab/mmclassification/tree/master/configs/convnext) +- [x] [HRNet](https://github.com/open-mmlab/mmclassification/tree/master/configs/hrnet) diff --git a/README_zh-CN.md b/README_zh-CN.md index 8fb786c4b31..815ea07d070 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -57,6 +57,13 @@ MMClassification 是一款基于 PyTorch 的开源图像分类工具箱,是 [O ## 更新日志 +2022/1/30 发布了 v0.20.0 版本 + +新版本亮点: +- 支持 **K 折交叉验证** 工具。相应文档会在后续添加。 +- 支持了 **HRNet**,**ConvNeXt**,**Twins** 以及 **EfficientNet** 四个主干网络,欢迎使用! +- 支持了从 PyTorch 模型到 Core-ML 模型的转换工具。 + 2021/12/31 发布了 v0.19.0 版本 新版本亮点: @@ -66,12 +73,6 @@ MMClassification 是一款基于 PyTorch 的开源图像分类工具箱,是 [O - 支持了 **DeiT** 和 **Conformer** 主干网络,并提供了预训练模型。 - 提供了一个 **CAM 可视化** 工具。该工具基于 [pytorch-grad-cam](https://github.com/jacobgil/pytorch-grad-cam),我们提供了详细的 [使用教程](https://mmclassification.readthedocs.io/en/latest/tools/visualization.html#class-activation-map-visualization)! -2021/11/30 发布了 v0.18.0 版本 - -新版本亮点: -- 支持了 **MLP-Mixer** 主干网络,欢迎使用! 
-- 添加了一个**可视化学习率曲线**的工具,可以参考[教程](https://mmclassification.readthedocs.io/zh_CN/latest/tools/visualization.html#id3)使用 - 发布历史和更新细节请参考 [更新日志](docs/en/changelog.md) ## 安装 @@ -121,9 +122,10 @@ MMClassification 是一款基于 PyTorch 的开源图像分类工具箱,是 [O - [x] [DeiT](https://github.com/open-mmlab/mmclassification/tree/master/configs/deit) - [x] [Conformer](https://github.com/open-mmlab/mmclassification/tree/master/configs/conformer) - [x] [T2T-ViT](https://github.com/open-mmlab/mmclassification/tree/master/configs/t2t_vit) -- [ ] EfficientNet -- [ ] Twins -- [ ] HRNet +- [x] [Twins](https://github.com/open-mmlab/mmclassification/tree/master/configs/twins) +- [x] [EfficientNet](https://github.com/open-mmlab/mmclassification/tree/master/configs/efficientnet) +- [x] [ConvNeXt](https://github.com/open-mmlab/mmclassification/tree/master/configs/convnext) +- [x] [HRNet](https://github.com/open-mmlab/mmclassification/tree/master/configs/hrnet) diff --git a/configs/_base_/models/convnext/convnext-base.py b/configs/_base_/models/convnext/convnext-base.py new file mode 100644 index 00000000000..7fc5ce71a74 --- /dev/null +++ b/configs/_base_/models/convnext/convnext-base.py @@ -0,0 +1,23 @@ +# Model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ConvNeXt', + arch='base', + out_indices=(3, ), + drop_path_rate=0.5, + gap_before_final_norm=True, + init_cfg=[ + dict( + type='TruncNormal', + layer=['Conv2d', 'Linear'], + std=.02, + bias=0.), + dict(type='Constant', layer=['LayerNorm'], val=1., bias=0.), + ]), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1024, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + )) diff --git a/configs/_base_/models/convnext/convnext-large.py b/configs/_base_/models/convnext/convnext-large.py new file mode 100644 index 00000000000..4d9e37c0df9 --- /dev/null +++ b/configs/_base_/models/convnext/convnext-large.py @@ -0,0 +1,23 @@ +# Model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ConvNeXt', + arch='large', + out_indices=(3, ), + drop_path_rate=0.5, + gap_before_final_norm=True, + init_cfg=[ + dict( + type='TruncNormal', + layer=['Conv2d', 'Linear'], + std=.02, + bias=0.), + dict(type='Constant', layer=['LayerNorm'], val=1., bias=0.), + ]), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1536, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + )) diff --git a/configs/_base_/models/convnext/convnext-small.py b/configs/_base_/models/convnext/convnext-small.py new file mode 100644 index 00000000000..989ad1d4e63 --- /dev/null +++ b/configs/_base_/models/convnext/convnext-small.py @@ -0,0 +1,23 @@ +# Model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ConvNeXt', + arch='small', + out_indices=(3, ), + drop_path_rate=0.4, + gap_before_final_norm=True, + init_cfg=[ + dict( + type='TruncNormal', + layer=['Conv2d', 'Linear'], + std=.02, + bias=0.), + dict(type='Constant', layer=['LayerNorm'], val=1., bias=0.), + ]), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=768, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + )) diff --git a/configs/_base_/models/convnext/convnext-tiny.py b/configs/_base_/models/convnext/convnext-tiny.py new file mode 100644 index 00000000000..0b692abb1cb --- /dev/null +++ b/configs/_base_/models/convnext/convnext-tiny.py @@ -0,0 +1,23 @@ +# Model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ConvNeXt', + arch='tiny', + out_indices=(3, ), + drop_path_rate=0.1, + 
gap_before_final_norm=True, + init_cfg=[ + dict( + type='TruncNormal', + layer=['Conv2d', 'Linear'], + std=.02, + bias=0.), + dict(type='Constant', layer=['LayerNorm'], val=1., bias=0.), + ]), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=768, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + )) diff --git a/configs/_base_/models/convnext/convnext-xlarge.py b/configs/_base_/models/convnext/convnext-xlarge.py new file mode 100644 index 00000000000..0c75e32547b --- /dev/null +++ b/configs/_base_/models/convnext/convnext-xlarge.py @@ -0,0 +1,23 @@ +# Model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='ConvNeXt', + arch='xlarge', + out_indices=(3, ), + drop_path_rate=0.5, + gap_before_final_norm=True, + init_cfg=[ + dict( + type='TruncNormal', + layer=['Conv2d', 'Linear'], + std=.02, + bias=0.), + dict(type='Constant', layer=['LayerNorm'], val=1., bias=0.), + ]), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=2048, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + )) diff --git a/configs/_base_/models/efficientnet_b0.py b/configs/_base_/models/efficientnet_b0.py new file mode 100644 index 00000000000..d9ba685306c --- /dev/null +++ b/configs/_base_/models/efficientnet_b0.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNet', arch='b0'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1280, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_b1.py b/configs/_base_/models/efficientnet_b1.py new file mode 100644 index 00000000000..63e15c88b2f --- /dev/null +++ b/configs/_base_/models/efficientnet_b1.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNet', arch='b1'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1280, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_b2.py b/configs/_base_/models/efficientnet_b2.py new file mode 100644 index 00000000000..5edcfa5d5b6 --- /dev/null +++ b/configs/_base_/models/efficientnet_b2.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNet', arch='b2'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1408, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_b3.py b/configs/_base_/models/efficientnet_b3.py new file mode 100644 index 00000000000..c7c6d6d899e --- /dev/null +++ b/configs/_base_/models/efficientnet_b3.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNet', arch='b3'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1536, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_b4.py b/configs/_base_/models/efficientnet_b4.py new file mode 100644 index 00000000000..06840ed559c --- /dev/null +++ b/configs/_base_/models/efficientnet_b4.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNet', arch='b4'), + 
neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1792, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_b5.py b/configs/_base_/models/efficientnet_b5.py new file mode 100644 index 00000000000..a86eebd1904 --- /dev/null +++ b/configs/_base_/models/efficientnet_b5.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNet', arch='b5'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=2048, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_b6.py b/configs/_base_/models/efficientnet_b6.py new file mode 100644 index 00000000000..4eada1d3251 --- /dev/null +++ b/configs/_base_/models/efficientnet_b6.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNet', arch='b6'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=2304, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_b7.py b/configs/_base_/models/efficientnet_b7.py new file mode 100644 index 00000000000..1d84ba427f4 --- /dev/null +++ b/configs/_base_/models/efficientnet_b7.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNet', arch='b7'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=2560, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_b8.py b/configs/_base_/models/efficientnet_b8.py new file mode 100644 index 00000000000..c9500644dae --- /dev/null +++ b/configs/_base_/models/efficientnet_b8.py @@ -0,0 +1,12 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='EfficientNet', arch='b8'), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=2816, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_em.py b/configs/_base_/models/efficientnet_em.py new file mode 100644 index 00000000000..abecdbeef6c --- /dev/null +++ b/configs/_base_/models/efficientnet_em.py @@ -0,0 +1,13 @@ +# model settings +model = dict( + type='ImageClassifier', + # `em` means EfficientNet-EdgeTPU-M arch + backbone=dict(type='EfficientNet', arch='em', act_cfg=dict(type='ReLU')), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1280, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/efficientnet_es.py b/configs/_base_/models/efficientnet_es.py new file mode 100644 index 00000000000..911ba4a1826 --- /dev/null +++ b/configs/_base_/models/efficientnet_es.py @@ -0,0 +1,13 @@ +# model settings +model = dict( + type='ImageClassifier', + # `es` means EfficientNet-EdgeTPU-S arch + backbone=dict(type='EfficientNet', arch='es', act_cfg=dict(type='ReLU')), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=1280, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git 
a/configs/_base_/models/hrnet/hrnet-w18.py b/configs/_base_/models/hrnet/hrnet-w18.py new file mode 100644 index 00000000000..f7fbf298d5b --- /dev/null +++ b/configs/_base_/models/hrnet/hrnet-w18.py @@ -0,0 +1,15 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='HRNet', arch='w18'), + neck=[ + dict(type='HRFuseScales', in_channels=(18, 36, 72, 144)), + dict(type='GlobalAveragePooling'), + ], + head=dict( + type='LinearClsHead', + in_channels=2048, + num_classes=1000, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/hrnet/hrnet-w30.py b/configs/_base_/models/hrnet/hrnet-w30.py new file mode 100644 index 00000000000..babcacac59a --- /dev/null +++ b/configs/_base_/models/hrnet/hrnet-w30.py @@ -0,0 +1,15 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='HRNet', arch='w30'), + neck=[ + dict(type='HRFuseScales', in_channels=(30, 60, 120, 240)), + dict(type='GlobalAveragePooling'), + ], + head=dict( + type='LinearClsHead', + in_channels=2048, + num_classes=1000, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/hrnet/hrnet-w32.py b/configs/_base_/models/hrnet/hrnet-w32.py new file mode 100644 index 00000000000..2c1e980048d --- /dev/null +++ b/configs/_base_/models/hrnet/hrnet-w32.py @@ -0,0 +1,15 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='HRNet', arch='w32'), + neck=[ + dict(type='HRFuseScales', in_channels=(32, 64, 128, 256)), + dict(type='GlobalAveragePooling'), + ], + head=dict( + type='LinearClsHead', + in_channels=2048, + num_classes=1000, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/hrnet/hrnet-w40.py b/configs/_base_/models/hrnet/hrnet-w40.py new file mode 100644 index 00000000000..83f65d86467 --- /dev/null +++ b/configs/_base_/models/hrnet/hrnet-w40.py @@ -0,0 +1,15 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='HRNet', arch='w40'), + neck=[ + dict(type='HRFuseScales', in_channels=(40, 80, 160, 320)), + dict(type='GlobalAveragePooling'), + ], + head=dict( + type='LinearClsHead', + in_channels=2048, + num_classes=1000, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/hrnet/hrnet-w44.py b/configs/_base_/models/hrnet/hrnet-w44.py new file mode 100644 index 00000000000..e75dc0f891f --- /dev/null +++ b/configs/_base_/models/hrnet/hrnet-w44.py @@ -0,0 +1,15 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='HRNet', arch='w44'), + neck=[ + dict(type='HRFuseScales', in_channels=(44, 88, 176, 352)), + dict(type='GlobalAveragePooling'), + ], + head=dict( + type='LinearClsHead', + in_channels=2048, + num_classes=1000, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/hrnet/hrnet-w48.py b/configs/_base_/models/hrnet/hrnet-w48.py new file mode 100644 index 00000000000..f0604958481 --- /dev/null +++ b/configs/_base_/models/hrnet/hrnet-w48.py @@ -0,0 +1,15 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='HRNet', arch='w48'), + neck=[ + dict(type='HRFuseScales', in_channels=(48, 96, 192, 384)), + dict(type='GlobalAveragePooling'), + ], + head=dict( + type='LinearClsHead', + in_channels=2048, + num_classes=1000, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff 
--git a/configs/_base_/models/hrnet/hrnet-w64.py b/configs/_base_/models/hrnet/hrnet-w64.py new file mode 100644 index 00000000000..844c3fe9413 --- /dev/null +++ b/configs/_base_/models/hrnet/hrnet-w64.py @@ -0,0 +1,15 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict(type='HRNet', arch='w64'), + neck=[ + dict(type='HRFuseScales', in_channels=(64, 128, 256, 512)), + dict(type='GlobalAveragePooling'), + ], + head=dict( + type='LinearClsHead', + in_channels=2048, + num_classes=1000, + loss=dict(type='CrossEntropyLoss', loss_weight=1.0), + topk=(1, 5), + )) diff --git a/configs/_base_/models/twins_pcpvt_base.py b/configs/_base_/models/twins_pcpvt_base.py new file mode 100644 index 00000000000..473d7ee817f --- /dev/null +++ b/configs/_base_/models/twins_pcpvt_base.py @@ -0,0 +1,30 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='PCPVT', + arch='base', + in_channels=3, + out_indices=(3, ), + qkv_bias=True, + norm_cfg=dict(type='LN', eps=1e-06), + norm_after_stage=[False, False, False, True], + drop_rate=0.0, + attn_drop_rate=0., + drop_path_rate=0.3), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=512, + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + cal_acc=False), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) + ], + train_cfg=dict(augments=[ + dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5), + dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5) + ])) diff --git a/configs/_base_/models/twins_svt_base.py b/configs/_base_/models/twins_svt_base.py new file mode 100644 index 00000000000..cabd373961b --- /dev/null +++ b/configs/_base_/models/twins_svt_base.py @@ -0,0 +1,30 @@ +# model settings +model = dict( + type='ImageClassifier', + backbone=dict( + type='SVT', + arch='base', + in_channels=3, + out_indices=(3, ), + qkv_bias=True, + norm_cfg=dict(type='LN'), + norm_after_stage=[False, False, False, True], + drop_rate=0.0, + attn_drop_rate=0., + drop_path_rate=0.3), + neck=dict(type='GlobalAveragePooling'), + head=dict( + type='LinearClsHead', + num_classes=1000, + in_channels=768, + loss=dict( + type='LabelSmoothLoss', label_smooth_val=0.1, mode='original'), + cal_acc=False), + init_cfg=[ + dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.), + dict(type='Constant', layer='LayerNorm', val=1., bias=0.) 
+ ], + train_cfg=dict(augments=[ + dict(type='BatchMixup', alpha=0.8, num_classes=1000, prob=0.5), + dict(type='BatchCutMix', alpha=1.0, num_classes=1000, prob=0.5) + ])) diff --git a/configs/_base_/schedules/imagenet_bs1024_adamw_swin.py b/configs/_base_/schedules/imagenet_bs1024_adamw_swin.py index 1a523e44ddd..8ae7042c02a 100644 --- a/configs/_base_/schedules/imagenet_bs1024_adamw_swin.py +++ b/configs/_base_/schedules/imagenet_bs1024_adamw_swin.py @@ -24,7 +24,7 @@ min_lr_ratio=1e-2, warmup='linear', warmup_ratio=1e-3, - warmup_iters=20 * 1252, - warmup_by_epoch=False) + warmup_iters=20, + warmup_by_epoch=True) runner = dict(type='EpochBasedRunner', max_epochs=300) diff --git a/configs/conformer/README.md b/configs/conformer/README.md index 596911a0aed..ff91ed2081e 100644 --- a/configs/conformer/README.md +++ b/configs/conformer/README.md @@ -1,28 +1,16 @@ -# Conformer: Local Features Coupling Global Representations for Visual Recognition - +# Conformer + +> [Conformer: Local Features Coupling Global Representations for Visual Recognition](https://arxiv.org/abs/2105.03889) ## Abstract - Within Convolutional Neural Network (CNN), the convolution operations are good at extracting local features but experience difficulty to capture global representations. Within visual transformer, the cascaded self-attention modules can capture long-distance feature dependencies but unfortunately deteriorate local feature details. In this paper, we propose a hybrid network structure, termed Conformer, to take advantage of convolutional operations and self-attention mechanisms for enhanced representation learning. Conformer roots in the Feature Coupling Unit (FCU), which fuses local features and global representations under different resolutions in an interactive fashion. Conformer adopts a concurrent structure so that local features and global representations are retained to the maximum extent. Experiments show that Conformer, under the comparable parameter complexity, outperforms the visual transformer (DeiT-B) by 2.3% on ImageNet. On MSCOCO, it outperforms ResNet-101 by 3.7% and 3.6% mAPs for object detection and instance segmentation, respectively, demonstrating the great potential to be a general backbone network. -
-## Citation - -```latex -@article{peng2021conformer, - title={Conformer: Local Features Coupling Global Representations for Visual Recognition}, - author={Zhiliang Peng and Wei Huang and Shanzhi Gu and Lingxi Xie and Yaowei Wang and Jianbin Jiao and Qixiang Ye}, - journal={arXiv preprint arXiv:2105.03889}, - year={2021}, -} -``` - ## Results and models ### ImageNet-1k @@ -35,3 +23,14 @@ Within Convolutional Neural Network (CNN), the convolution operations are good a | Conformer-base-p16\* | 83.29 | 22.89 | 83.82 | 96.59 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/conformer/conformer-base-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-base-p16_3rdparty_8xb128_in1k_20211206-bfdf8637.pth) | *Models with \* are converted from the [official repo](https://github.com/pengzhiliang/Conformer). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + +## Citation + +``` +@article{peng2021conformer, + title={Conformer: Local Features Coupling Global Representations for Visual Recognition}, + author={Zhiliang Peng and Wei Huang and Shanzhi Gu and Lingxi Xie and Yaowei Wang and Jianbin Jiao and Qixiang Ye}, + journal={arXiv preprint arXiv:2105.03889}, + year={2021}, +} +``` diff --git a/configs/conformer/metafile.yml b/configs/conformer/metafile.yml index 31d28740756..4efe05fb8fd 100644 --- a/configs/conformer/metafile.yml +++ b/configs/conformer/metafile.yml @@ -10,9 +10,9 @@ Collections: URL: https://arxiv.org/abs/2105.03889 Title: "Conformer: Local Features Coupling Global Representations for Visual Recognition" README: configs/conformer/README.md -# Code: -# URL: # todo -# Version: # todo + Code: + URL: https://github.com/open-mmlab/mmclassification/blob/v0.19.0/mmcls/models/backbones/conformer.py + Version: v0.19.0 Models: - Name: conformer-tiny-p16_3rdparty_8xb128_in1k diff --git a/configs/convnext/README.md b/configs/convnext/README.md new file mode 100644 index 00000000000..fee44db57cd --- /dev/null +++ b/configs/convnext/README.md @@ -0,0 +1,53 @@ +# ConvNeXt + +> [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545v1) + + +## Abstract + + +The "Roaring 20s" of visual recognition began with the introduction of Vision Transformers (ViTs), which quickly superseded ConvNets as the state-of-the-art image classification model. A vanilla ViT, on the other hand, faces difficulties when applied to general computer vision tasks such as object detection and semantic segmentation. It is the hierarchical Transformers (e.g., Swin Transformers) that reintroduced several ConvNet priors, making Transformers practically viable as a generic vision backbone and demonstrating remarkable performance on a wide variety of vision tasks. However, the effectiveness of such hybrid approaches is still largely credited to the intrinsic superiority of Transformers, rather than the inherent inductive biases of convolutions. In this work, we reexamine the design spaces and test the limits of what a pure ConvNet can achieve. We gradually "modernize" a standard ResNet toward the design of a vision Transformer, and discover several key components that contribute to the performance difference along the way. The outcome of this exploration is a family of pure ConvNet models dubbed ConvNeXt. 
Constructed entirely from standard ConvNet modules, ConvNeXts compete favorably with Transformers in terms of accuracy and scalability, achieving 87.8% ImageNet top-1 accuracy and outperforming Swin Transformers on COCO detection and ADE20K segmentation, while maintaining the simplicity and efficiency of standard ConvNets. + + +
+ +
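As a quick sanity check of the new ConvNeXt configs, here is a minimal usage sketch (not part of this patch). It assumes MMClassification v0.20.0 is installed and the command is run from the repository root; the config path and converted checkpoint URL are the ConvNeXt-T entries listed in the results table below.

```python
# Minimal sketch: classify one image with the new ConvNeXt-T config via the
# high-level inference APIs. Assumes mmcls>=0.20.0 and a repo-root working dir.
from mmcls.apis import inference_model, init_model

config = 'configs/convnext/convnext-tiny_32xb128_in1k.py'
checkpoint = ('https://download.openmmlab.com/mmclassification/v0/convnext/'
              'convnext-tiny_3rdparty_32xb128_in1k_20220124-18abde00.pth')

model = init_model(config, checkpoint, device='cpu')  # use 'cuda:0' if available
result = inference_model(model, 'demo/demo.JPEG')     # any image path works
print(result['pred_class'], result['pred_score'])
```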
+ +## Results and models + +### ImageNet-1k + +| Model | Pretrain | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +|:---------------:|:------------:|:---------:|:--------:|:---------:|:---------:|:------:|:--------:| +| ConvNeXt-T\* | From scratch | 28.59 | 4.46 | 82.05 | 95.86 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-tiny_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_3rdparty_32xb128_in1k_20220124-18abde00.pth) | +| ConvNeXt-S\* | From scratch | 50.22 | 8.69 | 83.13 | 96.44 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-small_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_3rdparty_32xb128_in1k_20220124-d39b5192.pth) | +| ConvNeXt-B\* | From scratch | 88.59 | 15.36 | 83.85 | 96.74 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128_in1k_20220124-d0915162.pth) | +| ConvNeXt-B\* | ImageNet-21k | 88.59 | 15.36 | 85.81 | 97.86 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_in21k-pre-3rdparty_32xb128_in1k_20220124-eb2d6ada.pth) | +| ConvNeXt-L\* | From scratch | 197.77 | 34.37 | 84.30 | 96.89 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_64xb64_in1k_20220124-f8a0ded0.pth) | +| ConvNeXt-L\* | ImageNet-21k | 197.77 | 34.37 | 86.61 | 98.04 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_in21k-pre-3rdparty_64xb64_in1k_20220124-2412403d.pth) | +| ConvNeXt-XL\* | ImageNet-21k | 350.20 | 60.93 | 86.97 | 98.20 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-xlarge_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k_20220124-76b6863d.pth) | + +*Models with \* are converted from the [official repo](https://github.com/facebookresearch/ConvNeXt). The config files of these models are only for inference. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + +### ImageNet-21k + +The pre-trained models on ImageNet-21k are used to fine-tune, and therefore don't have evaluation results. 
+ +| Model | Params(M) | Flops(G) | Download | +|:--------------------------------:|:---------:|:--------:|:--------:| +| convnext-base_3rdparty_in21k\* | 88.59 | 15.36 | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_in21k_20220124-13b83eec.pth) | +| convnext-large_3rdparty_in21k\* | 197.77 | 34.37 | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_in21k_20220124-41b5a79f.pth) | +| convnext-xlarge_3rdparty_in21k\* | 350.20 | 60.93 | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_3rdparty_in21k_20220124-f909bad7.pth) | + +*Models with \* are converted from the [official repo](https://github.com/facebookresearch/ConvNeXt).* + +## Citation + +```bibtex +@Article{liu2022convnet, + author = {Zhuang Liu and Hanzi Mao and Chao-Yuan Wu and Christoph Feichtenhofer and Trevor Darrell and Saining Xie}, + title = {A ConvNet for the 2020s}, + journal = {arXiv preprint arXiv:2201.03545}, + year = {2022}, +} +``` diff --git a/configs/convnext/convnext-base_32xb128_in1k.py b/configs/convnext/convnext-base_32xb128_in1k.py new file mode 100644 index 00000000000..6c0450a4341 --- /dev/null +++ b/configs/convnext/convnext-base_32xb128_in1k.py @@ -0,0 +1,12 @@ +_base_ = [ + '../_base_/models/convnext/convnext-base.py', + '../_base_/datasets/imagenet_bs64_swin_224.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', +] + +data = dict(samples_per_gpu=128) + +optimizer = dict(lr=4e-3) + +custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] diff --git a/configs/convnext/convnext-large_64xb64_in1k.py b/configs/convnext/convnext-large_64xb64_in1k.py new file mode 100644 index 00000000000..1faae253436 --- /dev/null +++ b/configs/convnext/convnext-large_64xb64_in1k.py @@ -0,0 +1,12 @@ +_base_ = [ + '../_base_/models/convnext/convnext-large.py', + '../_base_/datasets/imagenet_bs64_swin_224.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', +] + +data = dict(samples_per_gpu=64) + +optimizer = dict(lr=4e-3) + +custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] diff --git a/configs/convnext/convnext-small_32xb128_in1k.py b/configs/convnext/convnext-small_32xb128_in1k.py new file mode 100644 index 00000000000..d820fc6cac9 --- /dev/null +++ b/configs/convnext/convnext-small_32xb128_in1k.py @@ -0,0 +1,12 @@ +_base_ = [ + '../_base_/models/convnext/convnext-small.py', + '../_base_/datasets/imagenet_bs64_swin_224.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', +] + +data = dict(samples_per_gpu=128) + +optimizer = dict(lr=4e-3) + +custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] diff --git a/configs/convnext/convnext-tiny_32xb128_in1k.py b/configs/convnext/convnext-tiny_32xb128_in1k.py new file mode 100644 index 00000000000..46d0185d8ab --- /dev/null +++ b/configs/convnext/convnext-tiny_32xb128_in1k.py @@ -0,0 +1,12 @@ +_base_ = [ + '../_base_/models/convnext/convnext-tiny.py', + '../_base_/datasets/imagenet_bs64_swin_224.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', +] + +data = dict(samples_per_gpu=128) + +optimizer = dict(lr=4e-3) + +custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] diff --git a/configs/convnext/convnext-xlarge_64xb64_in1k.py b/configs/convnext/convnext-xlarge_64xb64_in1k.py new file mode 100644 index 
00000000000..72849013df6 --- /dev/null +++ b/configs/convnext/convnext-xlarge_64xb64_in1k.py @@ -0,0 +1,12 @@ +_base_ = [ + '../_base_/models/convnext/convnext-xlarge.py', + '../_base_/datasets/imagenet_bs64_swin_224.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py', +] + +data = dict(samples_per_gpu=64) + +optimizer = dict(lr=4e-3) + +custom_hooks = [dict(type='EMAHook', momentum=4e-5, priority='ABOVE_NORMAL')] diff --git a/configs/convnext/metafile.yml b/configs/convnext/metafile.yml new file mode 100644 index 00000000000..3825dc4a5e1 --- /dev/null +++ b/configs/convnext/metafile.yml @@ -0,0 +1,167 @@ +Collections: + - Name: ConvNeXt + Metadata: + Training Data: ImageNet-1k + Architecture: + - 1x1 Convolution + - LayerScale + Paper: + URL: https://arxiv.org/abs/2201.03545v1 + Title: A ConvNet for the 2020s + README: configs/convnext/README.md + Code: + Version: v0.20.0 + URL: https://github.com/open-mmlab/mmclassification/blob/v0.20.0/mmcls/models/backbones/convnext.py + +Models: + - Name: convnext-tiny_3rdparty_32xb128_in1k + Metadata: + FLOPs: 4457472768 + Parameters: 28589128 + In Collections: ConvNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 82.05 + Top 5 Accuracy: 95.86 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_3rdparty_32xb128_in1k_20220124-18abde00.pth + Config: configs/convnext/convnext-tiny_32xb128_in1k.py + Converted From: + Weights: https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth + Code: https://github.com/facebookresearch/ConvNeXt + - Name: convnext-small_3rdparty_32xb128_in1k + Metadata: + FLOPs: 8687008512 + Parameters: 50223688 + In Collections: ConvNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 83.13 + Top 5 Accuracy: 96.44 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_3rdparty_32xb128_in1k_20220124-d39b5192.pth + Config: configs/convnext/convnext-small_32xb128_in1k.py + Converted From: + Weights: https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth + Code: https://github.com/facebookresearch/ConvNeXt + - Name: convnext-base_3rdparty_32xb128_in1k + Metadata: + FLOPs: 15359124480 + Parameters: 88591464 + In Collections: ConvNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 83.85 + Top 5 Accuracy: 96.74 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128_in1k_20220124-d0915162.pth + Config: configs/convnext/convnext-base_32xb128_in1k.py + Converted From: + Weights: https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth + Code: https://github.com/facebookresearch/ConvNeXt + - Name: convnext-base_3rdparty_in21k + Metadata: + Training Data: ImageNet-21k + FLOPs: 15359124480 + Parameters: 88591464 + In Collections: ConvNeXt + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_in21k_20220124-13b83eec.pth + Converted From: + Weights: https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth + Code: https://github.com/facebookresearch/ConvNeXt + - Name: convnext-base_in21k-pre-3rdparty_32xb128_in1k + Metadata: + Training Data: + - ImageNet-21k + - ImageNet-1k + FLOPs: 15359124480 + Parameters: 88591464 + In Collections: ConvNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 85.81 + Top 5 Accuracy: 97.86 + Task: Image Classification + 
Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_in21k-pre-3rdparty_32xb128_in1k_20220124-eb2d6ada.pth + Config: configs/convnext/convnext-base_32xb128_in1k.py + Converted From: + Weights: https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_224.pth + Code: https://github.com/facebookresearch/ConvNeXt + - Name: convnext-large_3rdparty_64xb64_in1k + Metadata: + FLOPs: 34368026112 + Parameters: 197767336 + In Collections: ConvNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 84.30 + Top 5 Accuracy: 96.89 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_64xb64_in1k_20220124-f8a0ded0.pth + Config: configs/convnext/convnext-large_64xb64_in1k.py + Converted From: + Weights: https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth + Code: https://github.com/facebookresearch/ConvNeXt + - Name: convnext-large_3rdparty_in21k + Metadata: + Training Data: ImageNet-21k + FLOPs: 34368026112 + Parameters: 197767336 + In Collections: ConvNeXt + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_in21k_20220124-41b5a79f.pth + Converted From: + Weights: https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth + Code: https://github.com/facebookresearch/ConvNeXt + - Name: convnext-large_in21k-pre-3rdparty_64xb64_in1k + Metadata: + Training Data: + - ImageNet-21k + - ImageNet-1k + FLOPs: 34368026112 + Parameters: 197767336 + In Collections: ConvNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 86.61 + Top 5 Accuracy: 98.04 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_in21k-pre-3rdparty_64xb64_in1k_20220124-2412403d.pth + Config: configs/convnext/convnext-large_64xb64_in1k.py + Converted From: + Weights: https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_224.pth + Code: https://github.com/facebookresearch/ConvNeXt + - Name: convnext-xlarge_3rdparty_in21k + Metadata: + Training Data: ImageNet-21k + FLOPs: 60929820672 + Parameters: 350196968 + In Collections: ConvNeXt + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_3rdparty_in21k_20220124-f909bad7.pth + Converted From: + Weights: https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth + Code: https://github.com/facebookresearch/ConvNeXt + - Name: convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k + Metadata: + Training Data: + - ImageNet-21k + - ImageNet-1k + FLOPs: 60929820672 + Parameters: 350196968 + In Collections: ConvNeXt + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 86.97 + Top 5 Accuracy: 98.20 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k_20220124-76b6863d.pth + Config: configs/convnext/convnext-xlarge_64xb64_in1k.py + Converted From: + Weights: https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_224_ema.pth + Code: https://github.com/facebookresearch/ConvNeXt diff --git a/configs/deit/README.md b/configs/deit/README.md index 52a8be667a7..e488c4dc3b3 100644 --- a/configs/deit/README.md +++ b/configs/deit/README.md @@ -1,30 +1,16 @@ -# Training data-efficient image transformers & distillation through attention - +# DeiT + +> [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) ## Abstract - Recently, neural networks purely 
based on attention were shown to address image understanding tasks such as image classification. However, these visual transformers are pre-trained with hundreds of millions of images using an expensive infrastructure, thereby limiting their adoption. In this work, we produce a competitive convolution-free transformer by training on Imagenet only. We train them on a single computer in less than 3 days. Our reference vision transformer (86M parameters) achieves top-1 accuracy of 83.1% (single-crop evaluation) on ImageNet with no external data. More importantly, we introduce a teacher-student strategy specific to transformers. It relies on a distillation token ensuring that the student learns from the teacher through attention. We show the interest of this token-based distillation, especially when using a convnet as a teacher. This leads us to report results competitive with convnets for both Imagenet (where we obtain up to 85.2% accuracy) and when transferring to other tasks. We share our code and models. -
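For readers unfamiliar with the token-based distillation mentioned in the abstract, the fragment below sketches the hard-label variant described in the DeiT paper. It is illustrative only and is not how MMClassification implements the distilled checkpoints.

```python
import torch.nn.functional as F

def hard_distillation_loss(cls_logits, dist_logits, teacher_logits, target):
    """Hard-label distillation as described in the DeiT paper (illustrative).

    The class token is supervised by the ground-truth label, while the
    distillation token is supervised by the teacher's hard prediction.
    """
    loss_cls = F.cross_entropy(cls_logits, target)
    loss_dist = F.cross_entropy(dist_logits, teacher_logits.argmax(dim=1))
    return 0.5 * (loss_cls + loss_dist)
```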
-## Citation -```{latex} -@InProceedings{pmlr-v139-touvron21a, - title = {Training data-efficient image transformers & distillation through attention}, - author = {Touvron, Hugo and Cord, Matthieu and Douze, Matthijs and Massa, Francisco and Sablayrolles, Alexandre and Jegou, Herve}, - booktitle = {International Conference on Machine Learning}, - pages = {10347--10357}, - year = {2021}, - volume = {139}, - month = {July} -} -``` - ## Results and models ### ImageNet-1k @@ -48,3 +34,17 @@ The teacher of the distilled version DeiT is RegNetY-16GF. MMClassification doesn't support training the distilled version DeiT. And we provide distilled version checkpoints for inference only. ``` + +## Citation + +``` +@InProceedings{pmlr-v139-touvron21a, + title = {Training data-efficient image transformers & distillation through attention}, + author = {Touvron, Hugo and Cord, Matthieu and Douze, Matthijs and Massa, Francisco and Sablayrolles, Alexandre and Jegou, Herve}, + booktitle = {International Conference on Machine Learning}, + pages = {10347--10357}, + year = {2021}, + volume = {139}, + month = {July} +} +``` diff --git a/configs/deit/metafile.yml b/configs/deit/metafile.yml index 7d1980224ba..33a90dacbb6 100644 --- a/configs/deit/metafile.yml +++ b/configs/deit/metafile.yml @@ -11,6 +11,9 @@ Collections: URL: https://arxiv.org/abs/2012.12877 Title: "Training data-efficient image transformers & distillation through attention" README: configs/deit/README.md + Code: + URL: https://github.com/open-mmlab/mmclassification/blob/v0.19.0/mmcls/models/backbones/deit.py + Version: v0.19.0 Models: - Name: deit-tiny_3rdparty_pt-4xb256_in1k diff --git a/configs/efficientnet/README.md b/configs/efficientnet/README.md new file mode 100644 index 00000000000..846ff564e2b --- /dev/null +++ b/configs/efficientnet/README.md @@ -0,0 +1,61 @@ +# EfficientNet + +> [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946v5) + + +## Abstract + +Convolutional Neural Networks (ConvNets) are commonly developed at a fixed resource budget, and then scaled up for better accuracy if more resources are available. In this paper, we systematically study model scaling and identify that carefully balancing network depth, width, and resolution can lead to better performance. Based on this observation, we propose a new scaling method that uniformly scales all dimensions of depth/width/resolution using a simple yet highly effective compound coefficient. We demonstrate the effectiveness of this method on scaling up MobileNets and ResNet. To go even further, we use neural architecture search to design a new baseline network and scale it up to obtain a family of models, called EfficientNets, which achieve much better accuracy and efficiency than previous ConvNets. In particular, our EfficientNet-B7 achieves state-of-the-art 84.3% top-1 accuracy on ImageNet, while being 8.4x smaller and 6.1x faster on inference than the best existing ConvNet. Our EfficientNets also transfer well and achieve state-of-the-art accuracy on CIFAR-100 (91.7%), Flowers (98.8%), and 3 other transfer learning datasets, with an order of magnitude fewer parameters. + +
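The compound scaling rule summarized in the abstract can be made concrete with a short sketch. The base coefficients below (alpha=1.2, beta=1.1, gamma=1.15) are the values reported in the EfficientNet paper for the B0 baseline; they are quoted only for illustration and are not defined anywhere in this patch.

```python
# Illustrative only: the compound scaling rule from the EfficientNet paper.
# Depth, width and input resolution are scaled jointly by one coefficient phi,
# under the constraint alpha * beta**2 * gamma**2 ~= 2, so FLOPs grow ~2**phi.
alpha, beta, gamma = 1.2, 1.1, 1.15  # paper-reported bases for depth/width/resolution

def compound_scale(phi):
    depth_mult = alpha ** phi        # more layers
    width_mult = beta ** phi         # more channels
    res_mult = gamma ** phi          # larger input images
    flops_mult = (alpha * beta ** 2 * gamma ** 2) ** phi
    return depth_mult, width_mult, res_mult, flops_mult

for phi in range(4):
    d, w, r, f = compound_scale(phi)
    print(f'phi={phi}: depth x{d:.2f}, width x{w:.2f}, res x{r:.2f}, FLOPs x{f:.2f}')
```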
+ +
+ +## Results and models + +### ImageNet-1k + +In the result table, AA means trained with AutoAugment pre-processing, more details can be found in the [paper](https://arxiv.org/abs/1805.09501), and AdvProp is a method to train with adversarial examples, more details can be found in the [paper](https://arxiv.org/abs/1911.09665). + +Note: In MMClassification, we support training with AutoAugment, don't support AdvProp by now. + +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +|:---------------------:|:---------:|:--------:|:---------:|:---------:|:------:|:--------:| +| EfficientNet-B0\* | 5.29 | 0.02 | 76.74 | 93.17 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b0_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32_in1k_20220119-a7e2a0b1.pth) | +| EfficientNet-B0 (AA)\* | 5.29 | 0.02 | 77.26 | 93.41 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b0_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32-aa_in1k_20220119-8d939117.pth) | +| EfficientNet-B0 (AA + AdvProp)\* | 5.29 | 0.02 | 77.53 | 93.61 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b0_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32-aa-advprop_in1k_20220119-26434485.pth) | +| EfficientNet-B1\* | 7.79 | 0.03 | 78.68 | 94.28 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b1_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b1_3rdparty_8xb32_in1k_20220119-002556d9.pth) | +| EfficientNet-B1 (AA)\* | 7.79 | 0.03 | 79.20 | 94.42 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b1_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b1_3rdparty_8xb32-aa_in1k_20220119-619d8ae3.pth) | +| EfficientNet-B1 (AA + AdvProp)\* | 7.79 | 0.03 | 79.52 | 94.43 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b1_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b1_3rdparty_8xb32-aa-advprop_in1k_20220119-5715267d.pth) | +| EfficientNet-B2\* | 9.11 | 0.03 | 79.64 | 94.80 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b2_3rdparty_8xb32_in1k_20220119-ea374a30.pth) | +| EfficientNet-B2 (AA)\* | 9.11 | 0.03 | 80.21 | 94.96 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b2_3rdparty_8xb32-aa_in1k_20220119-dd61e80b.pth) | +| EfficientNet-B2 (AA + AdvProp)\* | 9.11 | 0.03 | 80.45 | 95.07 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b2_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b2_3rdparty_8xb32-aa-advprop_in1k_20220119-1655338a.pth) | +| EfficientNet-B3\* | 12.23 | 0.06 | 81.01 | 95.34 | 
[config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b3_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32_in1k_20220119-4b4d7487.pth) | +| EfficientNet-B3 (AA)\* | 12.23 | 0.06 | 81.58 | 95.67 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b3_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa_in1k_20220119-5b4887a0.pth) | +| EfficientNet-B3 (AA + AdvProp)\* | 12.23 | 0.06 | 81.81 | 95.69 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b3_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa-advprop_in1k_20220119-53b41118.pth) | +| EfficientNet-B4\* | 19.34 | 0.12 | 82.57 | 96.09 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b4_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b4_3rdparty_8xb32_in1k_20220119-81fd4077.pth) | +| EfficientNet-B4 (AA)\* | 19.34 | 0.12 | 82.95 | 96.26 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b4_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b4_3rdparty_8xb32-aa_in1k_20220119-45b8bd2b.pth) | +| EfficientNet-B4 (AA + AdvProp)\* | 19.34 | 0.12 | 83.25 | 96.44 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b4_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b4_3rdparty_8xb32-aa-advprop_in1k_20220119-38c2238c.pth) | +| EfficientNet-B5\* | 30.39 | 0.24 | 83.18 | 96.47 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b5_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b5_3rdparty_8xb32_in1k_20220119-e9814430.pth) | +| EfficientNet-B5 (AA)\* | 30.39 | 0.24 | 83.82 | 96.76 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b5_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b5_3rdparty_8xb32-aa_in1k_20220119-2cab8b78.pth) | +| EfficientNet-B5 (AA + AdvProp)\* | 30.39 | 0.24 | 84.21 | 96.98 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b5_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b5_3rdparty_8xb32-aa-advprop_in1k_20220119-f57a895a.pth) | +| EfficientNet-B6 (AA)\* | 43.04 | 0.41 | 84.05 | 96.82 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b6_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b6_3rdparty_8xb32-aa_in1k_20220119-45b03310.pth) | +| EfficientNet-B6 (AA + AdvProp)\* | 43.04 | 0.41 | 84.74 | 97.14 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b6_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b6_3rdparty_8xb32-aa-advprop_in1k_20220119-bfe3485e.pth) | +| EfficientNet-B7 (AA)\* | 66.35 | 0.72 | 
84.38 | 96.88 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b7_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b7_3rdparty_8xb32-aa_in1k_20220119-bf03951c.pth) | +| EfficientNet-B7 (AA + AdvProp)\* | 66.35 | 0.72 | 85.14 | 97.23 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b7_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b7_3rdparty_8xb32-aa-advprop_in1k_20220119-c6dbff10.pth) | +| EfficientNet-B8 (AA + AdvProp)\* | 87.41 | 1.09 | 85.38 | 97.28 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b8_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b8_3rdparty_8xb32-aa-advprop_in1k_20220119-297ce1b7.pth) | + +*Models with \* are converted from the [official repo](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet). The config files of these models are only for inference. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + +## Citation + +``` +@inproceedings{tan2019efficientnet, + title={Efficientnet: Rethinking model scaling for convolutional neural networks}, + author={Tan, Mingxing and Le, Quoc}, + booktitle={International Conference on Machine Learning}, + pages={6105--6114}, + year={2019}, + organization={PMLR} +} +``` diff --git a/configs/efficientnet/efficientnet-b0_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-b0_8xb32-01norm_in1k.py new file mode 100644 index 00000000000..fbb490d9ad0 --- /dev/null +++ b/configs/efficientnet/efficientnet-b0_8xb32-01norm_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_b0.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=224, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=224, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet/efficientnet-b0_8xb32_in1k.py b/configs/efficientnet/efficientnet-b0_8xb32_in1k.py new file mode 100644 index 00000000000..33931e5f873 --- /dev/null +++ b/configs/efficientnet/efficientnet-b0_8xb32_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_b0.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 
+train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=224, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=224, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet/efficientnet-b1_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-b1_8xb32-01norm_in1k.py new file mode 100644 index 00000000000..6b66395cbf4 --- /dev/null +++ b/configs/efficientnet/efficientnet-b1_8xb32-01norm_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_b1.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=240, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=240, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet/efficientnet-b1_8xb32_in1k.py b/configs/efficientnet/efficientnet-b1_8xb32_in1k.py new file mode 100644 index 00000000000..d702a15016a --- /dev/null +++ b/configs/efficientnet/efficientnet-b1_8xb32_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_b1.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=240, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=240, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + 
val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet/efficientnet-b2_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-b2_8xb32-01norm_in1k.py new file mode 100644 index 00000000000..ae8cda84997 --- /dev/null +++ b/configs/efficientnet/efficientnet-b2_8xb32-01norm_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_b2.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=260, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=260, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet/efficientnet-b2_8xb32_in1k.py b/configs/efficientnet/efficientnet-b2_8xb32_in1k.py new file mode 100644 index 00000000000..53f7c84dca8 --- /dev/null +++ b/configs/efficientnet/efficientnet-b2_8xb32_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_b2.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=260, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=260, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet/efficientnet-b3_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-b3_8xb32-01norm_in1k.py new file mode 100644 index 00000000000..dfd3f92c75c --- /dev/null +++ b/configs/efficientnet/efficientnet-b3_8xb32-01norm_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_b3.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + 
size=300, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=300, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet/efficientnet-b3_8xb32_in1k.py b/configs/efficientnet/efficientnet-b3_8xb32_in1k.py new file mode 100644 index 00000000000..28387138c41 --- /dev/null +++ b/configs/efficientnet/efficientnet-b3_8xb32_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_b3.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=300, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=300, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet/efficientnet-b4_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-b4_8xb32-01norm_in1k.py new file mode 100644 index 00000000000..333a19ac851 --- /dev/null +++ b/configs/efficientnet/efficientnet-b4_8xb32-01norm_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_b4.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=380, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=380, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git 
a/configs/efficientnet/efficientnet-b4_8xb32_in1k.py b/configs/efficientnet/efficientnet-b4_8xb32_in1k.py new file mode 100644 index 00000000000..82f06cde100 --- /dev/null +++ b/configs/efficientnet/efficientnet-b4_8xb32_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_b4.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=380, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=380, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet/efficientnet-b5_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-b5_8xb32-01norm_in1k.py new file mode 100644 index 00000000000..f66855c516f --- /dev/null +++ b/configs/efficientnet/efficientnet-b5_8xb32-01norm_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_b5.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=456, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=456, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet/efficientnet-b5_8xb32_in1k.py b/configs/efficientnet/efficientnet-b5_8xb32_in1k.py new file mode 100644 index 00000000000..9b0eaab010b --- /dev/null +++ b/configs/efficientnet/efficientnet-b5_8xb32_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_b5.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=456, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', 
flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=456, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet/efficientnet-b6_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-b6_8xb32-01norm_in1k.py new file mode 100644 index 00000000000..da64e0ec8ea --- /dev/null +++ b/configs/efficientnet/efficientnet-b6_8xb32-01norm_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_b6.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=528, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=528, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet/efficientnet-b6_8xb32_in1k.py b/configs/efficientnet/efficientnet-b6_8xb32_in1k.py new file mode 100644 index 00000000000..6e03bb4cfd6 --- /dev/null +++ b/configs/efficientnet/efficientnet-b6_8xb32_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_b6.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=528, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=528, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet/efficientnet-b7_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-b7_8xb32-01norm_in1k.py 
new file mode 100644 index 00000000000..27c19fc709b --- /dev/null +++ b/configs/efficientnet/efficientnet-b7_8xb32-01norm_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_b7.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=600, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=600, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet/efficientnet-b7_8xb32_in1k.py b/configs/efficientnet/efficientnet-b7_8xb32_in1k.py new file mode 100644 index 00000000000..5146383e645 --- /dev/null +++ b/configs/efficientnet/efficientnet-b7_8xb32_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_b7.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=600, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=600, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet/efficientnet-b8_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-b8_8xb32-01norm_in1k.py new file mode 100644 index 00000000000..25540a1a604 --- /dev/null +++ b/configs/efficientnet/efficientnet-b8_8xb32-01norm_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_b8.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=672, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', 
keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=672, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet/efficientnet-b8_8xb32_in1k.py b/configs/efficientnet/efficientnet-b8_8xb32_in1k.py new file mode 100644 index 00000000000..4ff28c01b23 --- /dev/null +++ b/configs/efficientnet/efficientnet-b8_8xb32_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_b8.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=672, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=672, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet/efficientnet-em_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-em_8xb32-01norm_in1k.py new file mode 100644 index 00000000000..faa53862125 --- /dev/null +++ b/configs/efficientnet/efficientnet-em_8xb32-01norm_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_em.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=240, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=240, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet/efficientnet-es_8xb32-01norm_in1k.py b/configs/efficientnet/efficientnet-es_8xb32-01norm_in1k.py new file mode 100644 index 00000000000..5f11746fc99 --- /dev/null +++ 
b/configs/efficientnet/efficientnet-es_8xb32-01norm_in1k.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/efficientnet_es.py', + '../_base_/datasets/imagenet_bs32.py', + '../_base_/schedules/imagenet_bs256.py', + '../_base_/default_runtime.py', +] + +# dataset settings +dataset_type = 'ImageNet' +img_norm_cfg = dict( + mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='RandomResizedCrop', + size=224, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_label']), + dict(type='Collect', keys=['img', 'gt_label']) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='CenterCrop', + crop_size=224, + efficientnet_style=True, + interpolation='bicubic'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/efficientnet/metafile.yml b/configs/efficientnet/metafile.yml new file mode 100644 index 00000000000..cdb8f424f7f --- /dev/null +++ b/configs/efficientnet/metafile.yml @@ -0,0 +1,391 @@ +Collections: + - Name: EfficientNet + Metadata: + Training Data: ImageNet-1k + Architecture: + - 1x1 Convolution + - Average Pooling + - Convolution + - Dense Connections + - Dropout + - Inverted Residual Block + - RMSProp + - Squeeze-and-Excitation Block + - Swish + Paper: + URL: https://arxiv.org/abs/1905.11946v5 + Title: "EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks" + README: configs/efficientnet/README.md + Code: + Version: v0.20.0 + URL: https://github.com/open-mmlab/mmclassification/blob/v0.20.0/mmcls/models/backbones/efficientnet.py + +Models: + - Name: efficientnet-b0_3rdparty_8xb32_in1k + Metadata: + FLOPs: 16481180 + Parameters: 5288548 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 76.74 + Top 5 Accuracy: 93.17 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32_in1k_20220119-a7e2a0b1.pth + Config: configs/efficientnet/efficientnet-b0_8xb32_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/ckpts/efficientnet-b0.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b0_3rdparty_8xb32-aa_in1k + Metadata: + FLOPs: 16481180 + Parameters: 5288548 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 77.26 + Top 5 Accuracy: 93.41 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32-aa_in1k_20220119-8d939117.pth + Config: configs/efficientnet/efficientnet-b0_8xb32_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/ckptsaug/efficientnet-b0.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b0_3rdparty_8xb32-aa-advprop_in1k + Metadata: + FLOPs: 16481180 + Parameters: 5288548 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 77.53 + Top 5 Accuracy: 93.61 + 
Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32-aa-advprop_in1k_20220119-26434485.pth + Config: configs/efficientnet/efficientnet-b0_8xb32-01norm_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/advprop/efficientnet-b0.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b1_3rdparty_8xb32_in1k + Metadata: + FLOPs: 27052224 + Parameters: 7794184 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 78.68 + Top 5 Accuracy: 94.28 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b1_3rdparty_8xb32_in1k_20220119-002556d9.pth + Config: configs/efficientnet/efficientnet-b1_8xb32_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/ckpts/efficientnet-b1.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b1_3rdparty_8xb32-aa_in1k + Metadata: + FLOPs: 27052224 + Parameters: 7794184 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 79.20 + Top 5 Accuracy: 94.42 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b1_3rdparty_8xb32-aa_in1k_20220119-619d8ae3.pth + Config: configs/efficientnet/efficientnet-b1_8xb32_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/ckptsaug/efficientnet-b1.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b1_3rdparty_8xb32-aa-advprop_in1k + Metadata: + FLOPs: 27052224 + Parameters: 7794184 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 79.52 + Top 5 Accuracy: 94.43 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b1_3rdparty_8xb32-aa-advprop_in1k_20220119-5715267d.pth + Config: configs/efficientnet/efficientnet-b1_8xb32-01norm_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/advprop/efficientnet-b1.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b2_3rdparty_8xb32_in1k + Metadata: + FLOPs: 34346386 + Parameters: 9109994 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 79.64 + Top 5 Accuracy: 94.80 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b2_3rdparty_8xb32_in1k_20220119-ea374a30.pth + Config: configs/efficientnet/efficientnet-b2_8xb32_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/ckpts/efficientnet-b2.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b2_3rdparty_8xb32-aa_in1k + Metadata: + FLOPs: 34346386 + Parameters: 9109994 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 80.21 + Top 5 Accuracy: 94.96 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b2_3rdparty_8xb32-aa_in1k_20220119-dd61e80b.pth + Config: 
configs/efficientnet/efficientnet-b2_8xb32_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/ckptsaug/efficientnet-b2.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b2_3rdparty_8xb32-aa-advprop_in1k + Metadata: + FLOPs: 34346386 + Parameters: 9109994 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 80.45 + Top 5 Accuracy: 95.07 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b2_3rdparty_8xb32-aa-advprop_in1k_20220119-1655338a.pth + Config: configs/efficientnet/efficientnet-b2_8xb32-01norm_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/advprop/efficientnet-b2.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b3_3rdparty_8xb32_in1k + Metadata: + FLOPs: 58641904 + Parameters: 12233232 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 81.01 + Top 5 Accuracy: 95.34 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32_in1k_20220119-4b4d7487.pth + Config: configs/efficientnet/efficientnet-b3_8xb32_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/ckpts/efficientnet-b3.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b3_3rdparty_8xb32-aa_in1k + Metadata: + FLOPs: 58641904 + Parameters: 12233232 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 81.58 + Top 5 Accuracy: 95.67 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa_in1k_20220119-5b4887a0.pth + Config: configs/efficientnet/efficientnet-b3_8xb32_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/ckptsaug/efficientnet-b3.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b3_3rdparty_8xb32-aa-advprop_in1k + Metadata: + FLOPs: 58641904 + Parameters: 12233232 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 81.81 + Top 5 Accuracy: 95.69 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa-advprop_in1k_20220119-53b41118.pth + Config: configs/efficientnet/efficientnet-b3_8xb32-01norm_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/advprop/efficientnet-b3.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b4_3rdparty_8xb32_in1k + Metadata: + FLOPs: 121870624 + Parameters: 19341616 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 82.57 + Top 5 Accuracy: 96.09 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b4_3rdparty_8xb32_in1k_20220119-81fd4077.pth + Config: configs/efficientnet/efficientnet-b4_8xb32_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/ckpts/efficientnet-b4.tar.gz + Code: 
https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b4_3rdparty_8xb32-aa_in1k + Metadata: + FLOPs: 121870624 + Parameters: 19341616 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 82.95 + Top 5 Accuracy: 96.26 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b4_3rdparty_8xb32-aa_in1k_20220119-45b8bd2b.pth + Config: configs/efficientnet/efficientnet-b4_8xb32_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/ckptsaug/efficientnet-b4.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b4_3rdparty_8xb32-aa-advprop_in1k + Metadata: + FLOPs: 121870624 + Parameters: 19341616 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 83.25 + Top 5 Accuracy: 96.44 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b4_3rdparty_8xb32-aa-advprop_in1k_20220119-38c2238c.pth + Config: configs/efficientnet/efficientnet-b4_8xb32-01norm_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/advprop/efficientnet-b4.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b5_3rdparty_8xb32_in1k + Metadata: + FLOPs: 243879440 + Parameters: 30389784 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 83.18 + Top 5 Accuracy: 96.47 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b5_3rdparty_8xb32_in1k_20220119-e9814430.pth + Config: configs/efficientnet/efficientnet-b5_8xb32_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/ckpts/efficientnet-b5.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b5_3rdparty_8xb32-aa_in1k + Metadata: + FLOPs: 243879440 + Parameters: 30389784 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 83.82 + Top 5 Accuracy: 96.76 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b5_3rdparty_8xb32-aa_in1k_20220119-2cab8b78.pth + Config: configs/efficientnet/efficientnet-b5_8xb32_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/ckptsaug/efficientnet-b5.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b5_3rdparty_8xb32-aa-advprop_in1k + Metadata: + FLOPs: 243879440 + Parameters: 30389784 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 84.21 + Top 5 Accuracy: 96.98 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b5_3rdparty_8xb32-aa-advprop_in1k_20220119-f57a895a.pth + Config: configs/efficientnet/efficientnet-b5_8xb32-01norm_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/advprop/efficientnet-b5.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b6_3rdparty_8xb32-aa_in1k + Metadata: + FLOPs: 412002408 + Parameters: 43040704 + In 
Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 84.05 + Top 5 Accuracy: 96.82 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b6_3rdparty_8xb32-aa_in1k_20220119-45b03310.pth + Config: configs/efficientnet/efficientnet-b6_8xb32_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/ckptsaug/efficientnet-b6.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b6_3rdparty_8xb32-aa-advprop_in1k + Metadata: + FLOPs: 412002408 + Parameters: 43040704 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 84.74 + Top 5 Accuracy: 97.14 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b6_3rdparty_8xb32-aa-advprop_in1k_20220119-bfe3485e.pth + Config: configs/efficientnet/efficientnet-b6_8xb32-01norm_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/advprop/efficientnet-b6.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b7_3rdparty_8xb32-aa_in1k + Metadata: + FLOPs: 715526512 + Parameters: 66347960 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 84.38 + Top 5 Accuracy: 96.88 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b7_3rdparty_8xb32-aa_in1k_20220119-bf03951c.pth + Config: configs/efficientnet/efficientnet-b7_8xb32_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/ckptsaug/efficientnet-b7.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b7_3rdparty_8xb32-aa-advprop_in1k + Metadata: + FLOPs: 715526512 + Parameters: 66347960 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 85.14 + Top 5 Accuracy: 97.23 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b7_3rdparty_8xb32-aa-advprop_in1k_20220119-c6dbff10.pth + Config: configs/efficientnet/efficientnet-b7_8xb32-01norm_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/advprop/efficientnet-b7.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet + - Name: efficientnet-b8_3rdparty_8xb32-aa-advprop_in1k + Metadata: + FLOPs: 1092755326 + Parameters: 87413142 + In Collections: EfficientNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 85.38 + Top 5 Accuracy: 97.28 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b8_3rdparty_8xb32-aa-advprop_in1k_20220119-297ce1b7.pth + Config: configs/efficientnet/efficientnet-b8_8xb32-01norm_in1k.py + Converted From: + Weights: https://storage.googleapis.com/cloud-tpu-checkpoints/efficientnet/advprop/efficientnet-b8.tar.gz + Code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet diff --git a/configs/hrnet/README.md b/configs/hrnet/README.md new file mode 100644 index 00000000000..e3144cc4a23 --- /dev/null +++ b/configs/hrnet/README.md @@ -0,0 +1,43 @@ +# HRNet + +> [Deep High-Resolution Representation Learning for Visual 
Recognition](https://arxiv.org/abs/1908.07919v2) + + +## Abstract + +High-resolution representations are essential for position-sensitive vision problems, such as human pose estimation, semantic segmentation, and object detection. Existing state-of-the-art frameworks first encode the input image as a low-resolution representation through a subnetwork that is formed by connecting high-to-low resolution convolutions *in series* (e.g., ResNet, VGGNet), and then recover the high-resolution representation from the encoded low-resolution representation. Instead, our proposed network, named as High-Resolution Network (HRNet), maintains high-resolution representations through the whole process. There are two key characteristics: (i) Connect the high-to-low resolution convolution streams *in parallel*; (ii) Repeatedly exchange the information across resolutions. The benefit is that the resulting representation is semantically richer and spatially more precise. We show the superiority of the proposed HRNet in a wide range of applications, including human pose estimation, semantic segmentation, and object detection, suggesting that the HRNet is a stronger backbone for computer vision problems. + +
+ +
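+The two key characteristics above (parallel multi-resolution streams and repeated
+cross-resolution exchange) can be sketched with a toy two-branch block. The channel
+widths and layer choices below are illustrative assumptions, not the actual HRNet
+backbone implementation behind the configs in this folder.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class TwoBranchExchange(nn.Module):
+    """Didactic sketch: keep a high- and a half-resolution branch in parallel
+    and fuse information across them once per step (HRNet repeats this)."""
+
+    def __init__(self, channels_hi=32, channels_lo=64):
+        super().__init__()
+        self.conv_hi = nn.Conv2d(channels_hi, channels_hi, 3, padding=1)
+        self.conv_lo = nn.Conv2d(channels_lo, channels_lo, 3, padding=1)
+        # high -> low: strided conv downsamples and matches channels
+        self.hi_to_lo = nn.Conv2d(channels_hi, channels_lo, 3, stride=2, padding=1)
+        # low -> high: 1x1 conv matches channels, then bilinear upsampling
+        self.lo_to_hi = nn.Conv2d(channels_lo, channels_hi, 1)
+
+    def forward(self, x_hi, x_lo):
+        x_hi = F.relu(self.conv_hi(x_hi))
+        x_lo = F.relu(self.conv_lo(x_lo))
+        up = F.interpolate(self.lo_to_hi(x_lo), size=x_hi.shape[2:],
+                           mode='bilinear', align_corners=False)
+        return x_hi + up, x_lo + self.hi_to_lo(x_hi)
+
+
+hi, lo = TwoBranchExchange()(torch.randn(1, 32, 56, 56), torch.randn(1, 64, 28, 28))
+print(hi.shape, lo.shape)  # torch.Size([1, 32, 56, 56]) torch.Size([1, 64, 28, 28])
+```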
+ +## Results and models + +## ImageNet-1k + +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +|:---------------------:|:---------:|:--------:|:---------:|:---------:|:------:|:--------:| +| HRNet-W18\* | 21.30 | 4.33 | 76.75 | 93.44 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w18_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w18_3rdparty_8xb32_in1k_20220120-0c10b180.pth) | +| HRNet-W30\* | 37.71 | 8.17 | 78.19 | 94.22 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w30_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w30_3rdparty_8xb32_in1k_20220120-8aa3832f.pth) | +| HRNet-W32\* | 41.23 | 8.99 | 78.44 | 94.19 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w32_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w32_3rdparty_8xb32_in1k_20220120-c394f1ab.pth) | +| HRNet-W40\* | 57.55 | 12.77 | 78.94 | 94.47 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w40_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w40_3rdparty_8xb32_in1k_20220120-9a2dbfc5.pth) | +| HRNet-W44\* | 67.06 | 14.96 | 78.88 | 94.37 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w44_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w44_3rdparty_8xb32_in1k_20220120-35d07f73.pth) | +| HRNet-W48\* | 77.47 | 17.36 | 79.32 | 94.52 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w48_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w48_3rdparty_8xb32_in1k_20220120-e555ef50.pth) | +| HRNet-W64\* | 128.06 | 29.00 | 79.46 | 94.65 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w64_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w64_3rdparty_8xb32_in1k_20220120-19126642.pth) | +| HRNet-W18 (ssld)\* | 21.30 | 4.33 | 81.06 | 95.70 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w18_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w18_3rdparty_8xb32-ssld_in1k_20220120-455f69ea.pth) | +| HRNet-W48 (ssld)\* | 77.47 | 17.36 | 83.63 | 96.79 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w48_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w48_3rdparty_8xb32-ssld_in1k_20220120-d0459c38.pth) | + +*Models with \* are converted from the [official repo](https://github.com/HRNet/HRNet-Image-Classification). The config files of these models are only for inference. 
We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + +## Citation + +``` +@article{WangSCJDZLMTWLX19, + title={Deep High-Resolution Representation Learning for Visual Recognition}, + author={Jingdong Wang and Ke Sun and Tianheng Cheng and + Borui Jiang and Chaorui Deng and Yang Zhao and Dong Liu and Yadong Mu and + Mingkui Tan and Xinggang Wang and Wenyu Liu and Bin Xiao}, + journal = {TPAMI} + year={2019} +} +``` diff --git a/configs/hrnet/hrnet-w18_4xb32_in1k.py b/configs/hrnet/hrnet-w18_4xb32_in1k.py new file mode 100644 index 00000000000..a84fe67fb6c --- /dev/null +++ b/configs/hrnet/hrnet-w18_4xb32_in1k.py @@ -0,0 +1,6 @@ +_base_ = [ + '../_base_/models/hrnet/hrnet-w18.py', + '../_base_/datasets/imagenet_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs256_coslr.py', + '../_base_/default_runtime.py' +] diff --git a/configs/hrnet/hrnet-w30_4xb32_in1k.py b/configs/hrnet/hrnet-w30_4xb32_in1k.py new file mode 100644 index 00000000000..d2a9c0ddbe3 --- /dev/null +++ b/configs/hrnet/hrnet-w30_4xb32_in1k.py @@ -0,0 +1,6 @@ +_base_ = [ + '../_base_/models/hrnet/hrnet-w30.py', + '../_base_/datasets/imagenet_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs256_coslr.py', + '../_base_/default_runtime.py' +] diff --git a/configs/hrnet/hrnet-w32_4xb32_in1k.py b/configs/hrnet/hrnet-w32_4xb32_in1k.py new file mode 100644 index 00000000000..91380a965b8 --- /dev/null +++ b/configs/hrnet/hrnet-w32_4xb32_in1k.py @@ -0,0 +1,6 @@ +_base_ = [ + '../_base_/models/hrnet/hrnet-w32.py', + '../_base_/datasets/imagenet_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs256_coslr.py', + '../_base_/default_runtime.py' +] diff --git a/configs/hrnet/hrnet-w40_4xb32_in1k.py b/configs/hrnet/hrnet-w40_4xb32_in1k.py new file mode 100644 index 00000000000..5d35cecd76f --- /dev/null +++ b/configs/hrnet/hrnet-w40_4xb32_in1k.py @@ -0,0 +1,6 @@ +_base_ = [ + '../_base_/models/hrnet/hrnet-w40.py', + '../_base_/datasets/imagenet_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs256_coslr.py', + '../_base_/default_runtime.py' +] diff --git a/configs/hrnet/hrnet-w44_4xb32_in1k.py b/configs/hrnet/hrnet-w44_4xb32_in1k.py new file mode 100644 index 00000000000..ce6bb41ac05 --- /dev/null +++ b/configs/hrnet/hrnet-w44_4xb32_in1k.py @@ -0,0 +1,6 @@ +_base_ = [ + '../_base_/models/hrnet/hrnet-w44.py', + '../_base_/datasets/imagenet_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs256_coslr.py', + '../_base_/default_runtime.py' +] diff --git a/configs/hrnet/hrnet-w48_4xb32_in1k.py b/configs/hrnet/hrnet-w48_4xb32_in1k.py new file mode 100644 index 00000000000..6943892e6d2 --- /dev/null +++ b/configs/hrnet/hrnet-w48_4xb32_in1k.py @@ -0,0 +1,6 @@ +_base_ = [ + '../_base_/models/hrnet/hrnet-w48.py', + '../_base_/datasets/imagenet_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs256_coslr.py', + '../_base_/default_runtime.py' +] diff --git a/configs/hrnet/hrnet-w64_4xb32_in1k.py b/configs/hrnet/hrnet-w64_4xb32_in1k.py new file mode 100644 index 00000000000..0009bc67b0c --- /dev/null +++ b/configs/hrnet/hrnet-w64_4xb32_in1k.py @@ -0,0 +1,6 @@ +_base_ = [ + '../_base_/models/hrnet/hrnet-w64.py', + '../_base_/datasets/imagenet_bs32_pil_resize.py', + '../_base_/schedules/imagenet_bs256_coslr.py', + '../_base_/default_runtime.py' +] diff --git a/configs/hrnet/metafile.yml b/configs/hrnet/metafile.yml new file mode 100644 index 00000000000..a95261481e5 --- /dev/null +++ b/configs/hrnet/metafile.yml @@ -0,0 +1,162 @@ +Collections: + - Name: HRNet + 
Metadata: + Training Data: ImageNet-1k + Architecture: + - Batch Normalization + - Convolution + - ReLU + - Residual Connection + Paper: + URL: https://arxiv.org/abs/1908.07919v2 + Title: "Deep High-Resolution Representation Learning for Visual Recognition" + README: configs/hrnet/README.md + Code: + URL: https://github.com/open-mmlab/mmclassification/blob/v0.20.0/mmcls/models/backbones/hrnet.py + Version: v0.20.0 + +Models: + - Name: hrnet-w18_3rdparty_8xb32_in1k + Metadata: + FLOPs: 4330397932 + Parameters: 21295164 + In Collection: HRNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 76.75 + Top 5 Accuracy: 93.44 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w18_3rdparty_8xb32_in1k_20220120-0c10b180.pth + Config: configs/hrnet/hrnet-w18_4xb32_in1k.py + Converted From: + Weights: https://1drv.ms/u/s!Aus8VCZ_C_33cMkPimlmClRvmpw + Code: https://github.com/HRNet/HRNet-Image-Classification + - Name: hrnet-w30_3rdparty_8xb32_in1k + Metadata: + FLOPs: 8168305684 + Parameters: 37708380 + In Collection: HRNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 78.19 + Top 5 Accuracy: 94.22 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w30_3rdparty_8xb32_in1k_20220120-8aa3832f.pth + Config: configs/hrnet/hrnet-w30_4xb32_in1k.py + Converted From: + Weights: https://1drv.ms/u/s!Aus8VCZ_C_33cQoACCEfrzcSaVI + Code: https://github.com/HRNet/HRNet-Image-Classification + - Name: hrnet-w32_3rdparty_8xb32_in1k + Metadata: + FLOPs: 8986267584 + Parameters: 41228840 + In Collection: HRNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 78.44 + Top 5 Accuracy: 94.19 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w32_3rdparty_8xb32_in1k_20220120-c394f1ab.pth + Config: configs/hrnet/hrnet-w32_4xb32_in1k.py + Converted From: + Weights: https://1drv.ms/u/s!Aus8VCZ_C_33dYBMemi9xOUFR0w + Code: https://github.com/HRNet/HRNet-Image-Classification + - Name: hrnet-w40_3rdparty_8xb32_in1k + Metadata: + FLOPs: 12767574064 + Parameters: 57553320 + In Collection: HRNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 78.94 + Top 5 Accuracy: 94.47 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w40_3rdparty_8xb32_in1k_20220120-9a2dbfc5.pth + Config: configs/hrnet/hrnet-w40_4xb32_in1k.py + Converted From: + Weights: https://1drv.ms/u/s!Aus8VCZ_C_33ck0gvo5jfoWBOPo + Code: https://github.com/HRNet/HRNet-Image-Classification + - Name: hrnet-w44_3rdparty_8xb32_in1k + Metadata: + FLOPs: 14963902632 + Parameters: 67061144 + In Collection: HRNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 78.88 + Top 5 Accuracy: 94.37 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w44_3rdparty_8xb32_in1k_20220120-35d07f73.pth + Config: configs/hrnet/hrnet-w44_4xb32_in1k.py + Converted From: + Weights: https://1drv.ms/u/s!Aus8VCZ_C_33czZQ0woUb980gRs + Code: https://github.com/HRNet/HRNet-Image-Classification + - Name: hrnet-w48_3rdparty_8xb32_in1k + Metadata: + FLOPs: 17364014752 + Parameters: 77466024 + In Collection: HRNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 79.32 + Top 5 Accuracy: 94.52 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w48_3rdparty_8xb32_in1k_20220120-e555ef50.pth + Config: 
configs/hrnet/hrnet-w48_4xb32_in1k.py + Converted From: + Weights: https://1drv.ms/u/s!Aus8VCZ_C_33dKvqI6pBZlifgJk + Code: https://github.com/HRNet/HRNet-Image-Classification + - Name: hrnet-w64_3rdparty_8xb32_in1k + Metadata: + FLOPs: 29002298752 + Parameters: 128056104 + In Collection: HRNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 79.46 + Top 5 Accuracy: 94.65 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w64_3rdparty_8xb32_in1k_20220120-19126642.pth + Config: configs/hrnet/hrnet-w64_4xb32_in1k.py + Converted From: + Weights: https://1drv.ms/u/s!Aus8VCZ_C_33gQbJsUPTIj3rQu99 + Code: https://github.com/HRNet/HRNet-Image-Classification + - Name: hrnet-w18_3rdparty_8xb32-ssld_in1k + Metadata: + FLOPs: 4330397932 + Parameters: 21295164 + In Collection: HRNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 81.06 + Top 5 Accuracy: 95.7 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w18_3rdparty_8xb32-ssld_in1k_20220120-455f69ea.pth + Config: configs/hrnet/hrnet-w18_4xb32_in1k.py + Converted From: + Weights: https://github.com/HRNet/HRNet-Image-Classification/releases/download/PretrainedWeights/HRNet_W18_C_ssld_pretrained.pth + Code: https://github.com/HRNet/HRNet-Image-Classification + - Name: hrnet-w48_3rdparty_8xb32-ssld_in1k + Metadata: + FLOPs: 17364014752 + Parameters: 77466024 + In Collection: HRNet + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 83.63 + Top 5 Accuracy: 96.79 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w48_3rdparty_8xb32-ssld_in1k_20220120-d0459c38.pth + Config: configs/hrnet/hrnet-w48_4xb32_in1k.py + Converted From: + Weights: https://github.com/HRNet/HRNet-Image-Classification/releases/download/PretrainedWeights/HRNet_W48_C_ssld_pretrained.pth + Code: https://github.com/HRNet/HRNet-Image-Classification diff --git a/configs/lenet/README.md b/configs/lenet/README.md index a19f0b8bb38..241bedab173 100644 --- a/configs/lenet/README.md +++ b/configs/lenet/README.md @@ -1,19 +1,19 @@ -# Backpropagation Applied to Handwritten Zip Code Recognition - +# LeNet + +> [Backpropagation Applied to Handwritten Zip Code Recognition](https://ieeexplore.ieee.org/document/6795724) ## Abstract - The ability of learning networks to generalize can be greatly enhanced by providing constraints from the task domain. This paper demonstrates how such constraints can be integrated into a backpropagation network through the architecture of the network. This approach has been successfully applied to the recognition of handwritten zip code digits provided by the U.S. Postal Service. A single network learns the entire recognition operation, going from the normalized image of the character to the final classification. -
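+As a rough illustration of such an end-to-end network (normalized character image in,
+class scores out), a small LeNet-style model can be written in a few lines of PyTorch.
+The layer sizes below are assumptions for a 32x32 grayscale input and do not reproduce
+the exact 1989 architecture or the LeNet implementation in this repository.
+
+```python
+import torch
+import torch.nn as nn
+
+
+class LeNetSketch(nn.Module):
+    """Illustrative LeNet-style network: conv/pool feature extractor followed
+    by fully connected layers producing one score per digit class."""
+
+    def __init__(self, num_classes=10):
+        super().__init__()
+        self.features = nn.Sequential(
+            nn.Conv2d(1, 6, kernel_size=5), nn.Tanh(), nn.AvgPool2d(2),   # 32 -> 28 -> 14
+            nn.Conv2d(6, 16, kernel_size=5), nn.Tanh(), nn.AvgPool2d(2),  # 14 -> 10 -> 5
+        )
+        self.classifier = nn.Sequential(
+            nn.Flatten(),
+            nn.Linear(16 * 5 * 5, 120), nn.Tanh(),
+            nn.Linear(120, 84), nn.Tanh(),
+            nn.Linear(84, num_classes),
+        )
+
+    def forward(self, x):
+        return self.classifier(self.features(x))
+
+
+logits = LeNetSketch()(torch.randn(1, 1, 32, 32))
+print(logits.shape)  # torch.Size([1, 10])
+```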
## Citation -```latex + +``` @ARTICLE{6795724, author={Y. {LeCun} and B. {Boser} and J. S. {Denker} and D. {Henderson} and R. E. {Howard} and W. {Hubbard} and L. D. {Jackel}}, journal={Neural Computation}, diff --git a/configs/mlp_mixer/README.md b/configs/mlp_mixer/README.md index 17f7ec1540f..dc8866e8447 100644 --- a/configs/mlp_mixer/README.md +++ b/configs/mlp_mixer/README.md @@ -1,28 +1,16 @@ -# MLP-Mixer: An all-MLP Architecture for Vision - +# Mlp-Mixer + +> [MLP-Mixer: An all-MLP Architecture for Vision](https://arxiv.org/abs/2105.01601) ## Abstract - + Convolutional Neural Networks (CNNs) are the go-to model for computer vision. Recently, attention-based networks, such as the Vision Transformer, have also become popular. In this paper we show that while convolutions and attention are both sufficient for good performance, neither of them are necessary. We present MLP-Mixer, an architecture based exclusively on multi-layer perceptrons (MLPs). MLP-Mixer contains two types of layers: one with MLPs applied independently to image patches (i.e. "mixing" the per-location features), and one with MLPs applied across patches (i.e. "mixing" spatial information). When trained on large datasets, or with modern regularization schemes, MLP-Mixer attains competitive scores on image classification benchmarks, with pre-training and inference cost comparable to state-of-the-art models. We hope that these results spark further research beyond the realms of well established CNNs and Transformers. -
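+The two mixing steps described above (an MLP applied across patches, then an MLP
+applied across channels) can be sketched as a single Mixer block in PyTorch. The
+patch count and hidden sizes are arbitrary assumptions, and this is not the
+MLP-Mixer backbone implementation shipped in this repository.
+
+```python
+import torch
+import torch.nn as nn
+
+
+class MixerBlockSketch(nn.Module):
+    """One Mixer block: token-mixing MLP over the patch dimension, then
+    channel-mixing MLP over the feature dimension, each with a residual."""
+
+    def __init__(self, num_patches=196, channels=768,
+                 tokens_hidden=384, channels_hidden=3072):
+        super().__init__()
+        self.norm1 = nn.LayerNorm(channels)
+        self.token_mlp = nn.Sequential(
+            nn.Linear(num_patches, tokens_hidden), nn.GELU(),
+            nn.Linear(tokens_hidden, num_patches))
+        self.norm2 = nn.LayerNorm(channels)
+        self.channel_mlp = nn.Sequential(
+            nn.Linear(channels, channels_hidden), nn.GELU(),
+            nn.Linear(channels_hidden, channels))
+
+    def forward(self, x):                           # x: (batch, patches, channels)
+        y = self.norm1(x).transpose(1, 2)           # (batch, channels, patches)
+        x = x + self.token_mlp(y).transpose(1, 2)   # mix information across patches
+        x = x + self.channel_mlp(self.norm2(x))     # mix channels within each patch
+        return x
+
+
+out = MixerBlockSketch()(torch.randn(2, 196, 768))
+print(out.shape)  # torch.Size([2, 196, 768])
+```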
-## Citation -```latex -@misc{tolstikhin2021mlpmixer, - title={MLP-Mixer: An all-MLP Architecture for Vision}, - author={Ilya Tolstikhin and Neil Houlsby and Alexander Kolesnikov and Lucas Beyer and Xiaohua Zhai and Thomas Unterthiner and Jessica Yung and Andreas Steiner and Daniel Keysers and Jakob Uszkoreit and Mario Lucic and Alexey Dosovitskiy}, - year={2021}, - eprint={2105.01601}, - archivePrefix={arXiv}, - primaryClass={cs.CV} -} -``` - ## Results and models ### ImageNet-1k @@ -33,3 +21,16 @@ Convolutional Neural Networks (CNNs) are the go-to model for computer vision. Re | Mixer-L/16\* | 208.2 | 44.57 | 72.34 | 88.02 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mlp_mixer/mlp-mixer-large-p16_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mlp-mixer/mixer-large-p16_3rdparty_64xb64_in1k_20211124-5a2519d2.pth) | *Models with \* are converted from [timm](https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/mlp_mixer.py). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + +## Citation + +``` +@misc{tolstikhin2021mlpmixer, + title={MLP-Mixer: An all-MLP Architecture for Vision}, + author={Ilya Tolstikhin and Neil Houlsby and Alexander Kolesnikov and Lucas Beyer and Xiaohua Zhai and Thomas Unterthiner and Jessica Yung and Andreas Steiner and Daniel Keysers and Jakob Uszkoreit and Mario Lucic and Alexey Dosovitskiy}, + year={2021}, + eprint={2105.01601}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/configs/mlp_mixer/metafile.yml b/configs/mlp_mixer/metafile.yml index 7c709d87f16..e8efa085013 100644 --- a/configs/mlp_mixer/metafile.yml +++ b/configs/mlp_mixer/metafile.yml @@ -10,9 +10,9 @@ Collections: URL: https://arxiv.org/abs/2105.01601 Title: "MLP-Mixer: An all-MLP Architecture for Vision" README: configs/mlp_mixer/README.md -# Code: -# URL: # todo -# Version: # todo + Code: + URL: https://github.com/open-mmlab/mmclassification/blob/v0.18.0/mmcls/models/backbones/mlp_mixer.py + Version: v0.18.0 Models: - Name: mlp-mixer-base-p16_3rdparty_64xb64_in1k diff --git a/configs/mobilenet_v2/README.md b/configs/mobilenet_v2/README.md index 6f22123f5fa..9a0cd8a6549 100644 --- a/configs/mobilenet_v2/README.md +++ b/configs/mobilenet_v2/README.md @@ -1,20 +1,29 @@ -# MobileNetV2: Inverted Residuals and Linear Bottlenecks - +# MobileNet V2 + +> [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) ## Abstract - + In this paper we describe a new mobile architecture, MobileNetV2, that improves the state of the art performance of mobile models on multiple tasks and benchmarks as well as across a spectrum of different model sizes. We also describe efficient ways of applying these mobile models to object detection in a novel framework we call SSDLite. Additionally, we demonstrate how to build mobile semantic segmentation models through a reduced form of DeepLabv3 which we call Mobile DeepLabv3. The MobileNetV2 architecture is based on an inverted residual structure where the input and output of the residual block are thin bottleneck layers opposite to traditional residual models which use expanded representations in the input an MobileNetV2 uses lightweight depthwise convolutions to filter features in the intermediate expansion layer. 
Additionally, we find that it is important to remove non-linearities in the narrow layers in order to maintain representational power. We demonstrate that this improves performance and provide an intuition that led to this design. Finally, our approach allows decoupling of the input/output domains from the expressiveness of the transformation, which provides a convenient framework for further analysis. We measure our performance on Imagenet classification, COCO object detection, VOC image segmentation. We evaluate the trade-offs between accuracy, and number of operations measured by multiply-adds (MAdd), as well as the number of parameters -
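A minimal sketch of the inverted residual block described above may help; the layer layout follows the paper (1x1 expansion with ReLU6, depthwise 3x3, linear 1x1 projection), while the channel numbers and the omission of the full stage/stride logic are illustrative only.

```python
import torch
import torch.nn as nn

class InvertedResidual(nn.Module):
    """Sketch of a MobileNetV2 inverted residual: expand -> depthwise -> linear project."""

    def __init__(self, in_ch, out_ch, stride=1, expand_ratio=6):
        super().__init__()
        hidden = in_ch * expand_ratio
        self.use_res = stride == 1 and in_ch == out_ch
        self.block = nn.Sequential(
            # 1x1 pointwise expansion (wide representation, with non-linearity)
            nn.Conv2d(in_ch, hidden, 1, bias=False),
            nn.BatchNorm2d(hidden), nn.ReLU6(inplace=True),
            # 3x3 depthwise convolution filters the expanded features
            nn.Conv2d(hidden, hidden, 3, stride, 1, groups=hidden, bias=False),
            nn.BatchNorm2d(hidden), nn.ReLU6(inplace=True),
            # 1x1 projection without activation: the "linear bottleneck"
            nn.Conv2d(hidden, out_ch, 1, bias=False),
            nn.BatchNorm2d(out_ch))

    def forward(self, x):
        out = self.block(x)
        return x + out if self.use_res else out   # shortcut connects the thin bottlenecks

print(InvertedResidual(32, 32)(torch.randn(1, 32, 56, 56)).shape)   # (1, 32, 56, 56)
```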
+## Results and models + +### ImageNet-1k + +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +|:---------------------:|:---------:|:--------:|:---------:|:---------:|:---------:|:--------:| +| MobileNet V2 | 3.5 | 0.319 | 71.86 | 90.42 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth) | [log](https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.log.json) | + ## Citation -```latex + +``` @INPROCEEDINGS{8578572, author={M. {Sandler} and A. {Howard} and M. {Zhu} and A. {Zhmoginov} and L. {Chen}}, booktitle={2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition}, @@ -26,11 +35,3 @@ The MobileNetV2 architecture is based on an inverted residual structure where th doi={10.1109/CVPR.2018.00474}} } ``` - -## Results and models - -### ImageNet-1k - -| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -|:---------------------:|:---------:|:--------:|:---------:|:---------:|:---------:|:--------:| -| MobileNet V2 | 3.5 | 0.319 | 71.86 | 90.42 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mobilenet_v2/mobilenet-v2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth) | [log](https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.log.json) | diff --git a/configs/mobilenet_v3/README.md b/configs/mobilenet_v3/README.md index b7d0cf142bf..36392b91c3b 100644 --- a/configs/mobilenet_v3/README.md +++ b/configs/mobilenet_v3/README.md @@ -1,27 +1,16 @@ -# Searching for MobileNetV3 - +# MobileNet V3 + +> [Searching for MobileNetV3](https://arxiv.org/abs/1905.02244) ## Abstract - + We present the next generation of MobileNets based on a combination of complementary search techniques as well as a novel architecture design. MobileNetV3 is tuned to mobile phone CPUs through a combination of hardware-aware network architecture search (NAS) complemented by the NetAdapt algorithm and then subsequently improved through novel architecture advances. This paper starts the exploration of how automated search algorithms and network design can work together to harness complementary approaches improving the overall state of the art. Through this process we create two new MobileNet models for release: MobileNetV3-Large and MobileNetV3-Small which are targeted for high and low resource use cases. These models are then adapted and applied to the tasks of object detection and semantic segmentation. For the task of semantic segmentation (or any dense pixel prediction), we propose a new efficient segmentation decoder Lite Reduced Atrous Spatial Pyramid Pooling (LR-ASPP). We achieve new state of the art results for mobile classification, detection and segmentation. MobileNetV3-Large is 3.2\% more accurate on ImageNet classification while reducing latency by 15\% compared to MobileNetV2. MobileNetV3-Small is 4.6\% more accurate while reducing latency by 5\% compared to MobileNetV2. MobileNetV3-Large detection is 25\% faster at roughly the same accuracy as MobileNetV2 on COCO detection. MobileNetV3-Large LR-ASPP is 30\% faster than MobileNetV2 R-ASPP at similar accuracy for Cityscapes segmentation. -
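Among the "novel architecture advances" referenced above, the MobileNetV3 paper introduces the hard-swish activation used in its deeper layers; a tiny sketch of it follows (recent PyTorch versions ship the same function as `nn.Hardswish`).

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def hard_swish(x):
    # h-swish(x) = x * ReLU6(x + 3) / 6, a piecewise-linear approximation of swish/SiLU
    return x * F.relu6(x + 3.0) / 6.0

x = torch.linspace(-4.0, 4.0, steps=9)
print(hard_swish(x))
print(torch.allclose(hard_swish(x), nn.Hardswish()(x)))   # True on PyTorch >= 1.6
```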
-## Citation -```latex -@inproceedings{Howard_2019_ICCV, - author = {Howard, Andrew and Sandler, Mark and Chu, Grace and Chen, Liang-Chieh and Chen, Bo and Tan, Mingxing and Wang, Weijun and Zhu, Yukun and Pang, Ruoming and Vasudevan, Vijay and Le, Quoc V. and Adam, Hartwig}, - title = {Searching for MobileNetV3}, - booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, - month = {October}, - year = {2019} -} -``` - ## Results and models ### ImageNet-1k @@ -32,3 +21,15 @@ We present the next generation of MobileNets based on a combination of complemen | MobileNetV3-Large\* | 5.48 | 0.23 | 74.04 | 91.34 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/mobilenet_v3/mobilenet-v3-large_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_large-3ea3c186.pth) | *Models with \* are converted from [torchvision](https://pytorch.org/vision/stable/_modules/torchvision/models/mobilenetv3.html). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + +## Citation + +``` +@inproceedings{Howard_2019_ICCV, + author = {Howard, Andrew and Sandler, Mark and Chu, Grace and Chen, Liang-Chieh and Chen, Bo and Tan, Mingxing and Wang, Weijun and Zhu, Yukun and Pang, Ruoming and Vasudevan, Vijay and Le, Quoc V. and Adam, Hartwig}, + title = {Searching for MobileNetV3}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + month = {October}, + year = {2019} +} +``` diff --git a/configs/mobilenet_v3/metafile.yml b/configs/mobilenet_v3/metafile.yml index d0197d03c22..09c4732e648 100644 --- a/configs/mobilenet_v3/metafile.yml +++ b/configs/mobilenet_v3/metafile.yml @@ -10,8 +10,13 @@ Collections: Batch Size: 1024 Architecture: - MobileNet V3 - Paper: https://arxiv.org/abs/1905.02244 + Paper: + URL: https://arxiv.org/abs/1905.02244 + Title: Searching for MobileNetV3 README: configs/mobilenet_v3/README.md + Code: + URL: https://github.com/open-mmlab/mmclassification/blob/v0.15.0/mmcls/models/backbones/mobilenet_v3.py + Version: v0.15.0 Models: - Name: mobilenet_v3_small_imagenet diff --git a/configs/regnet/README.md b/configs/regnet/README.md index 48255efe1d5..b7bbc3f0904 100644 --- a/configs/regnet/README.md +++ b/configs/regnet/README.md @@ -1,29 +1,16 @@ -# Designing Network Design Spaces - +# RegNet + +> [Designing Network Design Spaces](https://arxiv.org/abs/2003.13678) ## Abstract - + In this work, we present a new network design paradigm. Our goal is to help advance the understanding of network design and discover design principles that generalize across settings. Instead of focusing on designing individual network instances, we design network design spaces that parametrize populations of networks. The overall process is analogous to classic manual design of networks, but elevated to the design space level. Using our methodology we explore the structure aspect of network design and arrive at a low-dimensional design space consisting of simple, regular networks that we call RegNet. The core insight of the RegNet parametrization is surprisingly simple: widths and depths of good networks can be explained by a quantized linear function. We analyze the RegNet design space and arrive at interesting findings that do not match the current practice of network design. 
The RegNet design space provides simple and fast networks that work well across a wide range of flop regimes. Under comparable training settings and flops, the RegNet models outperform the popular EfficientNet models while being up to 5x faster on GPUs. -
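The "quantized linear function" can be written down in a few lines; the sketch below follows the width rule from the paper, with illustrative values for `w_0`, `w_a`, `w_m` and the depth, and the exact rounding used by the backbone implementation may differ slightly.

```python
import numpy as np

def regnet_widths(w_0=48, w_a=36.0, w_m=2.5, depth=16, q=8):
    """Per-block widths from RegNet's quantized linear rule (simplified sketch)."""
    j = np.arange(depth)
    u = w_0 + w_a * j                              # continuous linear ramp of widths
    s = np.round(np.log(u / w_0) / np.log(w_m))    # snap each width to a power of w_m
    w = w_0 * np.power(w_m, s)                     # quantized widths
    return (np.round(w / q) * q).astype(int)       # round to a multiple of q channels

print(regnet_widths())   # widths increase in a handful of discrete stages
```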
-## Citation - -```latex -@article{radosavovic2020designing, - title={Designing Network Design Spaces}, - author={Ilija Radosavovic and Raj Prateek Kosaraju and Ross Girshick and Kaiming He and Piotr Dollár}, - year={2020}, - eprint={2003.13678}, - archivePrefix={arXiv}, - primaryClass={cs.CV} -} -``` - ## Results and models ### ImageNet-1k @@ -48,3 +35,16 @@ In this work, we present a new network design paradigm. Our goal is to help adva | RegNetX-12GF\* | 46.11 | 12.15 | 79.91 | 94.78 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/regnet/regnetx-12gf_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/regnet/convert/RegNetX-12GF-0574538f.pth) | *Models with \* are converted from [pycls](https://github.com/facebookresearch/pycls/blob/master/MODEL_ZOO.md). The config files of these models are only for validation.* + +## Citation + +``` +@article{radosavovic2020designing, + title={Designing Network Design Spaces}, + author={Ilija Radosavovic and Raj Prateek Kosaraju and Ross Girshick and Kaiming He and Piotr Dollár}, + year={2020}, + eprint={2003.13678}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/configs/repvgg/README.md b/configs/repvgg/README.md index 997e8554241..e0aaeed49cb 100644 --- a/configs/repvgg/README.md +++ b/configs/repvgg/README.md @@ -1,25 +1,30 @@ -# Repvgg: Making vgg-style convnets great again - +# RepVGG + +> [Repvgg: Making vgg-style convnets great again](https://arxiv.org/abs/2101.03697) ## Abstract - + We present a simple but powerful architecture of convolutional neural network, which has a VGG-like inference-time body composed of nothing but a stack of 3x3 convolution and ReLU, while the training-time model has a multi-branch topology. Such decoupling of the training-time and inference-time architecture is realized by a structural re-parameterization technique so that the model is named RepVGG. On ImageNet, RepVGG reaches over 80% top-1 accuracy, which is the first time for a plain model, to the best of our knowledge. On NVIDIA 1080Ti GPU, RepVGG models run 83% faster than ResNet-50 or 101% faster than ResNet-101 with higher accuracy and show favorable accuracy-speed trade-off compared to the state-of-the-art models like EfficientNet and RegNet. -
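The structural re-parameterization idea can be illustrated with a simplified sketch that merges the three training-time branches (3x3 conv, 1x1 conv, identity) into a single 3x3 kernel; folding the per-branch batch norms into the kernels, which the full conversion also requires, is omitted here.

```python
import torch
import torch.nn.functional as F

def merge_repvgg_branches(k3, k1, in_ch):
    """Merge 3x3, 1x1 and identity branches into one 3x3 kernel (BN folding omitted)."""
    # Pad the 1x1 kernel to 3x3 so it can be added to the 3x3 kernel.
    k1_as_3 = F.pad(k1, [1, 1, 1, 1])
    # The identity branch is a 3x3 kernel with a 1 at the center of each channel's own slice.
    k_id = torch.zeros_like(k3)
    for c in range(in_ch):
        k_id[c, c, 1, 1] = 1.0
    return k3 + k1_as_3 + k_id

in_ch = 8
k3 = torch.randn(in_ch, in_ch, 3, 3)
k1 = torch.randn(in_ch, in_ch, 1, 1)
k = merge_repvgg_branches(k3, k1, in_ch)

x = torch.randn(1, in_ch, 14, 14)
y_multi = F.conv2d(x, k3, padding=1) + F.conv2d(x, k1) + x   # training-time three branches
y_single = F.conv2d(x, k, padding=1)                          # merged inference-time conv
print(torch.allclose(y_multi, y_single, atol=1e-5))           # True
```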
-## Citation -```latex -@inproceedings{ding2021repvgg, - title={Repvgg: Making vgg-style convnets great again}, - author={Ding, Xiaohan and Zhang, Xiangyu and Ma, Ningning and Han, Jungong and Ding, Guiguang and Sun, Jian}, - booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, - pages={13733--13742}, - year={2021} -} +## Introduction + +The checkpoints provided are all `training-time` models. Use the reparameterize tool to switch them to more efficient `inference-time` architecture, which not only has fewer parameters but also less calculations. + +```bash +python tools/convert_models/reparameterize_repvgg.py ${CFG_PATH} ${SRC_CKPT_PATH} ${TARGET_CKPT_PATH} +``` + +`${CFG_PATH}` is the config file, `${SRC_CKPT_PATH}` is the source chenpoint file, `${TARGET_CKPT_PATH}` is the target deploy weight file path. + +To use reparameterized repvgg weight, the config file must switch to [the deploy config files](./deploy) as below: + +```bash +python tools/test.py ${RapVGG_Deploy_CFG} ${CHECK_POINT} ``` ## Results and models @@ -43,18 +48,14 @@ We present a simple but powerful architecture of convolutional neural network, w *Models with \* are converted from the [official repo](https://github.com/DingXiaoH/RepVGG). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* -## Reparameterize RepVGG - -The checkpoints provided are all in `train` form. Use the reparameterize tool to switch them to more efficient `deploy` form, which not only has fewer parameters but also less calculations. +## Citation -```bash -python ./tools/convert_models/reparameterize_repvgg.py ${CFG_PATH} ${SRC_CKPT_PATH} ${TARGET_CKPT_PATH} ``` - -`${CFG_PATH}` is the config file, `${SRC_CKPT_PATH}` is the source chenpoint file, `${TARGET_CKPT_PATH}` is the target deploy weight file path. - -To use reparameterized repvgg weight, the config file must switch to [the deploy config files](./deploy) as below: - -```bash -python ./tools/test.py ${RapVGG_Deploy_CFG} ${CHECK_POINT} +@inproceedings{ding2021repvgg, + title={Repvgg: Making vgg-style convnets great again}, + author={Ding, Xiaohan and Zhang, Xiangyu and Ma, Ningning and Han, Jungong and Ding, Guiguang and Sun, Jian}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={13733--13742}, + year={2021} +} ``` diff --git a/configs/res2net/README.md b/configs/res2net/README.md index db6bc3cbb70..c7cebaa7379 100644 --- a/configs/res2net/README.md +++ b/configs/res2net/README.md @@ -1,27 +1,16 @@ -# Res2Net: A New Multi-scale Backbone Architecture - +# Res2Net + +> [Res2Net: A New Multi-scale Backbone Architecture](https://arxiv.org/pdf/1904.01169.pdf) ## Abstract - + Representing features at multiple scales is of great importance for numerous vision tasks. Recent advances in backbone convolutional neural networks (CNNs) continually demonstrate stronger multi-scale representation ability, leading to consistent performance gains on a wide range of applications. However, most existing methods represent the multi-scale features in a layer-wise manner. In this paper, we propose a novel building block for CNNs, namely Res2Net, by constructing hierarchical residual-like connections within one single residual block. The Res2Net represents multi-scale features at a granular level and increases the range of receptive fields for each network layer. 
The proposed Res2Net block can be plugged into the state-of-the-art backbone CNN models, e.g., ResNet, ResNeXt, and DLA. We evaluate the Res2Net block on all these models and demonstrate consistent performance gains over baseline models on widely-used datasets, e.g., CIFAR-100 and ImageNet. Further ablation studies and experimental results on representative computer vision tasks, i.e., object detection, class activation mapping, and salient object detection, further verify the superiority of the Res2Net over the state-of-the-art baseline methods. -
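A rough sketch of the hierarchical residual-like connections inside one block may clarify the data flow; it uses scale=4 and plain 3x3 convs, leaving out batch norm and the surrounding 1x1 convs, and the names are illustrative.

```python
import torch
import torch.nn as nn

class Res2NetSplit(nn.Module):
    """Data flow of the Res2Net multi-scale split (scale=4), heavily simplified."""

    def __init__(self, channels, scale=4):
        super().__init__()
        assert channels % scale == 0
        self.scale = scale
        width = channels // scale
        # One 3x3 conv per split except the first, which is passed through unchanged.
        self.convs = nn.ModuleList(
            nn.Conv2d(width, width, 3, padding=1) for _ in range(scale - 1))

    def forward(self, x):
        splits = torch.chunk(x, self.scale, dim=1)
        outs = [splits[0]]
        y = None
        for i, conv in enumerate(self.convs):
            # Each split receives the previous split's output, widening the receptive field.
            inp = splits[i + 1] if y is None else splits[i + 1] + y
            y = conv(inp)
            outs.append(y)
        return torch.cat(outs, dim=1)

print(Res2NetSplit(64)(torch.randn(1, 64, 28, 28)).shape)   # (1, 64, 28, 28)
```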
-## Citation -```latex -@article{gao2019res2net, - title={Res2Net: A New Multi-scale Backbone Architecture}, - author={Gao, Shang-Hua and Cheng, Ming-Ming and Zhao, Kai and Zhang, Xin-Yu and Yang, Ming-Hsuan and Torr, Philip}, - journal={IEEE TPAMI}, - year={2021}, - doi={10.1109/TPAMI.2019.2938758}, -} -``` - ## Results and models ### ImageNet-1k @@ -33,3 +22,15 @@ Representing features at multiple scales is of great importance for numerous vis | Res2Net-101-26w-4s\* | 224x224 | 45.21 | 8.12 | 79.19 | 94.44 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/res2net/res2net101-w26-s4_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/res2net/res2net101-w26-s4_3rdparty_8xb32_in1k_20210927-870b6c36.pth) | [log]()| *Models with \* are converted from the [official repo](https://github.com/Res2Net/Res2Net-PretrainedModels). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + +## Citation + +``` +@article{gao2019res2net, + title={Res2Net: A New Multi-scale Backbone Architecture}, + author={Gao, Shang-Hua and Cheng, Ming-Ming and Zhao, Kai and Zhang, Xin-Yu and Yang, Ming-Hsuan and Torr, Philip}, + journal={IEEE TPAMI}, + year={2021}, + doi={10.1109/TPAMI.2019.2938758}, +} +``` diff --git a/configs/resnest/README.md b/configs/resnest/README.md index d02ef885f58..55c0e7f2843 100644 --- a/configs/resnest/README.md +++ b/configs/resnest/README.md @@ -1,19 +1,19 @@ -# ResNeSt: Split-Attention Networks - +# ResNeSt + +> [ResNeSt: Split-Attention Networks](https://arxiv.org/abs/2004.08955) ## Abstract - + It is well known that featuremap attention and multi-path representation are important for visual recognition. In this paper, we present a modularized architecture, which applies the channel-wise attention on different network branches to leverage their success in capturing cross-feature interactions and learning diverse representations. Our design results in a simple and unified computation block, which can be parameterized using only a few variables. Our model, named ResNeSt, outperforms EfficientNet in accuracy and latency trade-off on image classification. In addition, ResNeSt has achieved superior transfer learning results on several public benchmarks serving as the backbone, and has been adopted by the winning entries of COCO-LVIS challenge. The source code for complete system and pretrained models are publicly available. -
- ## Citation -```latex + +``` @misc{zhang2020resnest, title={ResNeSt: Split-Attention Networks}, author={Hang Zhang and Chongruo Wu and Zhongyue Zhang and Yi Zhu and Haibin Lin and Zhi Zhang and Yue Sun and Tong He and Jonas Mueller and R. Manmatha and Mu Li and Alexander Smola}, diff --git a/configs/resnet/README.md b/configs/resnet/README.md index 02467b3aabf..9811c00f9c6 100644 --- a/configs/resnet/README.md +++ b/configs/resnet/README.md @@ -1,32 +1,21 @@ -# Deep Residual Learning for Image Recognition - +# ResNet + +> [Deep Residual Learning for Image Recognition](https://openaccess.thecvf.com/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html) ## Abstract - + Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers---8x deeper than VGG nets but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC & COCO 2015 competitions, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation. -
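The residual reformulation boils down to learning F(x) and adding the input back through an identity shortcut; a minimal basic-block sketch follows, with the channel count and the omission of the downsampling variant chosen purely for brevity.

```python
import torch
import torch.nn as nn

class BasicBlock(nn.Module):
    """Minimal residual basic block: out = relu(F(x) + x)."""

    def __init__(self, channels):
        super().__init__()
        self.body = nn.Sequential(
            nn.Conv2d(channels, channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(channels), nn.ReLU(inplace=True),
            nn.Conv2d(channels, channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(channels))
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        return self.relu(self.body(x) + x)   # identity shortcut carries x unchanged

print(BasicBlock(64)(torch.randn(2, 64, 56, 56)).shape)   # (2, 64, 56, 56)
```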
-## Citation -```latex -@inproceedings{he2016deep, - title={Deep residual learning for image recognition}, - author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={770--778}, - year={2016} -} -``` - ## Results and models -## Cifar10 +### Cifar10 | Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | |:---------------------:|:---------:|:--------:|:---------:|:---------:|:---------:|:--------:| @@ -36,7 +25,7 @@ The depth of representations is of central importance for many visual recognitio | ResNet-101-b16x8 | 42.51 | 2.52 | 95.58 | 99.87 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet101_8xb16_cifar10.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_b16x8_cifar10_20210528-2d29e936.pth) | [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_b16x8_cifar10_20210528-2d29e936.log.json) | | ResNet-152-b16x8 | 58.16 | 3.74 | 95.76 | 99.89 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet152_8xb16_cifar10.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_b16x8_cifar10_20210528-3e8e9178.pth) | [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_b16x8_cifar10_20210528-3e8e9178.log.json) | -## Cifar100 +### Cifar100 | Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | |:---------------------:|:---------:|:--------:|:---------:|:---------:|:---------:|:--------:| @@ -60,3 +49,15 @@ The depth of representations is of central importance for many visual recognitio | ResNet-50 (rsb-a3) | 25.56 | 4.12 | 78.30 | 93.80 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnet/resnet50_8xb256-rsb-a3-100e_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a3-100e_in1k_20211228-3493673c.pth) | [log](https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb256-rsb-a3-100e_in1k_20211228-3493673c.log.json) | *The "rsb" means using the training settings from [ResNet strikes back: An improved training procedure in timm](https://arxiv.org/abs/2110.00476).* + +## Citation + +``` +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} +``` diff --git a/configs/resnet/metafile.yml b/configs/resnet/metafile.yml index 20a23e89e69..719b5d4e21e 100644 --- a/configs/resnet/metafile.yml +++ b/configs/resnet/metafile.yml @@ -17,31 +17,16 @@ Collections: Code: URL: https://github.com/open-mmlab/mmclassification/blob/v0.15.0/mmcls/models/backbones/resnet.py#L383 Version: v0.15.0 - - Name: ResNet-CIFAR - Metadata: - Training Data: CIFAR-10 - Training Techniques: - - SGD with Momentum - - Weight Decay - Training Resources: 8x 1080 GPUs - Epochs: 200 - Batch Size: 128 - Architecture: - - ResNet - Paper: - URL: https://openaccess.thecvf.com/content_cvpr_2016/html/He_Deep_Residual_Learning_CVPR_2016_paper.html - Title: "Deep Residual Learning for Image Recognition" - README: configs/resnet/README.md - Code: - URL: https://github.com/open-mmlab/mmclassification/blob/v0.15.0/mmcls/models/backbones/resnet_cifar.py#L10 - Version: v0.15.0 Models: - Name: 
resnet18_8xb16_cifar10 Metadata: + Training Data: CIFAR-10 + Epochs: 200 + Batch Size: 128 FLOPs: 560000000 Parameters: 11170000 - In Collection: ResNet-CIFAR + In Collection: ResNet Results: - Dataset: CIFAR-10 Metrics: @@ -51,9 +36,12 @@ Models: Config: configs/resnet/resnet18_8xb16_cifar10.py - Name: resnet34_8xb16_cifar10 Metadata: + Training Data: CIFAR-10 + Epochs: 200 + Batch Size: 128 FLOPs: 1160000000 Parameters: 21280000 - In Collection: ResNet-CIFAR + In Collection: ResNet Results: - Dataset: CIFAR-10 Metrics: @@ -63,9 +51,12 @@ Models: Config: configs/resnet/resnet34_8xb16_cifar10.py - Name: resnet50_8xb16_cifar10 Metadata: + Training Data: CIFAR-10 + Epochs: 200 + Batch Size: 128 FLOPs: 1310000000 Parameters: 23520000 - In Collection: ResNet-CIFAR + In Collection: ResNet Results: - Dataset: CIFAR-10 Metrics: @@ -75,9 +66,12 @@ Models: Config: configs/resnet/resnet50_8xb16_cifar10.py - Name: resnet101_8xb16_cifar10 Metadata: + Training Data: CIFAR-10 + Epochs: 200 + Batch Size: 128 FLOPs: 2520000000 Parameters: 42510000 - In Collection: ResNet-CIFAR + In Collection: ResNet Results: - Dataset: CIFAR-10 Metrics: @@ -87,9 +81,12 @@ Models: Config: configs/resnet/resnet101_8xb16_cifar10.py - Name: resnet152_8xb16_cifar10 Metadata: + Training Data: CIFAR-10 + Epochs: 200 + Batch Size: 128 FLOPs: 3740000000 Parameters: 58160000 - In Collection: ResNet-CIFAR + In Collection: ResNet Results: - Dataset: CIFAR-10 Metrics: @@ -99,10 +96,12 @@ Models: Config: configs/resnet/resnet152_8xb16_cifar10.py - Name: resnet50_8xb16_cifar100 Metadata: + Training Data: CIFAR-100 + Epochs: 200 + Batch Size: 128 FLOPs: 1310000000 Parameters: 23710000 - Training Data: CIFAR-100 - In Collection: ResNet-CIFAR + In Collection: ResNet Results: - Dataset: CIFAR-100 Metrics: diff --git a/configs/resnext/README.md b/configs/resnext/README.md index 6cea78c5bec..2d0b51527ba 100644 --- a/configs/resnext/README.md +++ b/configs/resnext/README.md @@ -1,27 +1,16 @@ -# Aggregated Residual Transformations for Deep Neural Networks - +# ResNeXt + +> [Aggregated Residual Transformations for Deep Neural Networks](https://openaccess.thecvf.com/content_cvpr_2017/html/Xie_Aggregated_Residual_Transformations_CVPR_2017_paper.html) ## Abstract - + We present a simple, highly modularized network architecture for image classification. Our network is constructed by repeating a building block that aggregates a set of transformations with the same topology. Our simple design results in a homogeneous, multi-branch architecture that has only a few hyper-parameters to set. This strategy exposes a new dimension, which we call "cardinality" (the size of the set of transformations), as an essential factor in addition to the dimensions of depth and width. On the ImageNet-1K dataset, we empirically show that even under the restricted condition of maintaining complexity, increasing cardinality is able to improve classification accuracy. Moreover, increasing cardinality is more effective than going deeper or wider when we increase the capacity. Our models, named ResNeXt, are the foundations of our entry to the ILSVRC 2016 classification task in which we secured 2nd place. We further investigate ResNeXt on an ImageNet-5K set and the COCO detection set, also showing better results than its ResNet counterpart. The code and models are publicly available online. -
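In implementations, the aggregated transformations collapse into a grouped convolution, so "cardinality" is simply the `groups` argument of the bottleneck's 3x3 conv; the channel numbers below are only an illustration of the parameter trade-off.

```python
import torch.nn as nn

# ResNet-style bottleneck 3x3 conv: a single transformation path over 64 channels.
plain = nn.Conv2d(64, 64, kernel_size=3, padding=1, bias=False)

# ResNeXt-style 3x3 conv with cardinality 32: 32 parallel paths expressed as one
# grouped convolution over a wider bottleneck (128 channels, 4 per group).
grouped = nn.Conv2d(128, 128, kernel_size=3, padding=1, groups=32, bias=False)

print(sum(p.numel() for p in plain.parameters()))    # 36864 parameters
print(sum(p.numel() for p in grouped.parameters()))  # 4608 parameters
```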
-## Citation -```latex -@inproceedings{xie2017aggregated, - title={Aggregated residual transformations for deep neural networks}, - author={Xie, Saining and Girshick, Ross and Doll{\'a}r, Piotr and Tu, Zhuowen and He, Kaiming}, - booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, - pages={1492--1500}, - year={2017} -} -``` - ## Results and models ### ImageNet-1k @@ -32,3 +21,15 @@ We present a simple, highly modularized network architecture for image classific | ResNeXt-32x4d-101 | 44.18 | 8.03 | 78.61 | 94.17 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnext/resnext101-32x4d_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x4d_b32x8_imagenet_20210506-e0fa3dd5.pth) | [log](https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x4d_b32x8_imagenet_20210506-e0fa3dd5.log.json) | | ResNeXt-32x8d-101 | 88.79 | 16.5 | 79.27 | 94.58 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnext/resnext101-32x8d_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x8d_b32x8_imagenet_20210506-23a247d5.pth) | [log](https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x8d_b32x8_imagenet_20210506-23a247d5.log.json) | | ResNeXt-32x4d-152 | 59.95 | 11.8 | 78.88 | 94.33 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/resnext/resnext152-32x4d_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/resnext/resnext152_32x4d_b32x8_imagenet_20210524-927787be.pth) | [log](https://download.openmmlab.com/mmclassification/v0/resnext/resnext152_32x4d_b32x8_imagenet_20210524-927787be.log.json) | + +## Citation + +``` +@inproceedings{xie2017aggregated, + title={Aggregated residual transformations for deep neural networks}, + author={Xie, Saining and Girshick, Ross and Doll{\'a}r, Piotr and Tu, Zhuowen and He, Kaiming}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={1492--1500}, + year={2017} +} +``` diff --git a/configs/seresnet/README.md b/configs/seresnet/README.md index 5c5954ad83b..345bd63f6c6 100644 --- a/configs/seresnet/README.md +++ b/configs/seresnet/README.md @@ -1,18 +1,28 @@ -# Squeeze-and-Excitation Networks - +# SE-ResNet + +> [Squeeze-and-Excitation Networks](https://openaccess.thecvf.com/content_cvpr_2018/html/Hu_Squeeze-and-Excitation_Networks_CVPR_2018_paper.html) ## Abstract - + The central building block of convolutional neural networks (CNNs) is the convolution operator, which enables networks to construct informative features by fusing both spatial and channel-wise information within local receptive fields at each layer. A broad range of prior research has investigated the spatial component of this relationship, seeking to strengthen the representational power of a CNN by enhancing the quality of spatial encodings throughout its feature hierarchy. In this work, we focus instead on the channel relationship and propose a novel architectural unit, which we term the "Squeeze-and-Excitation" (SE) block, that adaptively recalibrates channel-wise feature responses by explicitly modelling interdependencies between channels. We show that these blocks can be stacked together to form SENet architectures that generalise extremely effectively across different datasets. 
We further demonstrate that SE blocks bring significant improvements in performance for existing state-of-the-art CNNs at slight additional computational cost. Squeeze-and-Excitation Networks formed the foundation of our ILSVRC 2017 classification submission which won first place and reduced the top-5 error to 2.251%, surpassing the winning entry of 2016 by a relative improvement of ~25%. -
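A minimal sketch of the squeeze-and-excitation operation described above follows; the reduction ratio and class name are illustrative rather than the exact implementation behind these checkpoints.

```python
import torch
import torch.nn as nn

class SEBlock(nn.Module):
    """Squeeze-and-Excitation: global pooling -> bottleneck MLP -> channel-wise rescaling."""

    def __init__(self, channels, reduction=16):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // reduction), nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels), nn.Sigmoid())

    def forward(self, x):
        b, c, _, _ = x.shape
        s = x.mean(dim=(2, 3))               # squeeze: one descriptor per channel
        w = self.fc(s).view(b, c, 1, 1)      # excitation: per-channel gates in (0, 1)
        return x * w                         # recalibrate feature maps channel-wise

print(SEBlock(64)(torch.randn(2, 64, 28, 28)).shape)   # (2, 64, 28, 28)
```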
+## Results and models + +### ImageNet-1k + +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +|:---------------------:|:---------:|:--------:|:---------:|:---------:|:---------:|:--------:| +| SE-ResNet-50 | 28.09 | 4.13 | 77.74 | 93.84 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/seresnet/seresnet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200804-ae206104.pth) | [log](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200708-657b3c36.log.json) | +| SE-ResNet-101 | 49.33 | 7.86 | 78.26 | 94.07 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/seresnet/seresnet101_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200804-ba5b51d4.pth) | [log](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200708-038a4d04.log.json) | + ## Citation -```latex + +``` @inproceedings{hu2018squeeze, title={Squeeze-and-excitation networks}, author={Hu, Jie and Shen, Li and Sun, Gang}, @@ -21,12 +31,3 @@ The central building block of convolutional neural networks (CNNs) is the convol year={2018} } ``` - -## Results and models - -### ImageNet-1k - -| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -|:---------------------:|:---------:|:--------:|:---------:|:---------:|:---------:|:--------:| -| SE-ResNet-50 | 28.09 | 4.13 | 77.74 | 93.84 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/seresnet/seresnet50_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200804-ae206104.pth) | [log](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200708-657b3c36.log.json) | -| SE-ResNet-101 | 49.33 | 7.86 | 78.26 | 94.07 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/seresnet/seresnet101_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200804-ba5b51d4.pth) | [log](https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200708-038a4d04.log.json) | diff --git a/configs/shufflenet_v1/README.md b/configs/shufflenet_v1/README.md index 85adde68b58..34b33e43d10 100644 --- a/configs/shufflenet_v1/README.md +++ b/configs/shufflenet_v1/README.md @@ -1,18 +1,27 @@ -# ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices - +# ShuffleNet V1 + +> [ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices](https://openaccess.thecvf.com/content_cvpr_2018/html/Zhang_ShuffleNet_An_Extremely_CVPR_2018_paper.html) ## Abstract - + We introduce an extremely computation-efficient CNN architecture named ShuffleNet, which is designed specially for mobile devices with very limited computing power (e.g., 10-150 MFLOPs). The new architecture utilizes two new operations, pointwise group convolution and channel shuffle, to greatly reduce computation cost while maintaining accuracy. Experiments on ImageNet classification and MS COCO object detection demonstrate the superior performance of ShuffleNet over other structures, e.g. lower top-1 error (absolute 7.8%) than recent MobileNet on ImageNet classification task, under the computation budget of 40 MFLOPs. 
On an ARM-based mobile device, ShuffleNet achieves ~13x actual speedup over AlexNet while maintaining comparable accuracy. -
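The channel shuffle operation mentioned above is a cheap reshape and transpose that lets information cross between group convolutions; a minimal sketch:

```python
import torch

def channel_shuffle(x, groups):
    """Interleave channels so information flows between group convolutions."""
    b, c, h, w = x.shape
    x = x.view(b, groups, c // groups, h, w)   # split channels into groups
    x = x.transpose(1, 2).contiguous()         # swap the group and per-group dimensions
    return x.view(b, c, h, w)                  # flatten back: channels are now interleaved

x = torch.arange(6).float().view(1, 6, 1, 1)
print(channel_shuffle(x, groups=3).flatten().tolist())   # [0.0, 2.0, 4.0, 1.0, 3.0, 5.0]
```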
+## Results and models + +### ImageNet-1k + +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +|:---------------------:|:---------:|:--------:|:---------:|:---------:|:---------:|:--------:| +| ShuffleNetV1 1.0x (group=3) | 1.87 | 0.146 | 68.13 | 87.81 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/shufflenet_v1/shufflenet-v1-1x_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.pth) | [log](https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.log.json) | + ## Citation -```latex + +``` @inproceedings{zhang2018shufflenet, title={Shufflenet: An extremely efficient convolutional neural network for mobile devices}, author={Zhang, Xiangyu and Zhou, Xinyu and Lin, Mengxiao and Sun, Jian}, @@ -21,11 +30,3 @@ We introduce an extremely computation-efficient CNN architecture named ShuffleNe year={2018} } ``` - -## Results and models - -### ImageNet-1k - -| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -|:---------------------:|:---------:|:--------:|:---------:|:---------:|:---------:|:--------:| -| ShuffleNetV1 1.0x (group=3) | 1.87 | 0.146 | 68.13 | 87.81 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/shufflenet_v1/shufflenet-v1-1x_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.pth) | [log](https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.log.json) | diff --git a/configs/shufflenet_v2/README.md b/configs/shufflenet_v2/README.md index dead2d3d8df..8da3d186908 100644 --- a/configs/shufflenet_v2/README.md +++ b/configs/shufflenet_v2/README.md @@ -1,18 +1,27 @@ -# Shufflenet v2: Practical guidelines for efficient cnn architecture design - +# ShuffleNet V2 + +> [Shufflenet v2: Practical guidelines for efficient cnn architecture design](https://openaccess.thecvf.com/content_ECCV_2018/papers/Ningning_Light-weight_CNN_Architecture_ECCV_2018_paper.pdf) ## Abstract - + Currently, the neural network architecture design is mostly guided by the *indirect* metric of computation complexity, i.e., FLOPs. However, the *direct* metric, e.g., speed, also depends on the other factors such as memory access cost and platform characterics. Thus, this work proposes to evaluate the direct metric on the target platform, beyond only considering FLOPs. Based on a series of controlled experiments, this work derives several practical *guidelines* for efficient network design. Accordingly, a new architecture is presented, called *ShuffleNet V2*. Comprehensive ablation experiments verify that our model is the state-of-the-art in terms of speed and accuracy tradeoff. -
+## Results and models + +### ImageNet-1k + +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +|:---------------------:|:---------:|:--------:|:---------:|:---------:|:---------:|:--------:| +| ShuffleNetV2 1.0x | 2.28 | 0.149 | 69.55 | 88.92 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/shufflenet_v2/shufflenet-v2-1x_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200812-5bf4721e.pth) | [log](https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200804-8860eec9.log.json) | + ## Citation -```latex + +``` @inproceedings{ma2018shufflenet, title={Shufflenet v2: Practical guidelines for efficient cnn architecture design}, author={Ma, Ningning and Zhang, Xiangyu and Zheng, Hai-Tao and Sun, Jian}, @@ -21,11 +30,3 @@ Currently, the neural network architecture design is mostly guided by the *indir year={2018} } ``` - -## Results and models - -### ImageNet-1k - -| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -|:---------------------:|:---------:|:--------:|:---------:|:---------:|:---------:|:--------:| -| ShuffleNetV2 1.0x | 2.28 | 0.149 | 69.55 | 88.92 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/shufflenet_v2/shufflenet-v2-1x_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200812-5bf4721e.pth) | [log](https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200804-8860eec9.log.json) | diff --git a/configs/swin_transformer/README.md b/configs/swin_transformer/README.md index be3c787cf2f..1e405aa4941 100644 --- a/configs/swin_transformer/README.md +++ b/configs/swin_transformer/README.md @@ -1,27 +1,16 @@ -# Swin Transformer: Hierarchical Vision Transformer using Shifted Windows - +# Swin Transformer + +> [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/pdf/2103.14030.pdf) ## Abstract - + This paper presents a new vision Transformer, called Swin Transformer, that capably serves as a general-purpose backbone for computer vision. Challenges in adapting Transformer from language to vision arise from differences between the two domains, such as large variations in the scale of visual entities and the high resolution of pixels in images compared to words in text. To address these differences, we propose a hierarchical Transformer whose representation is computed with **S**hifted **win**dows. The shifted windowing scheme brings greater efficiency by limiting self-attention computation to non-overlapping local windows while also allowing for cross-window connection. This hierarchical architecture has the flexibility to model at various scales and has linear computational complexity with respect to image size. These qualities of Swin Transformer make it compatible with a broad range of vision tasks, including image classification (87.3 top-1 accuracy on ImageNet-1K) and dense prediction tasks such as object detection (58.7 box AP and 51.1 mask AP on COCO test-dev) and semantic segmentation (53.5 mIoU on ADE20K val). Its performance surpasses the previous state-of-the-art by a large margin of +2.7 box AP and +2.6 mask AP on COCO, and +3.2 mIoU on ADE20K, demonstrating the potential of Transformer-based models as vision backbones. 
The hierarchical design and the shifted window approach also prove beneficial for all-MLP architectures. -
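The shifted-window scheme starts by partitioning the feature map into non-overlapping windows on which self-attention is computed locally; a minimal sketch of that partition step follows, with the window size and feature shape chosen only for illustration.

```python
import torch

def window_partition(x, window_size=7):
    """Split a feature map into non-overlapping windows for local self-attention."""
    b, h, w, c = x.shape
    x = x.view(b, h // window_size, window_size, w // window_size, window_size, c)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous()
    return windows.view(-1, window_size * window_size, c)   # (num_windows * b, tokens, c)

x = torch.randn(2, 56, 56, 96)                # e.g. an early-stage 56x56 feature map
print(window_partition(x).shape)              # (2*8*8, 49, 96) = (128, 49, 96)
```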
-## Citation - -```latex -@article{liu2021Swin, - title={Swin Transformer: Hierarchical Vision Transformer using Shifted Windows}, - author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining}, - journal={arXiv preprint arXiv:2103.14030}, - year={2021} -} -``` - ## Results and models ### ImageNet-21k @@ -51,3 +40,14 @@ The pre-trained models on ImageNet-21k are used to fine-tune, and therefore don' | Swin-L\* | ImageNet-21k | 384x384 | 196.74 | 100.04 | 87.25 | 98.25 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/swin_transformer/swin-large_16xb64_in1k-384px.py) | [model](https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_large_patch4_window12_384_22kto1k-0a40944b.pth)| *Models with \* are converted from the [official repo](https://github.com/microsoft/Swin-Transformer#main-results-on-imagenet-with-pretrained-models). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + +## Citation + +``` +@article{liu2021Swin, + title={Swin Transformer: Hierarchical Vision Transformer using Shifted Windows}, + author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining}, + journal={arXiv preprint arXiv:2103.14030}, + year={2021} +} +``` diff --git a/configs/t2t_vit/README.md b/configs/t2t_vit/README.md index 8e198d30401..b971a3057b4 100644 --- a/configs/t2t_vit/README.md +++ b/configs/t2t_vit/README.md @@ -1,26 +1,16 @@ -# Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet - +# Tokens-to-Token ViT + +> [Tokens-to-Token ViT: Training Vision Transformers from Scratch on ImageNet](https://arxiv.org/abs/2101.11986) ## Abstract - + Transformers, which are popular for language modeling, have been explored for solving vision tasks recently, \eg, the Vision Transformer (ViT) for image classification. The ViT model splits each image into a sequence of tokens with fixed length and then applies multiple Transformer layers to model their global relation for classification. However, ViT achieves inferior performance to CNNs when trained from scratch on a midsize dataset like ImageNet. We find it is because: 1) the simple tokenization of input images fails to model the important local structure such as edges and lines among neighboring pixels, leading to low training sample efficiency; 2) the redundant attention backbone design of ViT leads to limited feature richness for fixed computation budgets and limited training samples. To overcome such limitations, we propose a new Tokens-To-Token Vision Transformer (T2T-ViT), which incorporates 1) a layer-wise Tokens-to-Token (T2T) transformation to progressively structurize the image to tokens by recursively aggregating neighboring Tokens into one Token (Tokens-to-Token), such that local structure represented by surrounding tokens can be modeled and tokens length can be reduced; 2) an efficient backbone with a deep-narrow structure for vision transformer motivated by CNN architecture design after empirical study. Notably, T2T-ViT reduces the parameter count and MACs of vanilla ViT by half, while achieving more than 3.0\% improvement when trained from scratch on ImageNet. It also outperforms ResNets and achieves comparable performance with MobileNets by directly training on ImageNet. 
For example, T2T-ViT with comparable size to ResNet50 (21.5M parameters) can achieve 83.3\% top1 accuracy in image resolution 384×384 on ImageNet. -
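The Tokens-to-Token step can be sketched with an unfold: tokens are laid back out as an image, overlapping neighbourhoods are gathered into longer tokens, and the sequence length shrinks; the kernel/stride values below are illustrative, and the transformer layer applied between such steps is omitted.

```python
import torch
import torch.nn.functional as F

def tokens_to_token(x, h, w, kernel=3, stride=2, padding=1):
    """One T2T re-structurization step: tokens -> image -> overlapping soft split -> tokens."""
    b, n, c = x.shape                                  # (batch, h*w tokens, channels)
    img = x.transpose(1, 2).reshape(b, c, h, w)        # lay tokens back out as a feature map
    patches = F.unfold(img, kernel_size=kernel, stride=stride, padding=padding)
    return patches.transpose(1, 2)                     # (batch, fewer tokens, c * kernel**2)

x = torch.randn(2, 56 * 56, 64)
print(tokens_to_token(x, 56, 56).shape)                # torch.Size([2, 784, 576])
```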
-## Citation -```latex -@article{yuan2021tokens, - title={Tokens-to-token vit: Training vision transformers from scratch on imagenet}, - author={Yuan, Li and Chen, Yunpeng and Wang, Tao and Yu, Weihao and Shi, Yujun and Tay, Francis EH and Feng, Jiashi and Yan, Shuicheng}, - journal={arXiv preprint arXiv:2101.11986}, - year={2021} -} -``` - ## Results and models ### ImageNet-1k @@ -32,3 +22,14 @@ Transformers, which are popular for language modeling, have been explored for so | T2T-ViT_t-24 | 64.00 | 12.69 | 82.71 | 96.09 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/t2t_vit/t2t-vit-t-24_8xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_8xb64_in1k_20211214-b2a68ae3.pth) | [log](https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_8xb64_in1k_20211214-b2a68ae3.log.json)| *In consistent with the [official repo](https://github.com/yitu-opensource/T2T-ViT), we adopt the best checkpoints during training.* + +## Citation + +``` +@article{yuan2021tokens, + title={Tokens-to-token vit: Training vision transformers from scratch on imagenet}, + author={Yuan, Li and Chen, Yunpeng and Wang, Tao and Yu, Weihao and Shi, Yujun and Tay, Francis EH and Feng, Jiashi and Yan, Shuicheng}, + journal={arXiv preprint arXiv:2101.11986}, + year={2021} +} +``` diff --git a/configs/tnt/README.md b/configs/tnt/README.md index 69a408f1a18..72af174c0b3 100644 --- a/configs/tnt/README.md +++ b/configs/tnt/README.md @@ -1,17 +1,29 @@ -# Transformer in Transformer +# TNT +> [Transformer in Transformer](https://arxiv.org/abs/2103.00112) + ## Abstract - + Transformer is a new kind of neural architecture which encodes the input data as powerful features via the attention mechanism. Basically, the visual transformers first divide the input images into several local patches and then calculate both representations and their relationship. Since natural images are of high complexity with abundant detail and color information, the granularity of the patch dividing is not fine enough for excavating features of objects in different scales and locations. In this paper, we point out that the attention inside these local patches are also essential for building visual transformers with high performance and we explore a new architecture, namely, Transformer iN Transformer (TNT). Specifically, we regard the local patches (e.g., 16×16) as "visual sentences" and present to further divide them into smaller patches (e.g., 4×4) as "visual words". The attention of each word will be calculated with other words in the given visual sentence with negligible computational costs. Features of both words and sentences will be aggregated to enhance the representation ability. Experiments on several benchmarks demonstrate the effectiveness of the proposed TNT architecture, e.g., we achieve an 81.5% top-1 accuracy on the ImageNet, which is about 1.7% higher than that of the state-of-the-art visual transformer with similar computational cost. -
+## Results and models + +### ImageNet-1k + +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +|:-----------:|:---------:|:--------:|:---------:|:---------:|:------:|:--------:| +| TNT-small\* | 23.76 | 3.36 | 81.52 | 95.73 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/tnt/tnt-s-p16_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/tnt/tnt-small-p16_3rdparty_in1k_20210903-c56ee7df.pth) | + +*Models with \* are converted from [timm](https://github.com/rwightman/pytorch-image-models/). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + ## Citation -```latex + +``` @misc{han2021transformer, title={Transformer in Transformer}, author={Kai Han and An Xiao and Enhua Wu and Jianyuan Guo and Chunjing Xu and Yunhe Wang}, @@ -21,13 +33,3 @@ Transformer is a new kind of neural architecture which encodes the input data as primaryClass={cs.CV} } ``` - -## Results and models - -### ImageNet - -| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | -|:----------------------------------:|:---------:|:--------:|:---------:|:---------:|:------:|:--------:| -| Transformer in Transformer small\* | 23.76 | 3.36 | 81.52 | 95.73 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/tnt/tnt-s-p16_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/tnt/tnt-small-p16_3rdparty_in1k_20210903-c56ee7df.pth) | - -*Models with \* are converted from [timm](https://github.com/rwightman/pytorch-image-models/). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* diff --git a/configs/twins/README.md b/configs/twins/README.md new file mode 100644 index 00000000000..9eeeca08363 --- /dev/null +++ b/configs/twins/README.md @@ -0,0 +1,38 @@ +# Twins + +> [Twins: Revisiting the Design of Spatial Attention in Vision Transformers](http://arxiv-export-lb.library.cornell.edu/abs/2104.13840) + + +## Abstract + +Very recently, a variety of vision transformer architectures for dense prediction tasks have been proposed and they show that the design of spatial attention is critical to their success in these tasks. In this work, we revisit the design of the spatial attention and demonstrate that a carefully-devised yet simple spatial attention mechanism performs favourably against the state-of-the-art schemes. As a result, we propose two vision transformer architectures, namely, Twins-PCPVT and Twins-SVT. Our proposed architectures are highly-efficient and easy to implement, only involving matrix multiplications that are highly optimized in modern deep learning frameworks. More importantly, the proposed architectures achieve excellent performance on a wide range of visual tasks, including image level classification as well as dense detection and segmentation. The simplicity and strong performance suggest that our proposed architectures may serve as stronger backbones for many vision tasks. Our code is released at [this https URL](https://github.com/Meituan-AutoML/Twins). + +
+ +
+ +## Results and models + +### ImageNet-1k + +| Model | Params(M) | Flops(G) | Top-1 (%) | Top-5 (%) | Config | Download | +|:--------------:|:---------:|:--------:|:---------:|:---------:|:------:|:--------:| +| PCPVT-small\* | 24.11 | 3.67 | 81.14 | 95.69 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-pcpvt-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-pcpvt-small_3rdparty_8xb128_in1k_20220126-ef23c132.pth) | +| PCPVT-base\* | 43.83 | 6.45 | 82.66 | 96.26 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-pcpvt-base_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-pcpvt-base_3rdparty_8xb128_in1k_20220126-f8c4b0d5.pth) | +| PCPVT-large\* | 60.99 | 9.51 | 83.09 | 96.59 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-pcpvt-large_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-pcpvt-large_3rdparty_16xb64_in1k_20220126-c1ef8d80.pth) | +| SVT-small\* | 24.06 | 2.82 | 81.77 | 95.57 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-svt-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-svt-small_3rdparty_8xb128_in1k_20220126-8fe5205b.pth) | +| SVT-base\* | 56.07 | 8.35 | 83.13 | 96.29 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-svt-base_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-svt-base_3rdparty_8xb128_in1k_20220126-e31cc8e9.pth) | +| SVT-large\* | 99.27 | 14.82 | 83.60 | 96.50 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-svt-large_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-svt-large_3rdparty_16xb64_in1k_20220126-4817645f.pth) | + +*Models with \* are converted from [the official repo](https://github.com/Meituan-AutoML/Twins). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results. The validation accuracy is a little different from the official paper because of the PyTorch version. 
This result is get in PyTorch=1.9 while the official result is get in PyTorch=1.7* + +## Citation + +``` +@article{chu2021twins, + title={Twins: Revisiting spatial attention design in vision transformers}, + author={Chu, Xiangxiang and Tian, Zhi and Wang, Yuqing and Zhang, Bo and Ren, Haibing and Wei, Xiaolin and Xia, Huaxia and Shen, Chunhua}, + journal={arXiv preprint arXiv:2104.13840}, + year={2021}altgvt +} +``` diff --git a/configs/twins/metafile.yml b/configs/twins/metafile.yml new file mode 100644 index 00000000000..e3cf339873a --- /dev/null +++ b/configs/twins/metafile.yml @@ -0,0 +1,114 @@ +Collections: + - Name: Twins + Metadata: + Training Data: ImageNet-1k + Architecture: + - Global Subsampled Attention + - Locally Grouped SelfAttention + - Conditional Position Encoding + - Pyramid Vision Transformer + Paper: + URL: http://arxiv-export-lb.library.cornell.edu/abs/2104.13840 + Title: "Twins: Revisiting the Design of Spatial Attention in Vision Transformers" + README: configs/twins/README.md + Code: + URL: https://github.com/open-mmlab/mmclassification/blob/v0.20.0/mmcls/models/backbones/twins.py + Version: v0.20.0 + +Models: + - Name: twins-pcpvt-small_3rdparty_8xb128_in1k + Metadata: + FLOPs: 3670000000 # 3.67G + Parameters: 24110000 # 24.11M + In Collection: Twins + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 81.14 + Top 5 Accuracy: 95.69 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/twins/twins-pcpvt-small_3rdparty_8xb128_in1k_20220126-ef23c132.pth + Config: configs/twins/twins-pcpvt-small_8xb128_in1k.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/twins_pcpvt_small-e70e7e7a.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/twins.py + - Name: twins-pcpvt-base_3rdparty_8xb128_in1k + Metadata: + FLOPs: 6450000000 # 6.45G + Parameters: 43830000 # 43.83M + In Collection: Twins + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 82.66 + Top 5 Accuracy: 96.26 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/twins/twins-pcpvt-base_3rdparty_8xb128_in1k_20220126-f8c4b0d5.pth + Config: configs/twins/twins-pcpvt-base_8xb128_in1k.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/twins_pcpvt_small-e70e7e7a.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/twins.py + - Name: twins-pcpvt-large_3rdparty_16xb64_in1k + Metadata: + FLOPs: 9510000000 # 9.51G + Parameters: 60990000 # 60.99M + In Collection: Twins + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 83.09 + Top 5 Accuracy: 96.59 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/twins/twins-pcpvt-large_3rdparty_16xb64_in1k_20220126-c1ef8d80.pth + Config: configs/twins/twins-pcpvt-large_16xb64_in1k.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/twins_pcpvt_small-e70e7e7a.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/twins.py + - Name: twins-svt-small_3rdparty_8xb128_in1k + Metadata: + FLOPs: 2820000000 # 2.82G + Parameters: 24060000 # 24.06M + In Collection: Twins + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 81.77 + Top 5 Accuracy: 95.57 + Task: Image Classification + Weights: 
https://download.openmmlab.com/mmclassification/v0/twins/twins-svt-small_3rdparty_8xb128_in1k_20220126-8fe5205b.pth + Config: configs/twins/twins-svt-small_8xb128_in1k.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/twins_pcpvt_small-e70e7e7a.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/twins.py + - Name: twins-svt-base_8xb128_3rdparty_in1k + Metadata: + FLOPs: 8350000000 # 8.35G + Parameters: 56070000 # 56.07M + In Collection: Twins + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 83.13 + Top 5 Accuracy: 96.29 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/twins/twins-svt-base_3rdparty_8xb128_in1k_20220126-e31cc8e9.pth + Config: configs/twins/twins-svt-base_8xb128_in1k.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/twins_pcpvt_small-e70e7e7a.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/twins.py + - Name: twins-svt-large_3rdparty_16xb64_in1k + Metadata: + FLOPs: 14820000000 # 14.82G + Parameters: 99270000 # 99.27M + In Collection: Twins + Results: + - Dataset: ImageNet-1k + Metrics: + Top 1 Accuracy: 83.60 + Top 5 Accuracy: 96.50 + Task: Image Classification + Weights: https://download.openmmlab.com/mmclassification/v0/twins/twins-svt-large_3rdparty_16xb64_in1k_20220126-4817645f.pth + Config: configs/twins/twins-svt-large_16xb64_in1k.py + Converted From: + Weights: https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/twins_pcpvt_small-e70e7e7a.pth + Code: https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/twins.py diff --git a/configs/twins/twins-pcpvt-base_8xb128_in1k.py b/configs/twins/twins-pcpvt-base_8xb128_in1k.py new file mode 100644 index 00000000000..8ea9adc3062 --- /dev/null +++ b/configs/twins/twins-pcpvt-base_8xb128_in1k.py @@ -0,0 +1,33 @@ +_base_ = [ + '../_base_/models/twins_pcpvt_base.py', + '../_base_/datasets/imagenet_bs64_swin_224.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +data = dict(samples_per_gpu=128) + +paramwise_cfg = dict(_delete=True, norm_decay_mult=0.0, bias_decay_mult=0.0) + +# for batch in each gpu is 128, 8 gpu +# lr = 5e-4 * 128 * 8 / 512 = 0.001 +optimizer = dict( + type='AdamW', + lr=5e-4 * 128 * 8 / 512, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999), + paramwise_cfg=paramwise_cfg) +optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=5.0)) + +# learning policy +lr_config = dict( + policy='CosineAnnealing', + by_epoch=True, + min_lr_ratio=1e-2, + warmup='linear', + warmup_ratio=1e-3, + warmup_iters=5, + warmup_by_epoch=True) + +evaluation = dict(interval=1, metric='accuracy') diff --git a/configs/twins/twins-pcpvt-large_16xb64_in1k.py b/configs/twins/twins-pcpvt-large_16xb64_in1k.py new file mode 100644 index 00000000000..e9c9a35e873 --- /dev/null +++ b/configs/twins/twins-pcpvt-large_16xb64_in1k.py @@ -0,0 +1,5 @@ +_base_ = ['twins-pcpvt-base_8xb128_in1k.py'] + +model = dict(backbone=dict(arch='large'), head=dict(in_channels=512)) + +data = dict(samples_per_gpu=64) diff --git a/configs/twins/twins-pcpvt-small_8xb128_in1k.py b/configs/twins/twins-pcpvt-small_8xb128_in1k.py new file mode 100644 index 00000000000..cb8bdc38c78 --- /dev/null +++ b/configs/twins/twins-pcpvt-small_8xb128_in1k.py @@ -0,0 +1,3 @@ +_base_ = ['twins-pcpvt-base_8xb128_in1k.py'] 
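As a quick illustration (not part of the diff) of the linear learning-rate scaling rule noted in the comments of the Twins base configs above, and of why the 16xb64 variants can inherit the optimizer settings unchanged:

```python
# Linear lr scaling rule used in the Twins configs: lr = 5e-4 * total_batch_size / 512.
# 8 GPUs x 128 imgs/GPU and 16 GPUs x 64 imgs/GPU both give a total batch size of 1024,
# so every Twins config ends up with the same learning rate.
print(5e-4 * 128 * 8 / 512)  # 0.001
print(5e-4 * 64 * 16 / 512)  # 0.001
```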
+ +model = dict(backbone=dict(arch='small'), head=dict(in_channels=512)) diff --git a/configs/twins/twins-svt-base_8xb128_in1k.py b/configs/twins/twins-svt-base_8xb128_in1k.py new file mode 100644 index 00000000000..e2db2301844 --- /dev/null +++ b/configs/twins/twins-svt-base_8xb128_in1k.py @@ -0,0 +1,33 @@ +_base_ = [ + '../_base_/models/twins_svt_base.py', + '../_base_/datasets/imagenet_bs64_swin_224.py', + '../_base_/schedules/imagenet_bs1024_adamw_swin.py', + '../_base_/default_runtime.py' +] + +data = dict(samples_per_gpu=128) + +paramwise_cfg = dict(_delete=True, norm_decay_mult=0.0, bias_decay_mult=0.0) + +# for batch in each gpu is 128, 8 gpu +# lr = 5e-4 * 128 * 8 / 512 = 0.001 +optimizer = dict( + type='AdamW', + lr=5e-4 * 128 * 8 / 512, + weight_decay=0.05, + eps=1e-8, + betas=(0.9, 0.999), + paramwise_cfg=paramwise_cfg) +optimizer_config = dict(_delete_=True, grad_clip=dict(max_norm=5.0)) + +# learning policy +lr_config = dict( + policy='CosineAnnealing', + by_epoch=True, + min_lr_ratio=1e-2, + warmup='linear', + warmup_ratio=1e-3, + warmup_iters=5, + warmup_by_epoch=True) + +evaluation = dict(interval=1, metric='accuracy') diff --git a/configs/twins/twins-svt-large_16xb64_in1k.py b/configs/twins/twins-svt-large_16xb64_in1k.py new file mode 100644 index 00000000000..9288a706781 --- /dev/null +++ b/configs/twins/twins-svt-large_16xb64_in1k.py @@ -0,0 +1,5 @@ +_base_ = ['twins-svt-base_8xb128_in1k.py'] + +data = dict(samples_per_gpu=64) + +model = dict(backbone=dict(arch='large'), head=dict(in_channels=1024)) diff --git a/configs/twins/twins-svt-small_8xb128_in1k.py b/configs/twins/twins-svt-small_8xb128_in1k.py new file mode 100644 index 00000000000..b92f1d3f344 --- /dev/null +++ b/configs/twins/twins-svt-small_8xb128_in1k.py @@ -0,0 +1,3 @@ +_base_ = ['twins-svt-base_8xb128_in1k.py'] + +model = dict(backbone=dict(arch='small'), head=dict(in_channels=512)) diff --git a/configs/vgg/README.md b/configs/vgg/README.md index aecb78adc3e..7747de5894e 100644 --- a/configs/vgg/README.md +++ b/configs/vgg/README.md @@ -1,27 +1,16 @@ -# Very Deep Convolutional Networks for Large-Scale Image Recognition - +# VGG + +> [Very Deep Convolutional Networks for Large-Scale Image Recognition](https://arxiv.org/abs/1409.1556) ## Abstract - + In this work we investigate the effect of the convolutional network depth on its accuracy in the large-scale image recognition setting. Our main contribution is a thorough evaluation of networks of increasing depth using an architecture with very small (3x3) convolution filters, which shows that a significant improvement on the prior-art configurations can be achieved by pushing the depth to 16-19 weight layers. These findings were the basis of our ImageNet Challenge 2014 submission, where our team secured the first and the second places in the localisation and classification tracks respectively. We also show that our representations generalise well to other datasets, where they achieve state-of-the-art results. We have made our two best-performing ConvNet models publicly available to facilitate further research on the use of deep visual representations in computer vision. -
-## Citation -```latex -@article{simonyan2014very, - title={Very deep convolutional networks for large-scale image recognition}, - author={Simonyan, Karen and Zisserman, Andrew}, - journal={arXiv preprint arXiv:1409.1556}, - year={2014} -} - -``` - ## Results and models ### ImageNet-1k @@ -36,3 +25,14 @@ In this work we investigate the effect of the convolutional network depth on its | VGG-13-BN | 133.05 | 11.36 | 72.12 | 90.66 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vgg/vgg13bn_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_bn_batch256_imagenet_20210207-1a8b7864.pth) | [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_bn_batch256_imagenet_20210207-1a8b7864.log.json) | | VGG-16-BN | 138.37 | 15.53 | 73.74 | 91.66 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vgg/vgg16_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_bn_batch256_imagenet_20210208-7e55cd29.pth) | [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_bn_batch256_imagenet_20210208-7e55cd29.log.json) | | VGG-19-BN | 143.68 | 19.7 | 74.68 | 92.27 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vgg/vgg19bn_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_bn_batch256_imagenet_20210208-da620c4f.pth) | [log](https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_bn_batch256_imagenet_20210208-da620c4f.log.json)| + +## Citation + +``` +@article{simonyan2014very, + title={Very deep convolutional networks for large-scale image recognition}, + author={Simonyan, Karen and Zisserman, Andrew}, + journal={arXiv preprint arXiv:1409.1556}, + year={2014} +} +``` diff --git a/configs/vision_transformer/README.md b/configs/vision_transformer/README.md index 570fa1b7764..ef6dc45bcde 100644 --- a/configs/vision_transformer/README.md +++ b/configs/vision_transformer/README.md @@ -1,28 +1,16 @@ -# An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale - +# Vision Transformer + +> [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/pdf/2010.11929.pdf) ## Abstract - + While the Transformer architecture has become the de-facto standard for natural language processing tasks, its applications to computer vision remain limited. In vision, attention is either applied in conjunction with convolutional networks, or used to replace certain components of convolutional networks while keeping their overall structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring substantially fewer computational resources to train. -
-## Citation -```latex -@inproceedings{ - dosovitskiy2021an, - title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale}, - author={Alexey Dosovitskiy and Lucas Beyer and Alexander Kolesnikov and Dirk Weissenborn and Xiaohua Zhai and Thomas Unterthiner and Mostafa Dehghani and Matthias Minderer and Georg Heigold and Sylvain Gelly and Jakob Uszkoreit and Neil Houlsby}, - booktitle={International Conference on Learning Representations}, - year={2021}, - url={https://openreview.net/forum?id=YicbFdNTTy} -} -``` - ## Results and models The training step of Vision Transformers is divided into two steps. The first @@ -52,3 +40,16 @@ The pre-trained models on ImageNet-21k are used to fine-tune, and therefore don' | ViT-L16\* | ImageNet-21k | 384x384 | 304.72 | 116.68 | 85.63 | 97.63 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/vision_transformer/vit-large-p16_ft-64xb64_in1k-384.py) | [model](https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-large-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-b20ba619.pth)| *Models with \* are converted from the [official repo](https://github.com/google-research/vision_transformer#available-vit-models). The config files of these models are only for validation. We don't ensure these config files' training accuracy and welcome you to contribute your reproduction results.* + +## Citation + +``` +@inproceedings{ + dosovitskiy2021an, + title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale}, + author={Alexey Dosovitskiy and Lucas Beyer and Alexander Kolesnikov and Dirk Weissenborn and Xiaohua Zhai and Thomas Unterthiner and Mostafa Dehghani and Matthias Minderer and Georg Heigold and Sylvain Gelly and Jakob Uszkoreit and Neil Houlsby}, + booktitle={International Conference on Learning Representations}, + year={2021}, + url={https://openreview.net/forum?id=YicbFdNTTy} +} +``` diff --git a/docker/serve/Dockerfile b/docker/serve/Dockerfile index 564f17d05b2..eef06051f5a 100644 --- a/docker/serve/Dockerfile +++ b/docker/serve/Dockerfile @@ -4,7 +4,7 @@ ARG CUDNN="7" FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel ARG MMCV="1.4.2" -ARG MMCLS="0.19.0" +ARG MMCLS="0.20.0" ENV PYTHONUNBUFFERED TRUE diff --git a/docs/en/changelog.md b/docs/en/changelog.md index 020b034c1db..182880a864c 100644 --- a/docs/en/changelog.md +++ b/docs/en/changelog.md @@ -1,5 +1,43 @@ # Changelog +## v0.20.0(30/01/2022) + +### Highlights + +- Support K-fold cross-validation. The tutorial will be released later. +- Support HRNet, ConvNeXt, Twins and EfficientNet. +- Support model conversion from PyTorch to Core-ML by a tool. + +### New Features + +- Support K-fold cross-validation. ([#563](https://github.com/open-mmlab/mmclassification/pull/563)) +- Support HRNet and add pre-trained models. ([#660](https://github.com/open-mmlab/mmclassification/pull/660)) +- Support ConvNeXt and add pre-trained models. ([#670](https://github.com/open-mmlab/mmclassification/pull/670)) +- Support Twins and add pre-trained models. ([#642](https://github.com/open-mmlab/mmclassification/pull/642)) +- Support EfficientNet and add pre-trained models.([#649](https://github.com/open-mmlab/mmclassification/pull/649)) +- Support `features_only` option in `TIMMBackbone`. ([#668](https://github.com/open-mmlab/mmclassification/pull/668)) +- Add conversion script from pytorch to Core-ML model. 
([#597](https://github.com/open-mmlab/mmclassification/pull/597)) + +### Improvements + +- New-style CPU training and inference. ([#674](https://github.com/open-mmlab/mmclassification/pull/674)) +- Add multi-processing setup in both training and testing. ([#671](https://github.com/open-mmlab/mmclassification/pull/671)) +- Rewrite channel split operation in ShufflenetV2. ([#632](https://github.com/open-mmlab/mmclassification/pull/632)) +- Deprecate the support for "python setup.py test". ([#646](https://github.com/open-mmlab/mmclassification/pull/646)) +- Support single-label, softmax, and custom eps options in asymmetric loss. ([#609](https://github.com/open-mmlab/mmclassification/pull/609)) +- Save class names in best checkpoint created by evaluation hook. ([#641](https://github.com/open-mmlab/mmclassification/pull/641)) + +### Bug Fixes + +- Fix potential unexpected behaviors if `metric_options` is not specified in multi-label evaluation. ([#647](https://github.com/open-mmlab/mmclassification/pull/647)) +- Fix API changes in `pytorch-grad-cam>=1.3.7`. ([#656](https://github.com/open-mmlab/mmclassification/pull/656)) +- Fix a bug which breaks `cal_train_time` in `analyze_logs.py`. ([#662](https://github.com/open-mmlab/mmclassification/pull/662)) + +### Docs Update + +- Update README in configs according to OpenMMLab standard. ([#672](https://github.com/open-mmlab/mmclassification/pull/672)) +- Update installation guide and README. ([#624](https://github.com/open-mmlab/mmclassification/pull/624)) + ## v0.19.0(31/12/2021) ### Highlights diff --git a/docs/en/getting_started.md b/docs/en/getting_started.md index 9fcc5b1b602..782047471a3 100644 --- a/docs/en/getting_started.md +++ b/docs/en/getting_started.md @@ -58,6 +58,7 @@ python demo/image_demo.py demo/demo.JPEG configs/resnet/resnet50_8xb32_in1k.py \ ### Inference and test a dataset - single GPU +- CPU - single node multiple GPU - multiple node @@ -67,6 +68,10 @@ You can use the following commands to infer a dataset. # single-gpu python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--metrics ${METRICS}] [--out ${RESULT_FILE}] +# CPU: disable GPUs and run single-gpu testing script +export CUDA_VISIBLE_DEVICES=-1 +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--metrics ${METRICS}] [--out ${RESULT_FILE}] + # multi-gpu ./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [--metrics ${METRICS}] [--out ${RESULT_FILE}] @@ -81,7 +86,6 @@ Optional arguments: Examples: -Assume that you have already downloaded the checkpoints to the directory `checkpoints/`. Infer ResNet-50 on ImageNet validation set to get predicted labels and their corresponding predicted scores. ```shell @@ -112,6 +116,20 @@ python tools/train.py ${CONFIG_FILE} [optional arguments] If you want to specify the working directory in the command, you can add an argument `--work_dir ${YOUR_WORK_DIR}`. +### Train with CPU + +The process of training on the CPU is consistent with single GPU training. We just need to disable GPUs before the training process. + +```shell +export CUDA_VISIBLE_DEVICES=-1 +``` + +And then run the script [above](#train-with-a-single-gpu). + +```{warning} +We do not recommend users to use CPU for training because it is too slow. We support this feature to allow users to debug on machines without GPU for convenience.
+``` + ### Train with multiple GPUs ```shell diff --git a/docs/en/index.rst b/docs/en/index.rst index 3a29b7a9692..48069b6a7f6 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -26,7 +26,14 @@ You can switch between Chinese and English documentation in the lower-left corne tutorials/runtime.md -.. include:: _model_zoo.rst +.. toctree:: + :maxdepth: 1 + :caption: Model zoo + :glob: + + modelzoo_statistics.md + model_zoo.md + papers/* .. toctree:: @@ -55,6 +62,12 @@ You can switch between Chinese and English documentation in the lower-left corne api.rst +.. toctree:: + :maxdepth: 1 + :caption: Changelog + + changelog.md + .. toctree:: :caption: Language Switch diff --git a/docs/en/install.md b/docs/en/install.md index a41d7cc9464..079b85f6a23 100644 --- a/docs/en/install.md +++ b/docs/en/install.md @@ -10,8 +10,9 @@ The compatible MMClassification and MMCV versions are as below. Please install t | MMClassification version | MMCV version | |:------------------------:|:---------------------:| -| dev | mmcv>=1.3.16, <=1.5.0 | -| 0.19.0 (master) | mmcv>=1.3.16, <=1.5.0 | +| dev | mmcv>=1.4.4, <=1.5.0 | +| 0.20.0 (master) | mmcv>=1.3.16, <=1.5.0 | +| 0.19.0 | mmcv>=1.3.16, <=1.5.0 | | 0.18.0 | mmcv>=1.3.16, <=1.5.0 | | 0.17.0 | mmcv>=1.3.8, <=1.5.0 | | 0.16.0 | mmcv>=1.3.8, <=1.5.0 | diff --git a/docs/en/model_zoo.md b/docs/en/model_zoo.md index 42c69dff1d3..1e6299c01ab 100644 --- a/docs/en/model_zoo.md +++ b/docs/en/model_zoo.md @@ -83,6 +83,51 @@ The ResNet family models below are trained by standard data augmentations, i.e., | Conformer-small-p32\* | 38.85 | 7.09 | 81.96 | 96.02 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/conformer/conformer-small-p32_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p32_8xb128_in1k_20211206-947a0816.pth) | | Conformer-small-p16\* | 37.67 | 10.31 | 83.32 | 96.46 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/conformer/conformer-small-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-small-p16_3rdparty_8xb128_in1k_20211206-3065dcf5.pth) | | Conformer-base-p16\* | 83.29 | 22.89 | 83.82 | 96.59 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/conformer/conformer-base-p16_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/conformer/conformer-base-p16_3rdparty_8xb128_in1k_20211206-bfdf8637.pth) | +| PCPVT-small\* | 24.11 | 3.67 | 81.14 | 95.69 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-pcpvt-small_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-pcpvt-small_3rdparty_8xb128_in1k_20220126-ef23c132.pth) | +| PCPVT-base\* | 43.83 | 6.45 | 82.66 | 96.26 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-pcpvt-base_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-pcpvt-base_3rdparty_8xb128_in1k_20220126-f8c4b0d5.pth) | +| PCPVT-large\* | 60.99 | 9.51 | 83.09 | 96.59 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-pcpvt-large_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-pcpvt-large_3rdparty_16xb64_in1k_20220126-c1ef8d80.pth) | +| SVT-small\* | 24.06 | 2.82 | 81.77 | 95.57 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-svt-small_8xb128_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/twins/twins-svt-small_3rdparty_8xb128_in1k_20220126-8fe5205b.pth) | +| SVT-base\* | 56.07 | 8.35 | 83.13 | 96.29 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-svt-base_8xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-svt-base_3rdparty_8xb128_in1k_20220126-e31cc8e9.pth) | +| SVT-large\* | 99.27 | 14.82 | 83.60 | 96.50 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/twins/twins-svt-large_16xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/twins/twins-svt-large_3rdparty_16xb64_in1k_20220126-4817645f.pth) | +| EfficientNet-B0\* | 5.29 | 0.02 | 76.74 | 93.17 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b0_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32_in1k_20220119-a7e2a0b1.pth) | +| EfficientNet-B0 (AA)\* | 5.29 | 0.02 | 77.26 | 93.41 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b0_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32-aa_in1k_20220119-8d939117.pth) | +| EfficientNet-B0 (AA + AdvProp)\* | 5.29 | 0.02 | 77.53 | 93.61 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b0_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b0_3rdparty_8xb32-aa-advprop_in1k_20220119-26434485.pth) | +| EfficientNet-B1\* | 7.79 | 0.03 | 78.68 | 94.28 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b1_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b1_3rdparty_8xb32_in1k_20220119-002556d9.pth) | +| EfficientNet-B1 (AA)\* | 7.79 | 0.03 | 79.20 | 94.42 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b1_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b1_3rdparty_8xb32-aa_in1k_20220119-619d8ae3.pth) | +| EfficientNet-B1 (AA + AdvProp)\* | 7.79 | 0.03 | 79.52 | 94.43 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b1_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b1_3rdparty_8xb32-aa-advprop_in1k_20220119-5715267d.pth) | +| EfficientNet-B2\* | 9.11 | 0.03 | 79.64 | 94.80 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b2_3rdparty_8xb32_in1k_20220119-ea374a30.pth) | +| EfficientNet-B2 (AA)\* | 9.11 | 0.03 | 80.21 | 94.96 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b2_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b2_3rdparty_8xb32-aa_in1k_20220119-dd61e80b.pth) | +| EfficientNet-B2 (AA + AdvProp)\* | 9.11 | 0.03 | 80.45 | 95.07 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b2_8xb32-01norm_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b2_3rdparty_8xb32-aa-advprop_in1k_20220119-1655338a.pth) | +| EfficientNet-B3\* | 12.23 | 0.06 | 81.01 | 95.34 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b3_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32_in1k_20220119-4b4d7487.pth) | +| EfficientNet-B3 (AA)\* | 12.23 | 0.06 | 81.58 | 95.67 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b3_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa_in1k_20220119-5b4887a0.pth) | +| EfficientNet-B3 (AA + AdvProp)\* | 12.23 | 0.06 | 81.81 | 95.69 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b3_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b3_3rdparty_8xb32-aa-advprop_in1k_20220119-53b41118.pth) | +| EfficientNet-B4\* | 19.34 | 0.12 | 82.57 | 96.09 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b4_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b4_3rdparty_8xb32_in1k_20220119-81fd4077.pth) | +| EfficientNet-B4 (AA)\* | 19.34 | 0.12 | 82.95 | 96.26 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b4_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b4_3rdparty_8xb32-aa_in1k_20220119-45b8bd2b.pth) | +| EfficientNet-B4 (AA + AdvProp)\* | 19.34 | 0.12 | 83.25 | 96.44 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b4_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b4_3rdparty_8xb32-aa-advprop_in1k_20220119-38c2238c.pth) | +| EfficientNet-B5\* | 30.39 | 0.24 | 83.18 | 96.47 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b5_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b5_3rdparty_8xb32_in1k_20220119-e9814430.pth) | +| EfficientNet-B5 (AA)\* | 30.39 | 0.24 | 83.82 | 96.76 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b5_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b5_3rdparty_8xb32-aa_in1k_20220119-2cab8b78.pth) | +| EfficientNet-B5 (AA + AdvProp)\* | 30.39 | 0.24 | 84.21 | 96.98 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b5_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b5_3rdparty_8xb32-aa-advprop_in1k_20220119-f57a895a.pth) | +| EfficientNet-B6 (AA)\* | 43.04 | 0.41 | 84.05 | 96.82 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b6_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b6_3rdparty_8xb32-aa_in1k_20220119-45b03310.pth) | +| EfficientNet-B6 (AA + AdvProp)\* | 43.04 | 0.41 | 84.74 | 97.14 | 
[config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b6_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b6_3rdparty_8xb32-aa-advprop_in1k_20220119-bfe3485e.pth) | +| EfficientNet-B7 (AA)\* | 66.35 | 0.72 | 84.38 | 96.88 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b7_8xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b7_3rdparty_8xb32-aa_in1k_20220119-bf03951c.pth) | +| EfficientNet-B7 (AA + AdvProp)\* | 66.35 | 0.72 | 85.14 | 97.23 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b7_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b7_3rdparty_8xb32-aa-advprop_in1k_20220119-c6dbff10.pth) | +| EfficientNet-B8 (AA + AdvProp)\* | 87.41 | 1.09 | 85.38 | 97.28 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/efficientnet/efficientnet-b8_8xb32-01norm_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/efficientnet/efficientnet-b8_3rdparty_8xb32-aa-advprop_in1k_20220119-297ce1b7.pth) | +| ConvNeXt-T\* | 28.59 | 4.46 | 82.05 | 95.86 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-tiny_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-tiny_3rdparty_32xb128_in1k_20220124-18abde00.pth) | +| ConvNeXt-S\* | 50.22 | 8.69 | 83.13 | 96.44 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-small_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-small_3rdparty_32xb128_in1k_20220124-d39b5192.pth) | +| ConvNeXt-B\* | 88.59 | 15.36 | 83.85 | 96.74 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_3rdparty_32xb128_in1k_20220124-d0915162.pth) | +| ConvNeXt-B\* | 88.59 | 15.36 | 85.81 | 97.86 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-base_32xb128_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-base_in21k-pre-3rdparty_32xb128_in1k_20220124-eb2d6ada.pth) | +| ConvNeXt-L\* | 197.77 | 34.37 | 84.30 | 96.89 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_3rdparty_64xb64_in1k_20220124-f8a0ded0.pth) | +| ConvNeXt-L\* | 197.77 | 34.37 | 86.61 | 98.04 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-large_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-large_in21k-pre-3rdparty_64xb64_in1k_20220124-2412403d.pth) | +| ConvNeXt-XL\* | 350.20 | 60.93 | 86.97 | 98.20 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/convnext/convnext-xlarge_64xb64_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/convnext/convnext-xlarge_in21k-pre-3rdparty_64xb64_in1k_20220124-76b6863d.pth) | +| HRNet-W18\* | 21.30 | 4.33 | 76.75 | 93.44 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w18_4xb32_in1k.py) | 
[model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w18_3rdparty_8xb32_in1k_20220120-0c10b180.pth) | +| HRNet-W30\* | 37.71 | 8.17 | 78.19 | 94.22 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w30_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w30_3rdparty_8xb32_in1k_20220120-8aa3832f.pth) | +| HRNet-W32\* | 41.23 | 8.99 | 78.44 | 94.19 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w32_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w32_3rdparty_8xb32_in1k_20220120-c394f1ab.pth) | +| HRNet-W40\* | 57.55 | 12.77 | 78.94 | 94.47 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w40_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w40_3rdparty_8xb32_in1k_20220120-9a2dbfc5.pth) | +| HRNet-W44\* | 67.06 | 14.96 | 78.88 | 94.37 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w44_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w44_3rdparty_8xb32_in1k_20220120-35d07f73.pth) | +| HRNet-W48\* | 77.47 | 17.36 | 79.32 | 94.52 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w48_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w48_3rdparty_8xb32_in1k_20220120-e555ef50.pth) | +| HRNet-W64\* | 128.06 | 29.00 | 79.46 | 94.65 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w64_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w64_3rdparty_8xb32_in1k_20220120-19126642.pth) | +| HRNet-W18 (ssld)\* | 21.30 | 4.33 | 81.06 | 95.70 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w18_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w18_3rdparty_8xb32-ssld_in1k_20220120-455f69ea.pth) | +| HRNet-W48 (ssld)\* | 77.47 | 17.36 | 83.63 | 96.79 | [config](https://github.com/open-mmlab/mmclassification/blob/master/configs/hrnet/hrnet-w48_4xb32_in1k.py) | [model](https://download.openmmlab.com/mmclassification/v0/hrnet/hrnet-w48_3rdparty_8xb32-ssld_in1k_20220120-d0459c38.pth) | *Models with \* are converted from other repos, others are trained by ourselves.* diff --git a/docs/en/stat.py b/docs/en/stat.py index 7d47b07f8f5..8f1e5b2d522 100755 --- a/docs/en/stat.py +++ b/docs/en/stat.py @@ -31,12 +31,21 @@ num_ckpts += len(ckpts) # Extract paper title - title = content.split('\n')[0].replace('# ', '').strip() + match_res = list(re.finditer(r'> \[(.*)\]\((.*)\)', content)) + if len(match_res) > 0: + title, paperlink = match_res[0].groups() + else: + title = content.split('\n')[0].replace('# ', '').strip() + paperlink = None titles.append(title) - # Extract paper abbreviation - abbr = [x for x in re.findall(r'', content)] - abbr = abbr[0] if len(abbr) > 0 else title + # Replace paper link to a button + if paperlink is not None: + start = match_res[0].start() + end = match_res[0].end() + # link_button = f'{title}' + link_button = f'[{title}]({paperlink})' + content = content[:start] + link_button + content[end:] # Extract paper type _papertype = [x for x in re.findall(r'\[([A-Z]+)\]', content)] @@ -66,9 +75,7 @@ def replace_link(matchobj): statsmsg = f""" \t* [{papertype}] [{title}]({copy}) ({len(ckpts)} ckpts) """ - stats.append( - dict( - 
paper=paper, ckpts=ckpts, statsmsg=statsmsg, abbr=abbr, copy=copy)) + stats.append(dict(paper=paper, ckpts=ckpts, statsmsg=statsmsg, copy=copy)) allpapers = func.reduce(lambda a, b: a.union(b), [stat['paper'] for stat in stats]) @@ -91,17 +98,3 @@ def replace_link(matchobj): with open('modelzoo_statistics.md', 'w') as f: f.write(modelzoo) - -toctree = """ -.. toctree:: - :maxdepth: 1 - :caption: Model zoo - :glob: - - modelzoo_statistics.md - model_zoo.md -""" -with open('_model_zoo.rst', 'w') as f: - f.write(toctree) - for stat in stats: - f.write(f' {stat["abbr"]} <{stat["copy"]}>\n') diff --git a/docs/en/tutorials/finetune.md b/docs/en/tutorials/finetune.md index 2e0b9155e28..22954e1e129 100644 --- a/docs/en/tutorials/finetune.md +++ b/docs/en/tutorials/finetune.md @@ -5,7 +5,7 @@ This tutorial provides instructions for users to use the models provided in the There are two steps to fine-tune a model on a new dataset. -- Add support for the new dataset following [Tutorial 2: Adding New Dataset](new_dataset.md). +- Add support for the new dataset following [Tutorial 3: Adding New Dataset](new_dataset.md). - Modify the configs as will be discussed in this tutorial. Assume we have a ResNet-50 model pre-trained on the ImageNet-2012 dataset and want diff --git a/docs/zh_CN/changelog.md b/docs/zh_CN/changelog.md new file mode 120000 index 00000000000..6b731cd0d50 --- /dev/null +++ b/docs/zh_CN/changelog.md @@ -0,0 +1 @@ +../en/changelog.md \ No newline at end of file diff --git a/docs/zh_CN/getting_started.md b/docs/zh_CN/getting_started.md index 5edfab7f7aa..26896cc9c18 100644 --- a/docs/zh_CN/getting_started.md +++ b/docs/zh_CN/getting_started.md @@ -58,6 +58,7 @@ python demo/image_demo.py demo/demo.JPEG configs/resnet/resnet50_8xb32_in1k.py \ ### 数据集的推理与测试 - 支持单 GPU +- 支持 CPU - 支持单节点多 GPU - 支持多节点 @@ -67,6 +68,10 @@ python demo/image_demo.py demo/demo.JPEG configs/resnet/resnet50_8xb32_in1k.py \ # 单 GPU python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--metrics ${METRICS}] [--out ${RESULT_FILE}] +# CPU: 禁用 GPU 并运行单 GPU 测试脚本 +export CUDA_VISIBLE_DEVICES=-1 +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--metrics ${METRICS}] [--out ${RESULT_FILE}] + # 多 GPU ./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [--metrics ${METRICS}] [--out ${RESULT_FILE}] @@ -81,8 +86,6 @@ python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--metrics ${METRICS}] [- 例子: -假定用户将下载的模型权重文件放置在 `checkpoints/` 目录下。 - 在 ImageNet 验证集上,使用 ResNet-50 进行推理并获得预测标签及其对应的预测得分。 ```shell @@ -111,6 +114,20 @@ python tools/train.py ${CONFIG_FILE} [optional arguments] 如果用户想在命令中指定工作目录,则需要增加参数 `--work-dir ${YOUR_WORK_DIR}` +### 使用 CPU 训练 + +使用 CPU 训练的流程和使用单 GPU 训练的流程一致,我们仅需要在训练流程开始前禁用 GPU。 + +```shell +export CUDA_VISIBLE_DEVICES=-1 +``` + +之后运行单 GPU 训练脚本即可。 + +```{warning} +我们不推荐用户使用 CPU 进行训练,这太过缓慢。我们支持这个功能是为了方便用户在没有 GPU 的机器上进行调试。 +``` + ### 使用多个 GPU 进行训练 ```shell @@ -150,7 +167,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./tools/dist_train.sh ${CONFIG_FILE} 4 CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 ./tools/dist_train.sh ${CONFIG_FILE} 4 ``` -如果用户在 slurm 集群下启动多个训练任务,则需要修改配置文件(通常是配置文件的倒数第 6 行)中的 `dist_params` 变量,以设置不同的通信端口。 +如果用户在 slurm 集群下启动多个训练任务,则需要修改配置文件中的 `dist_params` 变量,以设置不同的通信端口。 在 `config1.py` 中, diff --git a/docs/zh_CN/index.rst b/docs/zh_CN/index.rst index b76c32745d8..68226e9d6a4 100644 --- a/docs/zh_CN/index.rst +++ b/docs/zh_CN/index.rst @@ -26,7 +26,14 @@ You can switch between Chinese and English documentation in the lower-left corne tutorials/runtime.md -.. 
include:: _model_zoo.rst +.. toctree:: + :maxdepth: 1 + :caption: 模型库 + :glob: + + modelzoo_statistics.md + model_zoo.md + papers/* .. toctree:: @@ -55,6 +62,13 @@ You can switch between Chinese and English documentation in the lower-left corne api.rst +.. toctree:: + :maxdepth: 1 + :caption: 更新日志 + + changelog.md + + .. toctree:: :caption: 语言切换 diff --git a/docs/zh_CN/install.md b/docs/zh_CN/install.md index 4754f9bb986..3380fb61360 100644 --- a/docs/zh_CN/install.md +++ b/docs/zh_CN/install.md @@ -10,8 +10,9 @@ MMClassification 和 MMCV 的适配关系如下,请安装正确版本的 MMCV | MMClassification 版本 | MMCV 版本 | |:---------------------:|:---------------------:| -| dev | mmcv>=1.3.16, <=1.5.0 | -| 0.19.0 (master)| mmcv>=1.3.16, <=1.5.0 | +| dev | mmcv>=1.4.4, <=1.5.0 | +| 0.20.0 (master)| mmcv>=1.3.16, <=1.5.0 | +| 0.19.0 | mmcv>=1.3.16, <=1.5.0 | | 0.18.0 | mmcv>=1.3.16, <=1.5.0 | | 0.17.0 | mmcv>=1.3.8, <=1.5.0 | | 0.16.0 | mmcv>=1.3.8, <=1.5.0 | diff --git a/docs/zh_CN/stat.py b/docs/zh_CN/stat.py index 079a241542d..f6d5b3ab636 100755 --- a/docs/zh_CN/stat.py +++ b/docs/zh_CN/stat.py @@ -31,12 +31,20 @@ num_ckpts += len(ckpts) # Extract paper title - title = content.split('\n')[0].replace('# ', '').strip() + match_res = list(re.finditer(r'> \[(.*)\]\((.*)\)', content)) + if len(match_res) > 0: + title, paperlink = match_res[0].groups() + else: + title = content.split('\n')[0].replace('# ', '').strip() + paperlink = None titles.append(title) - # Extract paper abbreviation - abbr = [x for x in re.findall(r'', content)] - abbr = abbr[0] if len(abbr) > 0 else title + # Replace paper link to a button + if paperlink is not None: + start = match_res[0].start() + end = match_res[0].end() + link_button = f'[{title}]({paperlink})' + content = content[:start] + link_button + content[end:] # Extract paper type _papertype = [x for x in re.findall(r'\[([A-Z]+)\]', content)] @@ -66,9 +74,7 @@ def replace_link(matchobj): statsmsg = f""" \t* [{papertype}] [{title}]({copy}) ({len(ckpts)} ckpts) """ - stats.append( - dict( - paper=paper, ckpts=ckpts, statsmsg=statsmsg, abbr=abbr, copy=copy)) + stats.append(dict(paper=paper, ckpts=ckpts, statsmsg=statsmsg, copy=copy)) allpapers = func.reduce(lambda a, b: a.union(b), [stat['paper'] for stat in stats]) @@ -91,17 +97,3 @@ def replace_link(matchobj): with open('modelzoo_statistics.md', 'w') as f: f.write(modelzoo) - -toctree = """ -.. 
toctree:: - :maxdepth: 1 - :caption: 模型库 - :glob: - - modelzoo_statistics.md - model_zoo.md -""" -with open('_model_zoo.rst', 'w') as f: - f.write(toctree) - for stat in stats: - f.write(f' {stat["abbr"]} <{stat["copy"]}>\n') diff --git a/docs/zh_CN/tutorials/finetune.md b/docs/zh_CN/tutorials/finetune.md index 2901506587a..d0611faf853 100644 --- a/docs/zh_CN/tutorials/finetune.md +++ b/docs/zh_CN/tutorials/finetune.md @@ -6,7 +6,7 @@ 在新数据集上微调模型分为两步: -- 按照 [教程 2:如何增加新数据集](new_dataset.md) 添加对新数据集的支持。 +- 按照 [教程 3:如何增加新数据集](new_dataset.md) 添加对新数据集的支持。 - 按照本教程中讨论的内容修改配置文件 假设我们现在有一个在 ImageNet-2012 数据集上训练好的 ResNet-50 模型,并且希望在 diff --git a/mmcls/apis/train.py b/mmcls/apis/train.py index 2232b201313..6c4663c281f 100644 --- a/mmcls/apis/train.py +++ b/mmcls/apis/train.py @@ -89,7 +89,7 @@ def train_model(model, distributed=False, validate=False, timestamp=None, - device='cuda', + device=None, meta=None): logger = get_root_logger() @@ -122,13 +122,19 @@ def train_model(model, broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: - if device == 'cuda': - model = MMDataParallel( - model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) - elif device == 'cpu': + if device == 'cpu': + warnings.warn( + 'The argument `device` is deprecated. To use cpu to train, ' + 'please refers to https://mmclassification.readthedocs.io/en' + '/latest/getting_started.html#train-a-model') model = model.cpu() else: - raise ValueError(F'unsupported device name {device}.') + model = MMDataParallel(model, device_ids=cfg.gpu_ids) + if not model.device_ids: + from mmcv import digit_version, __version__ + assert digit_version(__version__) >= (1, 4, 4), \ + 'To train with CPU, please confirm your mmcv version ' \ + 'is not lower than v1.4.4' # build runner optimizer = build_optimizer(model, cfg.optimizer) diff --git a/mmcls/datasets/__init__.py b/mmcls/datasets/__init__.py index 64fd5ba5516..167fef5cf3d 100644 --- a/mmcls/datasets/__init__.py +++ b/mmcls/datasets/__init__.py @@ -4,7 +4,7 @@ build_dataset, build_sampler) from .cifar import CIFAR10, CIFAR100 from .dataset_wrappers import (ClassBalancedDataset, ConcatDataset, - RepeatDataset) + KFoldDataset, RepeatDataset) from .imagenet import ImageNet from .imagenet21k import ImageNet21k from .mnist import MNIST, FashionMNIST @@ -17,5 +17,5 @@ 'VOC', 'MultiLabelDataset', 'build_dataloader', 'build_dataset', 'DistributedSampler', 'ConcatDataset', 'RepeatDataset', 'ClassBalancedDataset', 'DATASETS', 'PIPELINES', 'ImageNet21k', 'SAMPLERS', - 'build_sampler', 'RepeatAugSampler' + 'build_sampler', 'RepeatAugSampler', 'KFoldDataset' ] diff --git a/mmcls/datasets/base_dataset.py b/mmcls/datasets/base_dataset.py index 3c9edf15b2e..7a2f310925a 100644 --- a/mmcls/datasets/base_dataset.py +++ b/mmcls/datasets/base_dataset.py @@ -118,6 +118,7 @@ def evaluate(self, results, metric='accuracy', metric_options=None, + indices=None, logger=None): """Evaluate the dataset. @@ -128,6 +129,8 @@ def evaluate(self, metric_options (dict, optional): Options for calculating metrics. Allowed keys are 'topk', 'thrs' and 'average_mode'. Defaults to None. + indices (list, optional): The indices of samples corresponding to + the results. Defaults to None. logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Defaults to None. 
Returns: @@ -145,6 +148,8 @@ def evaluate(self, eval_results = {} results = np.vstack(results) gt_labels = self.get_gt_labels() + if indices is not None: + gt_labels = gt_labels[indices] num_imgs = len(results) assert len(gt_labels) == num_imgs, 'dataset testing results should '\ 'be of the same length as gt_labels.' diff --git a/mmcls/datasets/builder.py b/mmcls/datasets/builder.py index cae66fa9937..544f64d7d8e 100644 --- a/mmcls/datasets/builder.py +++ b/mmcls/datasets/builder.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import copy import platform import random from functools import partial @@ -25,7 +26,7 @@ def build_dataset(cfg, default_args=None): from .dataset_wrappers import (ConcatDataset, RepeatDataset, - ClassBalancedDataset) + ClassBalancedDataset, KFoldDataset) if isinstance(cfg, (list, tuple)): dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg]) elif cfg['type'] == 'RepeatDataset': @@ -34,6 +35,13 @@ def build_dataset(cfg, default_args=None): elif cfg['type'] == 'ClassBalancedDataset': dataset = ClassBalancedDataset( build_dataset(cfg['dataset'], default_args), cfg['oversample_thr']) + elif cfg['type'] == 'KFoldDataset': + cp_cfg = copy.deepcopy(cfg) + if cp_cfg.get('test_mode', None) is None: + cp_cfg['test_mode'] = (default_args or {}).pop('test_mode', False) + cp_cfg['dataset'] = build_dataset(cp_cfg['dataset'], default_args) + cp_cfg.pop('type') + dataset = KFoldDataset(**cp_cfg) else: dataset = build_from_cfg(cfg, DATASETS, default_args) diff --git a/mmcls/datasets/dataset_wrappers.py b/mmcls/datasets/dataset_wrappers.py index 68c234e2f27..745c8f149af 100644 --- a/mmcls/datasets/dataset_wrappers.py +++ b/mmcls/datasets/dataset_wrappers.py @@ -170,3 +170,56 @@ def __getitem__(self, idx): def __len__(self): return len(self.repeat_indices) + + +@DATASETS.register_module() +class KFoldDataset: + """A wrapper of dataset for K-Fold cross-validation. + + K-Fold cross-validation divides all the samples in groups of samples, + called folds, of almost equal sizes. And we use k-1 of folds to do training + and use the fold left to do validation. + + Args: + dataset (:obj:`CustomDataset`): The dataset to be divided. + fold (int): The fold used to do validation. Defaults to 0. + num_splits (int): The number of all folds. Defaults to 5. + test_mode (bool): Use the training dataset or validation dataset. + Defaults to False. + seed (int, optional): The seed to shuffle the dataset before splitting. + If None, not shuffle the dataset. Defaults to None. 
+ """ + + def __init__(self, + dataset, + fold=0, + num_splits=5, + test_mode=False, + seed=None): + self.dataset = dataset + self.CLASSES = dataset.CLASSES + self.test_mode = test_mode + self.num_splits = num_splits + + length = len(dataset) + indices = list(range(length)) + if isinstance(seed, int): + rng = np.random.default_rng(seed) + rng.shuffle(indices) + + test_start = length * fold // num_splits + test_end = length * (fold + 1) // num_splits + if test_mode: + self.indices = indices[test_start:test_end] + else: + self.indices = indices[:test_start] + indices[test_end:] + + def __getitem__(self, idx): + return self.dataset[self.indices[idx]] + + def __len__(self): + return len(self.indices) + + def evaluate(self, *args, **kwargs): + kwargs['indices'] = self.indices + return self.dataset.evaluate(*args, **kwargs) diff --git a/mmcls/datasets/multi_label.py b/mmcls/datasets/multi_label.py index 702493e3a48..d4d080535e7 100644 --- a/mmcls/datasets/multi_label.py +++ b/mmcls/datasets/multi_label.py @@ -28,6 +28,7 @@ def evaluate(self, results, metric='mAP', metric_options=None, + indices=None, logger=None, **deprecated_kwargs): """Evaluate the dataset. @@ -46,7 +47,7 @@ def evaluate(self, Returns: dict: evaluation results """ - if metric_options is None: + if metric_options is None or metric_options == {}: metric_options = {'thr': 0.5} if deprecated_kwargs != {}: @@ -62,6 +63,8 @@ def evaluate(self, eval_results = {} results = np.vstack(results) gt_labels = self.get_gt_labels() + if indices is not None: + gt_labels = gt_labels[indices] num_imgs = len(results) assert len(gt_labels) == num_imgs, 'dataset testing results should '\ 'be of the same length as gt_labels.' diff --git a/mmcls/models/backbones/__init__.py b/mmcls/models/backbones/__init__.py index faa7927f377..bc5e5b01f3f 100644 --- a/mmcls/models/backbones/__init__.py +++ b/mmcls/models/backbones/__init__.py @@ -1,7 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. from .alexnet import AlexNet from .conformer import Conformer +from .convnext import ConvNeXt from .deit import DistilledVisionTransformer +from .efficientnet import EfficientNet +from .hrnet import HRNet from .lenet import LeNet5 from .mlp_mixer import MlpMixer from .mobilenet_v2 import MobileNetV2 @@ -21,6 +24,7 @@ from .t2t_vit import T2T_ViT from .timm_backbone import TIMMBackbone from .tnt import TNT +from .twins import PCPVT, SVT from .vgg import VGG from .vision_transformer import VisionTransformer @@ -29,5 +33,6 @@ 'ResNeSt', 'ResNet_CIFAR', 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'MobileNetV2', 'MobileNetV3', 'VisionTransformer', 'SwinTransformer', 'TNT', 'TIMMBackbone', 'T2T_ViT', 'Res2Net', 'RepVGG', - 'Conformer', 'MlpMixer', 'DistilledVisionTransformer' + 'Conformer', 'MlpMixer', 'DistilledVisionTransformer', 'PCPVT', 'SVT', + 'EfficientNet', 'ConvNeXt', 'HRNet' ] diff --git a/mmcls/models/backbones/convnext.py b/mmcls/models/backbones/convnext.py new file mode 100644 index 00000000000..6d61ec95425 --- /dev/null +++ b/mmcls/models/backbones/convnext.py @@ -0,0 +1,331 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
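As a usage sketch for the `KFoldDataset` wrapper and the `build_dataset` branch added above — the dataset type, fold index, number of splits and seed below are illustrative values, not taken from this diff:

```python
# A minimal (hypothetical) config fragment wrapping a training set in KFoldDataset.
# `test_mode` is normally filled in by `build_dataset` through `default_args`,
# so only the split-related fields need to be set here.
train = dict(
    type='KFoldDataset',
    fold=0,          # fold 0 is held out for validation, the other folds are used for training
    num_splits=5,
    seed=1,          # shuffle the sample indices once before splitting
    dataset=dict(
        type='ImageNet',
        data_prefix='data/imagenet/train',
        pipeline=[]),  # training pipeline omitted in this sketch
)
```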
+from functools import partial +from itertools import chain +from typing import Sequence + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn.bricks import (NORM_LAYERS, DropPath, build_activation_layer, + build_norm_layer) +from mmcv.runner import BaseModule +from mmcv.runner.base_module import ModuleList, Sequential + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +@NORM_LAYERS.register_module('LN2d') +class LayerNorm2d(nn.LayerNorm): + """LayerNorm on channels for 2d images. + + Args: + num_channels (int): The number of channels of the input tensor. + eps (float): a value added to the denominator for numerical stability. + Defaults to 1e-5. + elementwise_affine (bool): a boolean value that when set to ``True``, + this module has learnable per-element affine parameters initialized + to ones (for weights) and zeros (for biases). Defaults to True. + """ + + def __init__(self, num_channels: int, **kwargs) -> None: + super().__init__(num_channels, **kwargs) + self.num_channels = self.normalized_shape[0] + + def forward(self, x): + assert x.dim() == 4, 'LayerNorm2d only supports inputs with shape ' \ + f'(N, C, H, W), but got tensor with shape {x.shape}' + return F.layer_norm( + x.permute(0, 2, 3, 1), self.normalized_shape, self.weight, + self.bias, self.eps).permute(0, 3, 1, 2) + + +class ConvNeXtBlock(BaseModule): + """ConvNeXt Block. + + Args: + in_channels (int): The number of input channels. + norm_cfg (dict): The config dict for norm layers. + Defaults to ``dict(type='LN2d', eps=1e-6)``. + act_cfg (dict): The config dict for activation between pointwise + convolution. Defaults to ``dict(type='GELU')``. + mlp_ratio (float): The expansion ratio in both pointwise convolution. + Defaults to 4. + linear_pw_conv (bool): Whether to use linear layer to do pointwise + convolution. More details can be found in the note. + Defaults to True. + drop_path_rate (float): Stochastic depth rate. Defaults to 0. + layer_scale_init_value (float): Init value for Layer Scale. + Defaults to 1e-6. + + Note: + There are two equivalent implementations: + + 1. DwConv -> LayerNorm -> 1x1 Conv -> GELU -> 1x1 Conv; + all outputs are in (N, C, H, W). + 2. DwConv -> LayerNorm -> Permute to (N, H, W, C) -> Linear -> GELU + -> Linear; Permute back + + As default, we use the second to align with the official repository. + And it may be slightly faster. + """ + + def __init__(self, + in_channels, + norm_cfg=dict(type='LN2d', eps=1e-6), + act_cfg=dict(type='GELU'), + mlp_ratio=4., + linear_pw_conv=True, + drop_path_rate=0., + layer_scale_init_value=1e-6): + super().__init__() + self.depthwise_conv = nn.Conv2d( + in_channels, + in_channels, + kernel_size=7, + padding=3, + groups=in_channels) + + self.linear_pw_conv = linear_pw_conv + self.norm = build_norm_layer(norm_cfg, in_channels)[1] + + mid_channels = int(mlp_ratio * in_channels) + if self.linear_pw_conv: + # Use linear layer to do pointwise conv. + pw_conv = nn.Linear + else: + pw_conv = partial(nn.Conv2d, kernel_size=1) + + self.pointwise_conv1 = pw_conv(in_channels, mid_channels) + self.act = build_activation_layer(act_cfg) + self.pointwise_conv2 = pw_conv(mid_channels, in_channels) + + self.gamma = nn.Parameter( + layer_scale_init_value * torch.ones((in_channels)), + requires_grad=True) if layer_scale_init_value > 0 else None + + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate > 0. 
else nn.Identity() + + def forward(self, x): + shortcut = x + x = self.depthwise_conv(x) + x = self.norm(x) + + if self.linear_pw_conv: + x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) + + x = self.pointwise_conv1(x) + x = self.act(x) + x = self.pointwise_conv2(x) + + if self.linear_pw_conv: + x = x.permute(0, 3, 1, 2) # permute back + + if self.gamma is not None: + x = x.mul(self.gamma.view(1, -1, 1, 1)) + + x = shortcut + self.drop_path(x) + return x + + +@BACKBONES.register_module() +class ConvNeXt(BaseBackbone): + """ConvNeXt. + + A PyTorch implementation of : `A ConvNet for the 2020s + `_ + + Modified from the `official repo + `_ + and `timm + `_. + + Args: + arch (str | dict): The model's architecture. If string, it should be + one of architecture in ``ConvNeXt.arch_settings``. And if dict, it + should include the following two keys: + + - depths (list[int]): Number of blocks at each stage. + - channels (list[int]): The number of channels at each stage. + + Defaults to 'tiny'. + in_channels (int): Number of input image channels. Defaults to 3. + stem_patch_size (int): The size of one patch in the stem layer. + Defaults to 4. + norm_cfg (dict): The config dict for norm layers. + Defaults to ``dict(type='LN2d', eps=1e-6)``. + act_cfg (dict): The config dict for activation between pointwise + convolution. Defaults to ``dict(type='GELU')``. + linear_pw_conv (bool): Whether to use linear layer to do pointwise + convolution. Defaults to True. + drop_path_rate (float): Stochastic depth rate. Defaults to 0. + layer_scale_init_value (float): Init value for Layer Scale. + Defaults to 1e-6. + out_indices (Sequence | int): Output from which stages. + Defaults to -1, means the last stage. + frozen_stages (int): Stages to be frozen (all param fixed). + Defaults to 0, which means not freezing any parameters. + gap_before_final_norm (bool): Whether to globally average the feature + map before the final norm layer. In the official repo, it's only + used in classification task. Defaults to True. + init_cfg (dict, optional): Initialization config dict + """ # noqa: E501 + arch_settings = { + 'tiny': { + 'depths': [3, 3, 9, 3], + 'channels': [96, 192, 384, 768] + }, + 'small': { + 'depths': [3, 3, 27, 3], + 'channels': [96, 192, 384, 768] + }, + 'base': { + 'depths': [3, 3, 27, 3], + 'channels': [128, 256, 512, 1024] + }, + 'large': { + 'depths': [3, 3, 27, 3], + 'channels': [192, 384, 768, 1536] + }, + 'xlarge': { + 'depths': [3, 3, 27, 3], + 'channels': [256, 512, 1024, 2048] + }, + } + + def __init__(self, + arch='tiny', + in_channels=3, + stem_patch_size=4, + norm_cfg=dict(type='LN2d', eps=1e-6), + act_cfg=dict(type='GELU'), + linear_pw_conv=True, + drop_path_rate=0., + layer_scale_init_value=1e-6, + out_indices=-1, + frozen_stages=0, + gap_before_final_norm=True, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + if isinstance(arch, str): + assert arch in self.arch_settings, \ + f'Unavailable arch, please choose from ' \ + f'({set(self.arch_settings)}) or pass a dict.' + arch = self.arch_settings[arch] + elif isinstance(arch, dict): + assert 'depths' in arch and 'channels' in arch, \ + f'The arch dict must have "depths" and "channels", ' \ + f'but got {list(arch.keys())}.' 
+ + self.depths = arch['depths'] + self.channels = arch['channels'] + assert (isinstance(self.depths, Sequence) + and isinstance(self.channels, Sequence) + and len(self.depths) == len(self.channels)), \ + f'The "depths" ({self.depths}) and "channels" ({self.channels}) ' \ + 'should be both sequence with the same length.' + + self.num_stages = len(self.depths) + + if isinstance(out_indices, int): + out_indices = [out_indices] + assert isinstance(out_indices, Sequence), \ + f'"out_indices" must by a sequence or int, ' \ + f'get {type(out_indices)} instead.' + for i, index in enumerate(out_indices): + if index < 0: + out_indices[i] = 4 + index + assert out_indices[i] >= 0, f'Invalid out_indices {index}' + self.out_indices = out_indices + + self.frozen_stages = frozen_stages + self.gap_before_final_norm = gap_before_final_norm + + # stochastic depth decay rule + dpr = [ + x.item() + for x in torch.linspace(0, drop_path_rate, sum(self.depths)) + ] + block_idx = 0 + + # 4 downsample layers between stages, including the stem layer. + self.downsample_layers = ModuleList() + stem = nn.Sequential( + nn.Conv2d( + in_channels, + self.channels[0], + kernel_size=stem_patch_size, + stride=stem_patch_size), + build_norm_layer(norm_cfg, self.channels[0])[1], + ) + self.downsample_layers.append(stem) + + # 4 feature resolution stages, each consisting of multiple residual + # blocks + self.stages = nn.ModuleList() + + for i in range(self.num_stages): + depth = self.depths[i] + channels = self.channels[i] + + if i >= 1: + downsample_layer = nn.Sequential( + LayerNorm2d(self.channels[i - 1]), + nn.Conv2d( + self.channels[i - 1], + channels, + kernel_size=2, + stride=2), + ) + self.downsample_layers.append(downsample_layer) + + stage = Sequential(*[ + ConvNeXtBlock( + in_channels=channels, + drop_path_rate=dpr[block_idx + j], + norm_cfg=norm_cfg, + act_cfg=act_cfg, + linear_pw_conv=linear_pw_conv, + layer_scale_init_value=layer_scale_init_value) + for j in range(depth) + ]) + block_idx += depth + + self.stages.append(stage) + + if i in self.out_indices: + norm_layer = build_norm_layer(norm_cfg, channels)[1] + self.add_module(f'norm{i}', norm_layer) + + self._freeze_stages() + + def forward(self, x): + outs = [] + for i, stage in enumerate(self.stages): + x = self.downsample_layers[i](x) + x = stage(x) + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + if self.gap_before_final_norm: + gap = x.mean([-2, -1], keepdim=True) + outs.append(norm_layer(gap).flatten(1)) + else: + outs.append(norm_layer(x)) + + return tuple(outs) + + def _freeze_stages(self): + for i in range(self.frozen_stages): + downsample_layer = self.downsample_layers[i] + stage = self.stages[i] + downsample_layer.eval() + stage.eval() + for param in chain(downsample_layer.parameters(), + stage.parameters()): + param.requires_grad = False + + def train(self, mode=True): + super(ConvNeXt, self).train(mode) + self._freeze_stages() diff --git a/mmcls/models/backbones/efficientnet.py b/mmcls/models/backbones/efficientnet.py new file mode 100644 index 00000000000..ede2c184e14 --- /dev/null +++ b/mmcls/models/backbones/efficientnet.py @@ -0,0 +1,407 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
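As a small sanity check for the `ConvNeXt` backbone defined above — a sketch assuming the class is importable from `mmcls.models.backbones` as registered in the updated `__init__.py`; the expected shape follows from the 'tiny' arch settings and `gap_before_final_norm=True`:

```python
import torch

from mmcls.models.backbones import ConvNeXt

# 'tiny' arch: depths [3, 3, 9, 3], channels [96, 192, 384, 768].
model = ConvNeXt(arch='tiny', out_indices=-1, gap_before_final_norm=True)
model.eval()

with torch.no_grad():
    feats = model(torch.randn(1, 3, 224, 224))

# With gap_before_final_norm=True, the last stage is globally average pooled and
# flattened, so the single output is a (N, 768) feature vector.
assert feats[-1].shape == (1, 768)
```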
+import copy +import math +from functools import partial + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn.bricks import ConvModule, DropPath +from mmcv.runner import BaseModule, Sequential + +from mmcls.models.backbones.base_backbone import BaseBackbone +from mmcls.models.utils import InvertedResidual, SELayer, make_divisible +from ..builder import BACKBONES + + +class EdgeResidual(BaseModule): + """Edge Residual Block. + + Args: + in_channels (int): The input channels of this module. + out_channels (int): The output channels of this module. + mid_channels (int): The input channels of the second convolution. + kernel_size (int): The kernel size of the first convolution. + Defaults to 3. + stride (int): The stride of the first convolution. Defaults to 1. + se_cfg (dict, optional): Config dict for se layer. Defaults to None, + which means no se layer. + with_residual (bool): Use residual connection. Defaults to True. + conv_cfg (dict, optional): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to ``dict(type='BN')``. + act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='ReLU')``. + drop_path_rate (float): stochastic depth rate. Defaults to 0. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + init_cfg (dict | list[dict], optional): Initialization config dict. + """ + + def __init__(self, + in_channels, + out_channels, + mid_channels, + kernel_size=3, + stride=1, + se_cfg=None, + with_residual=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + drop_path_rate=0., + with_cp=False, + init_cfg=None): + super(EdgeResidual, self).__init__(init_cfg=init_cfg) + assert stride in [1, 2] + self.with_cp = with_cp + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate > 0 else nn.Identity() + self.with_se = se_cfg is not None + self.with_residual = ( + stride == 1 and in_channels == out_channels and with_residual) + + if self.with_se: + assert isinstance(se_cfg, dict) + + self.conv1 = ConvModule( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=1, + padding=kernel_size // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + if self.with_se: + self.se = SELayer(**se_cfg) + + self.conv2 = ConvModule( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, x): + + def _inner_forward(x): + out = x + out = self.conv1(out) + + if self.with_se: + out = self.se(out) + + out = self.conv2(out) + + if self.with_residual: + return x + self.drop_path(out) + else: + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +def model_scaling(layer_setting, arch_setting): + """Scaling operation to the layer's parameters according to the + arch_setting.""" + # scale width + new_layer_setting = copy.deepcopy(layer_setting) + for layer_cfg in new_layer_setting: + for block_cfg in layer_cfg: + block_cfg[1] = make_divisible(block_cfg[1] * arch_setting[0], 8) + + # scale depth + split_layer_setting = [new_layer_setting[0]] + for layer_cfg in new_layer_setting[1:-1]: + tmp_index = [0] + for i in range(len(layer_cfg) - 1): + if layer_cfg[i + 1][1] != layer_cfg[i][1]: 
+ tmp_index.append(i + 1) + tmp_index.append(len(layer_cfg)) + for i in range(len(tmp_index) - 1): + split_layer_setting.append(layer_cfg[tmp_index[i]:tmp_index[i + + 1]]) + split_layer_setting.append(new_layer_setting[-1]) + + num_of_layers = [len(layer_cfg) for layer_cfg in split_layer_setting[1:-1]] + new_layers = [ + int(math.ceil(arch_setting[1] * num)) for num in num_of_layers + ] + + merge_layer_setting = [split_layer_setting[0]] + for i, layer_cfg in enumerate(split_layer_setting[1:-1]): + if new_layers[i] <= num_of_layers[i]: + tmp_layer_cfg = layer_cfg[:new_layers[i]] + else: + tmp_layer_cfg = copy.deepcopy(layer_cfg) + [layer_cfg[-1]] * ( + new_layers[i] - num_of_layers[i]) + if tmp_layer_cfg[0][3] == 1 and i != 0: + merge_layer_setting[-1] += tmp_layer_cfg.copy() + else: + merge_layer_setting.append(tmp_layer_cfg.copy()) + merge_layer_setting.append(split_layer_setting[-1]) + + return merge_layer_setting + + +@BACKBONES.register_module() +class EfficientNet(BaseBackbone): + """EfficientNet backbone. + + Args: + arch (str): Architecture of efficientnet. Defaults to b0. + out_indices (Sequence[int]): Output from which stages. + Defaults to (6, ). + frozen_stages (int): Stages to be frozen (all param fixed). + Defaults to 0, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='Swish'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + """ + + # Parameters to build layers. + # 'b' represents the architecture of normal EfficientNet family includes + # 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'b8'. + # 'e' represents the architecture of EfficientNet-EdgeTPU including 'es', + # 'em', 'el'. + # 6 parameters are needed to construct a layer, From left to right: + # - kernel_size: The kernel size of the block + # - out_channel: The number of out_channels of the block + # - se_ratio: The sequeeze ratio of SELayer. 
+ # - stride: The stride of the block + # - expand_ratio: The expand_ratio of the mid_channels + # - block_type: -1: Not a block, 0: InvertedResidual, 1: EdgeResidual + layer_settings = { + 'b': [[[3, 32, 0, 2, 0, -1]], + [[3, 16, 4, 1, 1, 0]], + [[3, 24, 4, 2, 6, 0], + [3, 24, 4, 1, 6, 0]], + [[5, 40, 4, 2, 6, 0], + [5, 40, 4, 1, 6, 0]], + [[3, 80, 4, 2, 6, 0], + [3, 80, 4, 1, 6, 0], + [3, 80, 4, 1, 6, 0], + [5, 112, 4, 1, 6, 0], + [5, 112, 4, 1, 6, 0], + [5, 112, 4, 1, 6, 0]], + [[5, 192, 4, 2, 6, 0], + [5, 192, 4, 1, 6, 0], + [5, 192, 4, 1, 6, 0], + [5, 192, 4, 1, 6, 0], + [3, 320, 4, 1, 6, 0]], + [[1, 1280, 0, 1, 0, -1]] + ], + 'e': [[[3, 32, 0, 2, 0, -1]], + [[3, 24, 0, 1, 3, 1]], + [[3, 32, 0, 2, 8, 1], + [3, 32, 0, 1, 8, 1]], + [[3, 48, 0, 2, 8, 1], + [3, 48, 0, 1, 8, 1], + [3, 48, 0, 1, 8, 1], + [3, 48, 0, 1, 8, 1]], + [[5, 96, 0, 2, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0]], + [[5, 192, 0, 2, 8, 0], + [5, 192, 0, 1, 8, 0]], + [[1, 1280, 0, 1, 0, -1]] + ] + } # yapf: disable + + # Parameters to build different kinds of architecture. + # From left to right: scaling factor for width, scaling factor for depth, + # resolution. + arch_settings = { + 'b0': (1.0, 1.0, 224), + 'b1': (1.0, 1.1, 240), + 'b2': (1.1, 1.2, 260), + 'b3': (1.2, 1.4, 300), + 'b4': (1.4, 1.8, 380), + 'b5': (1.6, 2.2, 456), + 'b6': (1.8, 2.6, 528), + 'b7': (2.0, 3.1, 600), + 'b8': (2.2, 3.6, 672), + 'es': (1.0, 1.0, 224), + 'em': (1.0, 1.1, 240), + 'el': (1.2, 1.4, 300) + } + + def __init__(self, + arch='b0', + drop_path_rate=0., + out_indices=(6, ), + frozen_stages=0, + conv_cfg=dict(type='Conv2dAdaptivePadding'), + norm_cfg=dict(type='BN', eps=1e-3), + act_cfg=dict(type='Swish'), + norm_eval=False, + with_cp=False, + init_cfg=[ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + layer=['_BatchNorm', 'GroupNorm'], + val=1) + ]): + super(EfficientNet, self).__init__(init_cfg) + assert arch in self.arch_settings, \ + f'"{arch}" is not one of the arch_settings ' \ + f'({", ".join(self.arch_settings.keys())})' + self.arch_setting = self.arch_settings[arch] + self.layer_setting = self.layer_settings[arch[:1]] + for index in out_indices: + if index not in range(0, len(self.layer_setting)): + raise ValueError('the item in out_indices must in ' + f'range(0, {len(self.layer_setting)}). ' + f'But received {index}') + + if frozen_stages not in range(len(self.layer_setting) + 1): + raise ValueError('frozen_stages must be in range(0, ' + f'{len(self.layer_setting) + 1}). 
' + f'But received {frozen_stages}') + self.drop_path_rate = drop_path_rate + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.layer_setting = model_scaling(self.layer_setting, + self.arch_setting) + block_cfg_0 = self.layer_setting[0][0] + block_cfg_last = self.layer_setting[-1][0] + self.in_channels = make_divisible(block_cfg_0[1], 8) + self.out_channels = block_cfg_last[1] + self.layers = nn.ModuleList() + self.layers.append( + ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=block_cfg_0[0], + stride=block_cfg_0[3], + padding=block_cfg_0[0] // 2, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.make_layer() + self.layers.append( + ConvModule( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=block_cfg_last[0], + stride=block_cfg_last[3], + padding=block_cfg_last[0] // 2, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def make_layer(self): + # Without the first and the final conv block. + layer_setting = self.layer_setting[1:-1] + + total_num_blocks = sum([len(x) for x in layer_setting]) + block_idx = 0 + dpr = [ + x.item() + for x in torch.linspace(0, self.drop_path_rate, total_num_blocks) + ] # stochastic depth decay rule + + for layer_cfg in layer_setting: + layer = [] + for i, block_cfg in enumerate(layer_cfg): + (kernel_size, out_channels, se_ratio, stride, expand_ratio, + block_type) = block_cfg + + mid_channels = int(self.in_channels * expand_ratio) + out_channels = make_divisible(out_channels, 8) + if se_ratio <= 0: + se_cfg = None + else: + se_cfg = dict( + channels=mid_channels, + ratio=expand_ratio * se_ratio, + divisor=1, + act_cfg=(self.act_cfg, dict(type='Sigmoid'))) + if block_type == 1: # edge tpu + if i > 0 and expand_ratio == 3: + with_residual = False + expand_ratio = 4 + else: + with_residual = True + mid_channels = int(self.in_channels * expand_ratio) + if se_cfg is not None: + se_cfg = dict( + channels=mid_channels, + ratio=se_ratio * expand_ratio, + divisor=1, + act_cfg=(self.act_cfg, dict(type='Sigmoid'))) + block = partial(EdgeResidual, with_residual=with_residual) + else: + block = InvertedResidual + layer.append( + block( + in_channels=self.in_channels, + out_channels=out_channels, + mid_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + se_cfg=se_cfg, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + drop_path_rate=dpr[block_idx], + with_cp=self.with_cp)) + self.in_channels = out_channels + block_idx += 1 + self.layers.append(Sequential(*layer)) + + def forward(self, x): + outs = [] + for i, layer in enumerate(self.layers): + x = layer(x) + if i in self.out_indices: + outs.append(x) + + return tuple(outs) + + def _freeze_stages(self): + for i in range(self.frozen_stages): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super(EfficientNet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() diff --git a/mmcls/models/backbones/hrnet.py b/mmcls/models/backbones/hrnet.py new file mode 100644 index 00000000000..57baf0cae74 --- /dev/null +++ b/mmcls/models/backbones/hrnet.py @@ -0,0 +1,563 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch.nn as nn +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmcv.runner import BaseModule, ModuleList, Sequential +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from .resnet import BasicBlock, Bottleneck, ResLayer, get_expansion + + +class HRModule(BaseModule): + """High-Resolution Module for HRNet. + + In this module, every branch has 4 BasicBlocks/Bottlenecks. Fusion/Exchange + is in this module. + + Args: + num_branches (int): The number of branches. + block (``BaseModule``): Convolution block module. + num_blocks (tuple): The number of blocks in each branch. + The length must be equal to ``num_branches``. + num_channels (tuple): The number of base channels in each branch. + The length must be equal to ``num_branches``. + multiscale_output (bool): Whether to output multi-level features + produced by multiple branches. If False, only the first level + feature will be output. Defaults to True. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + conv_cfg (dict, optional): Dictionary to construct and config conv + layer. Defaults to None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to ``dict(type='BN')``. + block_init_cfg (dict, optional): The initialization configs of every + blocks. Defaults to None. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + num_branches, + block, + num_blocks, + in_channels, + num_channels, + multiscale_output=True, + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + block_init_cfg=None, + init_cfg=None): + super(HRModule, self).__init__(init_cfg) + self.block_init_cfg = block_init_cfg + self._check_branches(num_branches, num_blocks, in_channels, + num_channels) + + self.in_channels = in_channels + self.num_branches = num_branches + + self.multiscale_output = multiscale_output + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.with_cp = with_cp + self.branches = self._make_branches(num_branches, block, num_blocks, + num_channels) + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU(inplace=False) + + def _check_branches(self, num_branches, num_blocks, in_channels, + num_channels): + if num_branches != len(num_blocks): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_BLOCKS({len(num_blocks)})' + raise ValueError(error_msg) + + if num_branches != len(num_channels): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_CHANNELS({len(num_channels)})' + raise ValueError(error_msg) + + if num_branches != len(in_channels): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_INCHANNELS({len(in_channels)})' + raise ValueError(error_msg) + + def _make_branches(self, num_branches, block, num_blocks, num_channels): + branches = [] + + for i in range(num_branches): + out_channels = num_channels[i] * get_expansion(block) + branches.append( + ResLayer( + block=block, + num_blocks=num_blocks[i], + in_channels=self.in_channels[i], + out_channels=out_channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + with_cp=self.with_cp, + init_cfg=self.block_init_cfg, + )) + + return ModuleList(branches) + + def _make_fuse_layers(self): + if self.num_branches == 1: + return None + + num_branches = self.num_branches + in_channels = self.in_channels + fuse_layers = [] + num_out_branches = num_branches if self.multiscale_output else 1 + for i in 
range(num_out_branches): + fuse_layer = [] + for j in range(num_branches): + if j > i: + # Upsample the feature maps of smaller scales. + fuse_layer.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, in_channels[i])[1], + nn.Upsample( + scale_factor=2**(j - i), mode='nearest'))) + elif j == i: + # Keep the feature map with the same scale. + fuse_layer.append(None) + else: + # Downsample the feature maps of larger scales. + conv_downsamples = [] + for k in range(i - j): + # Use stacked convolution layers to downsample. + if k == i - j - 1: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[i])[1])) + else: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + nn.ReLU(inplace=False))) + fuse_layer.append(nn.Sequential(*conv_downsamples)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def forward(self, x): + """Forward function.""" + if self.num_branches == 1: + return [self.branches[0](x[0])] + + for i in range(self.num_branches): + x[i] = self.branches[i](x[i]) + + x_fuse = [] + for i in range(len(self.fuse_layers)): + y = 0 + for j in range(self.num_branches): + if i == j: + y += x[j] + else: + y += self.fuse_layers[i][j](x[j]) + x_fuse.append(self.relu(y)) + return x_fuse + + +@BACKBONES.register_module() +class HRNet(BaseModule): + """HRNet backbone. + + `High-Resolution Representations for Labeling Pixels and Regions + `_. + + Args: + arch (str): The preset HRNet architecture, includes 'w18', 'w30', + 'w32', 'w40', 'w44', 'w48', 'w64'. It will only be used if + extra is ``None``. Defaults to 'w32'. + extra (dict, optional): Detailed configuration for each stage of HRNet. + There must be 4 stages, the configuration for each stage must have + 5 keys: + + - num_modules (int): The number of HRModule in this stage. + - num_branches (int): The number of branches in the HRModule. + - block (str): The type of convolution block. Please choose between + 'BOTTLENECK' and 'BASIC'. + - num_blocks (tuple): The number of blocks in each branch. + The length must be equal to num_branches. + - num_channels (tuple): The number of base channels in each branch. + The length must be equal to num_branches. + + Defaults to None. + in_channels (int): Number of input image channels. Defaults to 3. + conv_cfg (dict, optional): Dictionary to construct and config conv + layer. Defaults to None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Defaults to ``dict(type='BN')``. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Defaults to False. + multiscale_output (bool): Whether to output multi-level features + produced by multiple branches. 
If False, only the first level + feature will be output. Defaults to True. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + + Example: + >>> import torch + >>> from mmcls.models import HRNet + >>> extra = dict( + >>> stage1=dict( + >>> num_modules=1, + >>> num_branches=1, + >>> block='BOTTLENECK', + >>> num_blocks=(4, ), + >>> num_channels=(64, )), + >>> stage2=dict( + >>> num_modules=1, + >>> num_branches=2, + >>> block='BASIC', + >>> num_blocks=(4, 4), + >>> num_channels=(32, 64)), + >>> stage3=dict( + >>> num_modules=4, + >>> num_branches=3, + >>> block='BASIC', + >>> num_blocks=(4, 4, 4), + >>> num_channels=(32, 64, 128)), + >>> stage4=dict( + >>> num_modules=3, + >>> num_branches=4, + >>> block='BASIC', + >>> num_blocks=(4, 4, 4, 4), + >>> num_channels=(32, 64, 128, 256))) + >>> self = HRNet(extra, in_channels=1) + >>> self.eval() + >>> inputs = torch.rand(1, 1, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 32, 8, 8) + (1, 64, 4, 4) + (1, 128, 2, 2) + (1, 256, 1, 1) + """ + + blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck} + arch_zoo = { + # num_modules, num_branches, block, num_blocks, num_channels + 'w18': [[1, 1, 'BOTTLENECK', (4, ), (64, )], + [1, 2, 'BASIC', (4, 4), (18, 36)], + [4, 3, 'BASIC', (4, 4, 4), (18, 36, 72)], + [3, 4, 'BASIC', (4, 4, 4, 4), (18, 36, 72, 144)]], + 'w30': [[1, 1, 'BOTTLENECK', (4, ), (64, )], + [1, 2, 'BASIC', (4, 4), (30, 60)], + [4, 3, 'BASIC', (4, 4, 4), (30, 60, 120)], + [3, 4, 'BASIC', (4, 4, 4, 4), (30, 60, 120, 240)]], + 'w32': [[1, 1, 'BOTTLENECK', (4, ), (64, )], + [1, 2, 'BASIC', (4, 4), (32, 64)], + [4, 3, 'BASIC', (4, 4, 4), (32, 64, 128)], + [3, 4, 'BASIC', (4, 4, 4, 4), (32, 64, 128, 256)]], + 'w40': [[1, 1, 'BOTTLENECK', (4, ), (64, )], + [1, 2, 'BASIC', (4, 4), (40, 80)], + [4, 3, 'BASIC', (4, 4, 4), (40, 80, 160)], + [3, 4, 'BASIC', (4, 4, 4, 4), (40, 80, 160, 320)]], + 'w44': [[1, 1, 'BOTTLENECK', (4, ), (64, )], + [1, 2, 'BASIC', (4, 4), (44, 88)], + [4, 3, 'BASIC', (4, 4, 4), (44, 88, 176)], + [3, 4, 'BASIC', (4, 4, 4, 4), (44, 88, 176, 352)]], + 'w48': [[1, 1, 'BOTTLENECK', (4, ), (64, )], + [1, 2, 'BASIC', (4, 4), (48, 96)], + [4, 3, 'BASIC', (4, 4, 4), (48, 96, 192)], + [3, 4, 'BASIC', (4, 4, 4, 4), (48, 96, 192, 384)]], + 'w64': [[1, 1, 'BOTTLENECK', (4, ), (64, )], + [1, 2, 'BASIC', (4, 4), (64, 128)], + [4, 3, 'BASIC', (4, 4, 4), (64, 128, 256)], + [3, 4, 'BASIC', (4, 4, 4, 4), (64, 128, 256, 512)]], + } # yapf:disable + + def __init__(self, + arch='w32', + extra=None, + in_channels=3, + conv_cfg=None, + norm_cfg=dict(type='BN'), + norm_eval=False, + with_cp=False, + zero_init_residual=False, + multiscale_output=True, + init_cfg=[ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ]): + super(HRNet, self).__init__(init_cfg) + + extra = self.parse_arch(arch, extra) + + # Assert configurations of 4 stages are in extra + for i in range(1, 5): + assert f'stage{i}' in extra, f'Missing stage{i} config in "extra".' 
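+            # For illustration only (not in the original patch), a stage config
+            # follows the pattern from the class docstring, e.g.
+            #   extra['stage2'] = dict(num_modules=1, num_branches=2,
+            #                          block='BASIC', num_blocks=(4, 4),
+            #                          num_channels=(32, 64))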
+ # Assert whether the length of `num_blocks` and `num_channels` are + # equal to `num_branches` + cfg = extra[f'stage{i}'] + assert len(cfg['num_blocks']) == cfg['num_branches'] and \ + len(cfg['num_channels']) == cfg['num_branches'] + + self.extra = extra + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + self.zero_init_residual = zero_init_residual + + # -------------------- stem net -------------------- + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + out_channels=64, + kernel_size=3, + stride=2, + padding=1, + bias=False) + + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) + self.add_module(self.norm1_name, norm1) + + self.conv2 = build_conv_layer( + self.conv_cfg, + in_channels=64, + out_channels=64, + kernel_size=3, + stride=2, + padding=1, + bias=False) + + self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) + self.add_module(self.norm2_name, norm2) + self.relu = nn.ReLU(inplace=True) + + # -------------------- stage 1 -------------------- + self.stage1_cfg = self.extra['stage1'] + base_channels = self.stage1_cfg['num_channels'] + block_type = self.stage1_cfg['block'] + num_blocks = self.stage1_cfg['num_blocks'] + + block = self.blocks_dict[block_type] + num_channels = [ + channel * get_expansion(block) for channel in base_channels + ] + # To align with the original code, use layer1 instead of stage1 here. + self.layer1 = ResLayer( + block, + in_channels=64, + out_channels=num_channels[0], + num_blocks=num_blocks[0]) + pre_num_channels = num_channels + + # -------------------- stage 2~4 -------------------- + for i in range(2, 5): + stage_cfg = self.extra[f'stage{i}'] + base_channels = stage_cfg['num_channels'] + block = self.blocks_dict[stage_cfg['block']] + multiscale_output_ = multiscale_output if i == 4 else True + + num_channels = [ + channel * get_expansion(block) for channel in base_channels + ] + # The transition layer from layer1 to stage2 + transition = self._make_transition_layer(pre_num_channels, + num_channels) + self.add_module(f'transition{i-1}', transition) + stage = self._make_stage( + stage_cfg, num_channels, multiscale_output=multiscale_output_) + self.add_module(f'stage{i}', stage) + + pre_num_channels = num_channels + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: the normalization layer named "norm2" """ + return getattr(self, self.norm2_name) + + def _make_transition_layer(self, num_channels_pre_layer, + num_channels_cur_layer): + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + # For existing scale branches, + # add conv block when the channels are not the same. + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + num_channels_pre_layer[i], + num_channels_cur_layer[i], + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + num_channels_cur_layer[i])[1], + nn.ReLU(inplace=True))) + else: + transition_layers.append(nn.Identity()) + else: + # For new scale branches, add stacked downsample conv blocks. + # For example, num_branches_pre = 2, for the 4th branch, add + # stacked two downsample conv blocks. 
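+                # Note (added for clarity): only the last conv in the stack
+                # changes the channel number to that of the new branch; the
+                # earlier convs keep the channels of the previous last branch.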
+ conv_downsamples = [] + for j in range(i + 1 - num_branches_pre): + in_channels = num_channels_pre_layer[-1] + out_channels = num_channels_cur_layer[i] \ + if j == i - num_branches_pre else in_channels + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, out_channels)[1], + nn.ReLU(inplace=True))) + transition_layers.append(nn.Sequential(*conv_downsamples)) + + return nn.ModuleList(transition_layers) + + def _make_stage(self, layer_config, in_channels, multiscale_output=True): + num_modules = layer_config['num_modules'] + num_branches = layer_config['num_branches'] + num_blocks = layer_config['num_blocks'] + num_channels = layer_config['num_channels'] + block = self.blocks_dict[layer_config['block']] + + hr_modules = [] + block_init_cfg = None + if self.zero_init_residual: + if block is BasicBlock: + block_init_cfg = dict( + type='Constant', val=0, override=dict(name='norm2')) + elif block is Bottleneck: + block_init_cfg = dict( + type='Constant', val=0, override=dict(name='norm3')) + + for i in range(num_modules): + # multi_scale_output is only used for the last module + if not multiscale_output and i == num_modules - 1: + reset_multiscale_output = False + else: + reset_multiscale_output = True + + hr_modules.append( + HRModule( + num_branches, + block, + num_blocks, + in_channels, + num_channels, + reset_multiscale_output, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + block_init_cfg=block_init_cfg)) + + return Sequential(*hr_modules) + + def forward(self, x): + """Forward function.""" + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.norm2(x) + x = self.relu(x) + x = self.layer1(x) + + x_list = [x] + + for i in range(2, 5): + # Apply transition + transition = getattr(self, f'transition{i-1}') + inputs = [] + for j, layer in enumerate(transition): + if j < len(x_list): + inputs.append(layer(x_list[j])) + else: + inputs.append(layer(x_list[-1])) + # Forward HRModule + stage = getattr(self, f'stage{i}') + x_list = stage(inputs) + + return tuple(x_list) + + def train(self, mode=True): + """Convert the model into training mode will keeping the normalization + layer freezed.""" + super(HRNet, self).train(mode) + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() + + def parse_arch(self, arch, extra=None): + if extra is not None: + return extra + + assert arch in self.arch_zoo, \ + ('Invalid arch, please choose arch from ' + f'{list(self.arch_zoo.keys())}, or specify `extra` ' + 'argument directly.') + + extra = dict() + for i, stage_setting in enumerate(self.arch_zoo[arch], start=1): + extra[f'stage{i}'] = dict( + num_modules=stage_setting[0], + num_branches=stage_setting[1], + block=stage_setting[2], + num_blocks=stage_setting[3], + num_channels=stage_setting[4], + ) + + return extra diff --git a/mmcls/models/backbones/resnet.py b/mmcls/models/backbones/resnet.py index 2235657fb4b..efb0e2b6ae2 100644 --- a/mmcls/models/backbones/resnet.py +++ b/mmcls/models/backbones/resnet.py @@ -5,6 +5,7 @@ from mmcv.cnn import (ConvModule, build_conv_layer, build_norm_layer, constant_init) from mmcv.cnn.bricks import DropPath +from mmcv.runner import BaseModule from mmcv.utils.parrots_wrapper import _BatchNorm from ..builder import BACKBONES @@ -13,7 +14,7 @@ eps = 1.0e-5 -class 
BasicBlock(nn.Module): +class BasicBlock(BaseModule): """BasicBlock for ResNet. Args: @@ -47,8 +48,9 @@ def __init__(self, with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN'), - drop_path_rate=0.0): - super(BasicBlock, self).__init__() + drop_path_rate=0.0, + init_cfg=None): + super(BasicBlock, self).__init__(init_cfg=init_cfg) self.in_channels = in_channels self.out_channels = out_channels self.expansion = expansion @@ -130,7 +132,7 @@ def _inner_forward(x): return out -class Bottleneck(nn.Module): +class Bottleneck(BaseModule): """Bottleneck block for ResNet. Args: @@ -164,8 +166,9 @@ def __init__(self, with_cp=False, conv_cfg=None, norm_cfg=dict(type='BN'), - drop_path_rate=0.0): - super(Bottleneck, self).__init__() + drop_path_rate=0.0, + init_cfg=None): + super(Bottleneck, self).__init__(init_cfg=init_cfg) assert style in ['pytorch', 'caffe'] self.in_channels = in_channels diff --git a/mmcls/models/backbones/shufflenet_v2.py b/mmcls/models/backbones/shufflenet_v2.py index 77a16e00346..bfe7ac8282a 100644 --- a/mmcls/models/backbones/shufflenet_v2.py +++ b/mmcls/models/backbones/shufflenet_v2.py @@ -115,7 +115,14 @@ def _inner_forward(x): if self.stride > 1: out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) else: - x1, x2 = x.chunk(2, dim=1) + # Channel Split operation. using these lines of code to replace + # ``chunk(x, 2, dim=1)`` can make it easier to deploy a + # shufflenetv2 model by using mmdeploy. + channels = x.shape[1] + c = channels // 2 + channels % 2 + x1 = x[:, :c, :, :] + x2 = x[:, c:, :, :] + out = torch.cat((x1, self.branch2(x2)), dim=1) out = channel_shuffle(out, 2) diff --git a/mmcls/models/backbones/timm_backbone.py b/mmcls/models/backbones/timm_backbone.py index 2e88d6057a0..1506619a93a 100644 --- a/mmcls/models/backbones/timm_backbone.py +++ b/mmcls/models/backbones/timm_backbone.py @@ -4,52 +4,109 @@ except ImportError: timm = None +import warnings + +from mmcv.cnn.bricks.registry import NORM_LAYERS + +from ...utils import get_root_logger from ..builder import BACKBONES from .base_backbone import BaseBackbone +def print_timm_feature_info(feature_info): + """Print feature_info of timm backbone to help development and debug. + + Args: + feature_info (list[dict] | timm.models.features.FeatureInfo | None): + feature_info of timm backbone. + """ + logger = get_root_logger() + if feature_info is None: + logger.warning('This backbone does not have feature_info') + elif isinstance(feature_info, list): + for feat_idx, each_info in enumerate(feature_info): + logger.info(f'backbone feature_info[{feat_idx}]: {each_info}') + else: + try: + logger.info(f'backbone out_indices: {feature_info.out_indices}') + logger.info(f'backbone out_channels: {feature_info.channels()}') + logger.info(f'backbone out_strides: {feature_info.reduction()}') + except AttributeError: + logger.warning('Unexpected format of backbone feature_info') + + @BACKBONES.register_module() class TIMMBackbone(BaseBackbone): - """Wrapper to use backbones from timm library. More details can be found in - `timm `_ . + """Wrapper to use backbones from timm library. + + More details can be found in + `timm `_. + See especially the document for `feature extraction + `_. Args: model_name (str): Name of timm model to instantiate. - pretrained (bool): Load pretrained weights if True. - checkpoint_path (str): Path of checkpoint to load after - model is initialized. - in_channels (int): Number of input image channels. Default: 3. 
- init_cfg (dict, optional): Initialization config dict + features_only (bool): Whether to extract feature pyramid (multi-scale + feature maps from the deepest layer at each stride). For Vision + Transformer models that do not support this argument, + set this False. Defaults to False. + pretrained (bool): Whether to load pretrained weights. + Defaults to False. + checkpoint_path (str): Path of checkpoint to load at the last of + ``timm.create_model``. Defaults to empty string, which means + not loading. + in_channels (int): Number of input image channels. Defaults to 3. + init_cfg (dict or list[dict], optional): Initialization config dict of + OpenMMLab projects. Defaults to None. **kwargs: Other timm & model specific arguments. """ - def __init__( - self, - model_name, - pretrained=False, - checkpoint_path='', - in_channels=3, - init_cfg=None, - **kwargs, - ): + def __init__(self, + model_name, + features_only=False, + pretrained=False, + checkpoint_path='', + in_channels=3, + init_cfg=None, + **kwargs): if timm is None: - raise RuntimeError('timm is not installed') + raise RuntimeError( + 'Failed to import timm. Please run "pip install timm". ' + '"pip install dataclasses" may also be needed for Python 3.6.') + if not isinstance(pretrained, bool): + raise TypeError('pretrained must be bool, not str for model path') + if features_only and checkpoint_path: + warnings.warn( + 'Using both features_only and checkpoint_path will cause error' + ' in timm. See ' + 'https://github.com/rwightman/pytorch-image-models/issues/488') + super(TIMMBackbone, self).__init__(init_cfg) + if 'norm_layer' in kwargs: + kwargs['norm_layer'] = NORM_LAYERS.get(kwargs['norm_layer']) self.timm_model = timm.create_model( model_name=model_name, + features_only=features_only, pretrained=pretrained, in_chans=in_channels, checkpoint_path=checkpoint_path, - **kwargs, - ) + **kwargs) # reset classifier - self.timm_model.reset_classifier(0, '') + if hasattr(self.timm_model, 'reset_classifier'): + self.timm_model.reset_classifier(0, '') # Hack to use pretrained weights from timm if pretrained or checkpoint_path: self._is_init = True + feature_info = getattr(self.timm_model, 'feature_info', None) + print_timm_feature_info(feature_info) + def forward(self, x): - features = self.timm_model.forward_features(x) - return (features, ) + features = self.timm_model(x) + if isinstance(features, (list, tuple)): + features = tuple(features) + else: + features = (features, ) + return features diff --git a/mmcls/models/backbones/twins.py b/mmcls/models/backbones/twins.py new file mode 100644 index 00000000000..0e3c47a4992 --- /dev/null +++ b/mmcls/models/backbones/twins.py @@ -0,0 +1,723 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d, build_norm_layer +from mmcv.cnn.bricks.drop import build_dropout +from mmcv.cnn.bricks.transformer import FFN, PatchEmbed +from mmcv.cnn.utils.weight_init import (constant_init, normal_init, + trunc_normal_init) +from mmcv.runner import BaseModule, ModuleList +from torch.nn.modules.batchnorm import _BatchNorm + +from mmcls.models.builder import BACKBONES +from mmcls.models.utils.attention import MultiheadAttention +from mmcls.models.utils.position_encoding import ConditionalPositionEncoding + + +class GlobalSubsampledAttention(MultiheadAttention): + """Global Sub-sampled Attention (GSA) module. + + Args: + embed_dims (int): The embedding dimension. 
+ num_heads (int): Parallel attention heads. + input_dims (int, optional): The input dimension, and if None, + use ``embed_dims``. Defaults to None. + attn_drop (float): Dropout rate of the dropout layer after the + attention calculation of query and key. Defaults to 0. + proj_drop (float): Dropout rate of the dropout layer after the + output projection. Defaults to 0. + dropout_layer (dict): The dropout config before adding the shortcut. + Defaults to ``dict(type='Dropout', drop_prob=0.)``. + qkv_bias (bool): If True, add a learnable bias to q, k, v. + Defaults to True. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + qk_scale (float, optional): Override default qk scale of + ``head_dim ** -0.5`` if set. Defaults to None. + proj_bias (bool) If True, add a learnable bias to output projection. + Defaults to True. + v_shortcut (bool): Add a shortcut from value to output. It's usually + used if ``input_dims`` is different from ``embed_dims``. + Defaults to False. + sr_ratio (float): The ratio of spatial reduction in attention modules. + Defaults to 1. + init_cfg (dict, optional): The Config for initialization. + Defaults to None. + """ + + def __init__(self, + embed_dims, + num_heads, + norm_cfg=dict(type='LN'), + qkv_bias=True, + sr_ratio=1, + **kwargs): + super(GlobalSubsampledAttention, + self).__init__(embed_dims, num_heads, **kwargs) + + self.qkv_bias = qkv_bias + self.q = nn.Linear(self.input_dims, embed_dims, bias=qkv_bias) + self.kv = nn.Linear(self.input_dims, embed_dims * 2, bias=qkv_bias) + + # remove self.qkv, here split into self.q, self.kv + delattr(self, 'qkv') + + self.sr_ratio = sr_ratio + if sr_ratio > 1: + # use a conv as the spatial-reduction operation, the kernel_size + # and stride in conv are equal to the sr_ratio. + self.sr = Conv2d( + in_channels=embed_dims, + out_channels=embed_dims, + kernel_size=sr_ratio, + stride=sr_ratio) + # The ret[0] of build_norm_layer is norm name. + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + + def forward(self, x, hw_shape): + B, N, C = x.shape + H, W = hw_shape + assert H * W == N, 'The product of h and w of hw_shape must be N, ' \ + 'which is the 2nd dim number of the input Tensor x.' + + q = self.q(x).reshape(B, N, self.num_heads, + C // self.num_heads).permute(0, 2, 1, 3) + + if self.sr_ratio > 1: + x = x.permute(0, 2, 1).reshape(B, C, *hw_shape) # BNC_2_BCHW + x = self.sr(x) + x = x.reshape(B, C, -1).permute(0, 2, 1) # BCHW_2_BNC + x = self.norm(x) + + kv = self.kv(x).reshape(B, -1, 2, self.num_heads, + self.head_dims).permute(2, 0, 3, 1, 4) + k, v = kv[0], kv[1] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.out_drop(self.proj_drop(x)) + + if self.v_shortcut: + x = v.squeeze(1) + x + return x + + +class GSAEncoderLayer(BaseModule): + """Implements one encoder layer with GlobalSubsampledAttention(GSA). + + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + drop_rate (float): Probability of an element to be zeroed + after the feed forward layer. Default: 0.0. + attn_drop_rate (float): The drop out rate for attention layer. + Default: 0.0. + drop_path_rate (float): Stochastic depth rate. Default 0.0. + num_fcs (int): The number of fully-connected layers for FFNs. + Default: 2. + qkv_bias (bool): Enable bias for qkv if True. 
Default: True + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + sr_ratio (float): The ratio of spatial reduction in attention modules. + Defaults to 1. + init_cfg (dict, optional): The Config for initialization. + Defaults to None. + """ + + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + num_fcs=2, + qkv_bias=True, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + sr_ratio=1., + init_cfg=None): + super(GSAEncoderLayer, self).__init__(init_cfg=init_cfg) + + self.norm1 = build_norm_layer(norm_cfg, embed_dims, postfix=1)[1] + self.attn = GlobalSubsampledAttention( + embed_dims=embed_dims, + num_heads=num_heads, + attn_drop=attn_drop_rate, + proj_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + qkv_bias=qkv_bias, + norm_cfg=norm_cfg, + sr_ratio=sr_ratio) + + self.norm2 = build_norm_layer(norm_cfg, embed_dims, postfix=2)[1] + self.ffn = FFN( + embed_dims=embed_dims, + feedforward_channels=feedforward_channels, + num_fcs=num_fcs, + ffn_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + act_cfg=act_cfg, + add_identity=False) + + self.drop_path = build_dropout( + dict(type='DropPath', drop_prob=drop_path_rate) + ) if drop_path_rate > 0. else nn.Identity() + + def forward(self, x, hw_shape): + x = x + self.drop_path(self.attn(self.norm1(x), hw_shape)) + x = x + self.drop_path(self.ffn(self.norm2(x))) + return x + + +class LocallyGroupedSelfAttention(BaseModule): + """Locally-grouped Self Attention (LSA) module. + + Args: + embed_dims (int): Number of input channels. + num_heads (int): Number of attention heads. Default: 8 + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: False. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Default: 0.0 + proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. + window_size(int): Window size of LSA. Default: 1. + init_cfg (dict, optional): The Config for initialization. + Defaults to None. 
+ """ + + def __init__(self, + embed_dims, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop_rate=0., + proj_drop_rate=0., + window_size=1, + init_cfg=None): + super(LocallyGroupedSelfAttention, self).__init__(init_cfg=init_cfg) + + assert embed_dims % num_heads == 0, \ + f'dim {embed_dims} should be divided by num_heads {num_heads}' + + self.embed_dims = embed_dims + self.num_heads = num_heads + head_dim = embed_dims // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop_rate) + self.proj = nn.Linear(embed_dims, embed_dims) + self.proj_drop = nn.Dropout(proj_drop_rate) + self.window_size = window_size + + def forward(self, x, hw_shape): + B, N, C = x.shape + H, W = hw_shape + x = x.view(B, H, W, C) + + # pad feature maps to multiples of Local-groups + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + + # calculate attention mask for LSA + Hp, Wp = x.shape[1:-1] + _h, _w = Hp // self.window_size, Wp // self.window_size + mask = torch.zeros((1, Hp, Wp), device=x.device) + mask[:, -pad_b:, :].fill_(1) + mask[:, :, -pad_r:].fill_(1) + + # [B, _h, _w, window_size, window_size, C] + x = x.reshape(B, _h, self.window_size, _w, self.window_size, + C).transpose(2, 3) + mask = mask.reshape(1, _h, self.window_size, _w, + self.window_size).transpose(2, 3).reshape( + 1, _h * _w, + self.window_size * self.window_size) + # [1, _h*_w, window_size*window_size, window_size*window_size] + attn_mask = mask.unsqueeze(2) - mask.unsqueeze(3) + attn_mask = attn_mask.masked_fill(attn_mask != 0, + float(-1000.0)).masked_fill( + attn_mask == 0, float(0.0)) + + # [3, B, _w*_h, nhead, window_size*window_size, dim] + qkv = self.qkv(x).reshape(B, _h * _w, + self.window_size * self.window_size, 3, + self.num_heads, C // self.num_heads).permute( + 3, 0, 1, 4, 2, 5) + q, k, v = qkv[0], qkv[1], qkv[2] + # [B, _h*_w, n_head, window_size*window_size, window_size*window_size] + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn + attn_mask.unsqueeze(2) + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + attn = (attn @ v).transpose(2, 3).reshape(B, _h, _w, self.window_size, + self.window_size, C) + x = attn.transpose(2, 3).reshape(B, _h * self.window_size, + _w * self.window_size, C) + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :].contiguous() + + x = x.reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class LSAEncoderLayer(BaseModule): + """Implements one encoder layer with LocallyGroupedSelfAttention(LSA). + + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + drop_rate (float): Probability of an element to be zeroed + after the feed forward layer. Default: 0.0. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Default: 0.0 + drop_path_rate (float): Stochastic depth rate. Default 0.0. + num_fcs (int): The number of fully-connected layers for FFNs. + Default: 2. + qkv_bias (bool): Enable bias for qkv if True. Default: True + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. 
+ Default: dict(type='LN'). + window_size (int): Window size of LSA. Default: 1. + init_cfg (dict, optional): The Config for initialization. + Defaults to None. + """ + + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + num_fcs=2, + qkv_bias=True, + qk_scale=None, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + window_size=1, + init_cfg=None): + + super(LSAEncoderLayer, self).__init__(init_cfg=init_cfg) + + self.norm1 = build_norm_layer(norm_cfg, embed_dims, postfix=1)[1] + self.attn = LocallyGroupedSelfAttention(embed_dims, num_heads, + qkv_bias, qk_scale, + attn_drop_rate, drop_rate, + window_size) + + self.norm2 = build_norm_layer(norm_cfg, embed_dims, postfix=2)[1] + self.ffn = FFN( + embed_dims=embed_dims, + feedforward_channels=feedforward_channels, + num_fcs=num_fcs, + ffn_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + act_cfg=act_cfg, + add_identity=False) + + self.drop_path = build_dropout( + dict(type='DropPath', drop_prob=drop_path_rate) + ) if drop_path_rate > 0. else nn.Identity() + + def forward(self, x, hw_shape): + x = x + self.drop_path(self.attn(self.norm1(x), hw_shape)) + x = x + self.drop_path(self.ffn(self.norm2(x))) + return x + + +@BACKBONES.register_module() +class PCPVT(BaseModule): + """The backbone of Twins-PCPVT. + + This backbone is the implementation of `Twins: Revisiting the Design + of Spatial Attention in Vision Transformers + `_. + + Args: + arch (dict, str): PCPVT architecture, a str value in arch zoo or a + detailed configuration dict with 7 keys, and the length of all the + values in dict should be the same: + + - depths (List[int]): The number of encoder layers in each stage. + - embed_dims (List[int]): Embedding dimension in each stage. + - patch_sizes (List[int]): The patch sizes in each stage. + - num_heads (List[int]): Numbers of attention head in each stage. + - strides (List[int]): The strides in each stage. + - mlp_ratios (List[int]): The ratios of mlp in each stage. + - sr_ratios (List[int]): The ratios of GSA-encoder layers in each + stage. + + in_channels (int): Number of input channels. Default: 3. + out_indices (tuple[int]): Output from which stages. + Default: (3, ). + qkv_bias (bool): Enable bias for qkv if True. Default: False. + drop_rate (float): Probability of an element to be zeroed. + Default 0. + attn_drop_rate (float): The drop out rate for attention layer. + Default 0.0 + drop_path_rate (float): Stochastic depth rate. Default 0.0 + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN') + norm_after_stage(bool, List[bool]): Add extra norm after each stage. + Default False. + init_cfg (dict, optional): The Config for initialization. + Defaults to None. 
+ + Examples: + >>> from mmcls.models import PCPVT + >>> import torch + >>> pcpvt_cfg = {'arch': "small", + >>> 'norm_after_stage': [False, False, False, True]} + >>> model = PCPVT(**pcpvt_cfg) + >>> x = torch.rand(1, 3, 224, 224) + >>> outputs = model(x) + >>> print(outputs[-1].shape) + torch.Size([1, 512, 7, 7]) + >>> pcpvt_cfg['norm_after_stage'] = [True, True, True, True] + >>> pcpvt_cfg['out_indices'] = (0, 1, 2, 3) + >>> model = PCPVT(**pcpvt_cfg) + >>> outputs = model(x) + >>> for feat in outputs: + >>> print(feat.shape) + torch.Size([1, 64, 56, 56]) + torch.Size([1, 128, 28, 28]) + torch.Size([1, 320, 14, 14]) + torch.Size([1, 512, 7, 7]) + """ + arch_zoo = { + **dict.fromkeys(['s', 'small'], + {'embed_dims': [64, 128, 320, 512], + 'depths': [3, 4, 6, 3], + 'num_heads': [1, 2, 5, 8], + 'patch_sizes': [4, 2, 2, 2], + 'strides': [4, 2, 2, 2], + 'mlp_ratios': [8, 8, 4, 4], + 'sr_ratios': [8, 4, 2, 1]}), + **dict.fromkeys(['b', 'base'], + {'embed_dims': [64, 128, 320, 512], + 'depths': [3, 4, 18, 3], + 'num_heads': [1, 2, 5, 8], + 'patch_sizes': [4, 2, 2, 2], + 'strides': [4, 2, 2, 2], + 'mlp_ratios': [8, 8, 4, 4], + 'sr_ratios': [8, 4, 2, 1]}), + **dict.fromkeys(['l', 'large'], + {'embed_dims': [64, 128, 320, 512], + 'depths': [3, 8, 27, 3], + 'num_heads': [1, 2, 5, 8], + 'patch_sizes': [4, 2, 2, 2], + 'strides': [4, 2, 2, 2], + 'mlp_ratios': [8, 8, 4, 4], + 'sr_ratios': [8, 4, 2, 1]}), + } # yapf: disable + + essential_keys = { + 'embed_dims', 'depths', 'num_heads', 'patch_sizes', 'strides', + 'mlp_ratios', 'sr_ratios' + } + + def __init__(self, + arch, + in_channels=3, + out_indices=(3, ), + qkv_bias=False, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_cfg=dict(type='LN'), + norm_after_stage=False, + init_cfg=None): + super(PCPVT, self).__init__(init_cfg=init_cfg) + if isinstance(arch, str): + arch = arch.lower() + assert arch in set(self.arch_zoo), \ + f'Arch {arch} is not in default archs {set(self.arch_zoo)}' + self.arch_settings = self.arch_zoo[arch] + else: + assert isinstance(arch, dict) and ( + set(arch) == self.essential_keys + ), f'Custom arch needs a dict with keys {self.essential_keys}.' 
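+            # Illustrative note (not part of the original patch): a custom
+            # ``arch`` dict mirrors the presets in ``arch_zoo``, e.g.
+            #   arch = dict(embed_dims=[64, 128, 320, 512], depths=[3, 4, 6, 3],
+            #               num_heads=[1, 2, 5, 8], patch_sizes=[4, 2, 2, 2],
+            #               strides=[4, 2, 2, 2], mlp_ratios=[8, 8, 4, 4],
+            #               sr_ratios=[8, 4, 2, 1])
+            # reproduces the 'small' preset.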
+ self.arch_settings = arch + + self.depths = self.arch_settings['depths'] + self.embed_dims = self.arch_settings['embed_dims'] + self.patch_sizes = self.arch_settings['patch_sizes'] + self.strides = self.arch_settings['strides'] + self.mlp_ratios = self.arch_settings['mlp_ratios'] + self.num_heads = self.arch_settings['num_heads'] + self.sr_ratios = self.arch_settings['sr_ratios'] + + self.num_extra_tokens = 0 # there is no cls-token in Twins + self.num_stage = len(self.depths) + for key, value in self.arch_settings.items(): + assert isinstance(value, list) and len(value) == self.num_stage, ( + 'Length of setting item in arch dict must be type of list and' + ' have the same length.') + + # patch_embeds + self.patch_embeds = ModuleList() + self.position_encoding_drops = ModuleList() + self.stages = ModuleList() + + for i in range(self.num_stage): + # use in_channels of the model in the first stage + if i == 0: + stage_in_channels = in_channels + else: + stage_in_channels = self.embed_dims[i - 1] + + self.patch_embeds.append( + PatchEmbed( + in_channels=stage_in_channels, + embed_dims=self.embed_dims[i], + conv_type='Conv2d', + kernel_size=self.patch_sizes[i], + stride=self.strides[i], + padding='corner', + norm_cfg=dict(type='LN'))) + + self.position_encoding_drops.append(nn.Dropout(p=drop_rate)) + + # PEGs + self.position_encodings = ModuleList([ + ConditionalPositionEncoding(embed_dim, embed_dim) + for embed_dim in self.embed_dims + ]) + + # stochastic depth + total_depth = sum(self.depths) + self.dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, total_depth) + ] # stochastic depth decay rule + cur = 0 + + for k in range(len(self.depths)): + _block = ModuleList([ + GSAEncoderLayer( + embed_dims=self.embed_dims[k], + num_heads=self.num_heads[k], + feedforward_channels=self.mlp_ratios[k] * + self.embed_dims[k], + attn_drop_rate=attn_drop_rate, + drop_rate=drop_rate, + drop_path_rate=self.dpr[cur + i], + num_fcs=2, + qkv_bias=qkv_bias, + act_cfg=dict(type='GELU'), + norm_cfg=norm_cfg, + sr_ratio=self.sr_ratios[k]) for i in range(self.depths[k]) + ]) + self.stages.append(_block) + cur += self.depths[k] + + self.out_indices = out_indices + + assert isinstance(norm_after_stage, (bool, list)) + if isinstance(norm_after_stage, bool): + self.norm_after_stage = [norm_after_stage] * self.num_stage + else: + self.norm_after_stage = norm_after_stage + assert len(self.norm_after_stage) == self.num_stage, \ + (f'Number of norm_after_stage({len(self.norm_after_stage)}) should' + f' be equal to the number of stages({self.num_stage}).') + + for i, has_norm in enumerate(self.norm_after_stage): + assert isinstance(has_norm, bool), 'norm_after_stage should be ' \ + 'bool or List[bool].' + if has_norm and norm_cfg is not None: + norm_layer = build_norm_layer(norm_cfg, self.embed_dims[i])[1] + else: + norm_layer = nn.Identity() + + self.add_module(f'norm_after_stage{i}', norm_layer) + + def init_weights(self): + if self.init_cfg is not None: + super(PCPVT, self).init_weights() + else: + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_init(m, std=.02, bias=0.) + elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)): + constant_init(m, val=1.0, bias=0.) 
+ elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[ + 1] * m.out_channels + fan_out //= m.groups + normal_init( + m, mean=0, std=math.sqrt(2.0 / fan_out), bias=0) + + def forward(self, x): + outputs = list() + + b = x.shape[0] + + for i in range(self.num_stage): + x, hw_shape = self.patch_embeds[i](x) + h, w = hw_shape + x = self.position_encoding_drops[i](x) + for j, blk in enumerate(self.stages[i]): + x = blk(x, hw_shape) + if j == 0: + x = self.position_encodings[i](x, hw_shape) + + norm_layer = getattr(self, f'norm_after_stage{i}') + x = norm_layer(x) + x = x.reshape(b, h, w, -1).permute(0, 3, 1, 2).contiguous() + + if i in self.out_indices: + outputs.append(x) + + return tuple(outputs) + + +@BACKBONES.register_module() +class SVT(PCPVT): + """The backbone of Twins-SVT. + + This backbone is the implementation of `Twins: Revisiting the Design + of Spatial Attention in Vision Transformers + `_. + + Args: + arch (dict, str): SVT architecture, a str value in arch zoo or a + detailed configuration dict with 8 keys, and the length of all the + values in dict should be the same: + + - depths (List[int]): The number of encoder layers in each stage. + - embed_dims (List[int]): Embedding dimension in each stage. + - patch_sizes (List[int]): The patch sizes in each stage. + - num_heads (List[int]): Numbers of attention head in each stage. + - strides (List[int]): The strides in each stage. + - mlp_ratios (List[int]): The ratios of mlp in each stage. + - sr_ratios (List[int]): The ratios of GSA-encoder layers in each + stage. + - windiow_sizes (List[int]): The window sizes in LSA-encoder layers + in each stage. + + in_channels (int): Number of input channels. Default: 3. + out_indices (tuple[int]): Output from which stages. + Default: (3, ). + qkv_bias (bool): Enable bias for qkv if True. Default: False. + drop_rate (float): Dropout rate. Default 0. + attn_drop_rate (float): Dropout ratio of attention weight. + Default 0.0 + drop_path_rate (float): Stochastic depth rate. Default 0.2. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN') + norm_after_stage(bool, List[bool]): Add extra norm after each stage. + Default False. + init_cfg (dict, optional): The Config for initialization. + Defaults to None. 
+ + Examples: + >>> from mmcls.models import SVT + >>> import torch + >>> svt_cfg = {'arch': "small", + >>> 'norm_after_stage': [False, False, False, True]} + >>> model = SVT(**svt_cfg) + >>> x = torch.rand(1, 3, 224, 224) + >>> outputs = model(x) + >>> print(outputs[-1].shape) + torch.Size([1, 512, 7, 7]) + >>> svt_cfg["out_indices"] = (0, 1, 2, 3) + >>> svt_cfg["norm_after_stage"] = [True, True, True, True] + >>> model = SVT(**svt_cfg) + >>> output = model(x) + >>> for feat in output: + >>> print(feat.shape) + torch.Size([1, 64, 56, 56]) + torch.Size([1, 128, 28, 28]) + torch.Size([1, 320, 14, 14]) + torch.Size([1, 512, 7, 7]) + """ + arch_zoo = { + **dict.fromkeys(['s', 'small'], + {'embed_dims': [64, 128, 256, 512], + 'depths': [2, 2, 10, 4], + 'num_heads': [2, 4, 8, 16], + 'patch_sizes': [4, 2, 2, 2], + 'strides': [4, 2, 2, 2], + 'mlp_ratios': [4, 4, 4, 4], + 'sr_ratios': [8, 4, 2, 1], + 'window_sizes': [7, 7, 7, 7]}), + **dict.fromkeys(['b', 'base'], + {'embed_dims': [96, 192, 384, 768], + 'depths': [2, 2, 18, 2], + 'num_heads': [3, 6, 12, 24], + 'patch_sizes': [4, 2, 2, 2], + 'strides': [4, 2, 2, 2], + 'mlp_ratios': [4, 4, 4, 4], + 'sr_ratios': [8, 4, 2, 1], + 'window_sizes': [7, 7, 7, 7]}), + **dict.fromkeys(['l', 'large'], + {'embed_dims': [128, 256, 512, 1024], + 'depths': [2, 2, 18, 2], + 'num_heads': [4, 8, 16, 32], + 'patch_sizes': [4, 2, 2, 2], + 'strides': [4, 2, 2, 2], + 'mlp_ratios': [4, 4, 4, 4], + 'sr_ratios': [8, 4, 2, 1], + 'window_sizes': [7, 7, 7, 7]}), + } # yapf: disable + + essential_keys = { + 'embed_dims', 'depths', 'num_heads', 'patch_sizes', 'strides', + 'mlp_ratios', 'sr_ratios', 'window_sizes' + } + + def __init__(self, + arch, + in_channels=3, + out_indices=(3, ), + qkv_bias=False, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.0, + norm_cfg=dict(type='LN'), + norm_after_stage=False, + init_cfg=None): + super(SVT, self).__init__(arch, in_channels, out_indices, qkv_bias, + drop_rate, attn_drop_rate, drop_path_rate, + norm_cfg, norm_after_stage, init_cfg) + + self.window_sizes = self.arch_settings['window_sizes'] + + for k in range(self.num_stage): + for i in range(self.depths[k]): + # in even-numbered layers of each stage, replace GSA with LSA + if i % 2 == 0: + ffn_channels = self.mlp_ratios[k] * self.embed_dims[k] + self.stages[k][i] = \ + LSAEncoderLayer( + embed_dims=self.embed_dims[k], + num_heads=self.num_heads[k], + feedforward_channels=ffn_channels, + drop_rate=drop_rate, + norm_cfg=norm_cfg, + attn_drop_rate=attn_drop_rate, + drop_path_rate=self.dpr[sum(self.depths[:k])+i], + qkv_bias=qkv_bias, + window_size=self.window_sizes[k]) diff --git a/mmcls/models/losses/asymmetric_loss.py b/mmcls/models/losses/asymmetric_loss.py index bc4aa1b4110..1c3b5744926 100644 --- a/mmcls/models/losses/asymmetric_loss.py +++ b/mmcls/models/losses/asymmetric_loss.py @@ -3,7 +3,7 @@ import torch.nn as nn from ..builder import LOSSES -from .utils import weight_reduce_loss +from .utils import convert_to_one_hot, weight_reduce_loss def asymmetric_loss(pred, @@ -13,7 +13,9 @@ def asymmetric_loss(pred, gamma_neg=4.0, clip=0.05, reduction='mean', - avg_factor=None): + avg_factor=None, + use_sigmoid=True, + eps=1e-8): r"""asymmetric loss. Please refer to the `paper `__ for @@ -34,6 +36,10 @@ def asymmetric_loss(pred, is same shape as pred and label. Defaults to 'mean'. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. + use_sigmoid (bool): Whether the prediction uses sigmoid instead + of softmax. Defaults to True. 
+ eps (float): The minimum value of the argument of logarithm. Defaults + to 1e-8. Returns: torch.Tensor: Loss. @@ -41,8 +47,11 @@ def asymmetric_loss(pred, assert pred.shape == \ target.shape, 'pred and target should be in the same shape.' - eps = 1e-8 - pred_sigmoid = pred.sigmoid() + if use_sigmoid: + pred_sigmoid = pred.sigmoid() + else: + pred_sigmoid = nn.functional.softmax(pred, dim=-1) + target = target.type_as(pred) if clip and clip > 0: @@ -75,6 +84,10 @@ class AsymmetricLoss(nn.Module): reduction (str): The method used to reduce the loss into a scalar. loss_weight (float): Weight of loss. Defaults to 1.0. + use_sigmoid (bool): Whether the prediction uses sigmoid instead + of softmax. Defaults to True. + eps (float): The minimum value of the argument of logarithm. Defaults + to 1e-8. """ def __init__(self, @@ -82,13 +95,17 @@ def __init__(self, gamma_neg=4.0, clip=0.05, reduction='mean', - loss_weight=1.0): + loss_weight=1.0, + use_sigmoid=True, + eps=1e-8): super(AsymmetricLoss, self).__init__() self.gamma_pos = gamma_pos self.gamma_neg = gamma_neg self.clip = clip self.reduction = reduction self.loss_weight = loss_weight + self.use_sigmoid = use_sigmoid + self.eps = eps def forward(self, pred, @@ -96,10 +113,28 @@ def forward(self, weight=None, avg_factor=None, reduction_override=None): - """asymmetric loss.""" + r"""asymmetric loss. + + Args: + pred (torch.Tensor): The prediction with shape (N, \*). + target (torch.Tensor): The ground truth label of the prediction + with shape (N, \*), N or (N,1). + weight (torch.Tensor, optional): Sample-wise loss weight with shape + (N, \*). Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The method used to reduce the + loss into a scalar. Options are "none", "mean" and "sum". + Defaults to None. + + Returns: + torch.Tensor: Loss. + """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) + if target.dim() == 1 or (target.dim() == 2 and target.shape[1] == 1): + target = convert_to_one_hot(target.view(-1, 1), pred.shape[-1]) loss_cls = self.loss_weight * asymmetric_loss( pred, target, @@ -108,5 +143,7 @@ def forward(self, gamma_neg=self.gamma_neg, clip=self.clip, reduction=reduction, - avg_factor=avg_factor) + avg_factor=avg_factor, + use_sigmoid=self.use_sigmoid, + eps=self.eps) return loss_cls diff --git a/mmcls/models/necks/__init__.py b/mmcls/models/necks/__init__.py index 67053fe6811..6f3ae47c08a 100644 --- a/mmcls/models/necks/__init__.py +++ b/mmcls/models/necks/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from .gap import GlobalAveragePooling +from .hr_fuse import HRFuseScales -__all__ = ['GlobalAveragePooling'] +__all__ = ['GlobalAveragePooling', 'HRFuseScales'] diff --git a/mmcls/models/necks/hr_fuse.py b/mmcls/models/necks/hr_fuse.py new file mode 100644 index 00000000000..1acc382756b --- /dev/null +++ b/mmcls/models/necks/hr_fuse.py @@ -0,0 +1,83 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn.bricks import ConvModule +from mmcv.runner import BaseModule + +from ..backbones.resnet import Bottleneck, ResLayer +from ..builder import NECKS + + +@NECKS.register_module() +class HRFuseScales(BaseModule): + """Fuse feature map of multiple scales in HRNet. + + Args: + in_channels (list[int]): The input channels of all scales. + out_channels (int): The channels of fused feature map. 
+ Defaults to 2048. + norm_cfg (dict): dictionary to construct norm layers. + Defaults to ``dict(type='BN', momentum=0.1)``. + init_cfg (dict | list[dict], optional): Initialization config dict. + Defaults to ``dict(type='Normal', layer='Linear', std=0.01))``. + """ + + def __init__(self, + in_channels, + out_channels=2048, + norm_cfg=dict(type='BN', momentum=0.1), + init_cfg=dict(type='Normal', layer='Linear', std=0.01)): + super(HRFuseScales, self).__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + self.norm_cfg = norm_cfg + + block_type = Bottleneck + out_channels = [128, 256, 512, 1024] + + # Increase the channels on each resolution + # from C, 2C, 4C, 8C to 128, 256, 512, 1024 + increase_layers = [] + for i in range(len(in_channels)): + increase_layers.append( + ResLayer( + block_type, + in_channels=in_channels[i], + out_channels=out_channels[i], + num_blocks=1, + stride=1, + )) + self.increase_layers = nn.ModuleList(increase_layers) + + # Downsample feature maps in each scale. + downsample_layers = [] + for i in range(len(in_channels) - 1): + downsample_layers.append( + ConvModule( + in_channels=out_channels[i], + out_channels=out_channels[i + 1], + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + bias=False, + )) + self.downsample_layers = nn.ModuleList(downsample_layers) + + # The final conv block before final classifier linear layer. + self.final_layer = ConvModule( + in_channels=out_channels[3], + out_channels=self.out_channels, + kernel_size=1, + norm_cfg=self.norm_cfg, + bias=False, + ) + + def forward(self, x): + assert isinstance(x, tuple) and len(x) == len(self.in_channels) + + feat = self.increase_layers[0](x[0]) + for i in range(len(self.downsample_layers)): + feat = self.downsample_layers[i](feat) + \ + self.increase_layers[i + 1](x[i + 1]) + + return (self.final_layer(feat), ) diff --git a/mmcls/models/utils/__init__.py b/mmcls/models/utils/__init__.py index 69e9a4c133b..aaf30c3e646 100644 --- a/mmcls/models/utils/__init__.py +++ b/mmcls/models/utils/__init__.py @@ -6,11 +6,12 @@ from .helpers import is_tracing, to_2tuple, to_3tuple, to_4tuple, to_ntuple from .inverted_residual import InvertedResidual from .make_divisible import make_divisible +from .position_encoding import ConditionalPositionEncoding from .se_layer import SELayer __all__ = [ 'channel_shuffle', 'make_divisible', 'InvertedResidual', 'SELayer', 'to_ntuple', 'to_2tuple', 'to_3tuple', 'to_4tuple', 'PatchEmbed', 'PatchMerging', 'HybridEmbed', 'Augments', 'ShiftWindowMSA', 'is_tracing', - 'MultiheadAttention' + 'MultiheadAttention', 'ConditionalPositionEncoding' ] diff --git a/mmcls/models/utils/augment/cutmix.py b/mmcls/models/utils/augment/cutmix.py index 215e878d012..45d758dfe84 100644 --- a/mmcls/models/utils/augment/cutmix.py +++ b/mmcls/models/utils/augment/cutmix.py @@ -3,9 +3,9 @@ import numpy as np import torch -import torch.nn.functional as F from .builder import AUGMENT +from .utils import one_hot_encoding class BaseCutMixLayer(object, metaclass=ABCMeta): @@ -123,7 +123,7 @@ def __init__(self, *args, **kwargs): super(BatchCutMixLayer, self).__init__(*args, **kwargs) def cutmix(self, img, gt_label): - one_hot_gt_label = F.one_hot(gt_label, num_classes=self.num_classes) + one_hot_gt_label = one_hot_encoding(gt_label, self.num_classes) lam = np.random.beta(self.alpha, self.alpha) batch_size = img.size(0) index = torch.randperm(batch_size) diff --git a/mmcls/models/utils/augment/identity.py b/mmcls/models/utils/augment/identity.py index 
e676fc42237..ae3a3df52ff 100644 --- a/mmcls/models/utils/augment/identity.py +++ b/mmcls/models/utils/augment/identity.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -import torch.nn.functional as F - from .builder import AUGMENT +from .utils import one_hot_encoding @AUGMENT.register_module(name='Identity') @@ -24,7 +23,7 @@ def __init__(self, num_classes, prob=1.0): self.prob = prob def one_hot(self, gt_label): - return F.one_hot(gt_label, num_classes=self.num_classes) + return one_hot_encoding(gt_label, self.num_classes) def __call__(self, img, gt_label): return img, self.one_hot(gt_label) diff --git a/mmcls/models/utils/augment/mixup.py b/mmcls/models/utils/augment/mixup.py index 2d6cd2b534e..17c20704ce7 100644 --- a/mmcls/models/utils/augment/mixup.py +++ b/mmcls/models/utils/augment/mixup.py @@ -3,9 +3,9 @@ import numpy as np import torch -import torch.nn.functional as F from .builder import AUGMENT +from .utils import one_hot_encoding class BaseMixupLayer(object, metaclass=ABCMeta): @@ -42,7 +42,7 @@ def __init__(self, *args, **kwargs): super(BatchMixupLayer, self).__init__(*args, **kwargs) def mixup(self, img, gt_label): - one_hot_gt_label = F.one_hot(gt_label, num_classes=self.num_classes) + one_hot_gt_label = one_hot_encoding(gt_label, self.num_classes) lam = np.random.beta(self.alpha, self.alpha) batch_size = img.size(0) index = torch.randperm(batch_size) diff --git a/mmcls/models/utils/augment/utils.py b/mmcls/models/utils/augment/utils.py new file mode 100644 index 00000000000..0544af3ec15 --- /dev/null +++ b/mmcls/models/utils/augment/utils.py @@ -0,0 +1,23 @@ +import torch.nn.functional as F + + +def one_hot_encoding(gt, num_classes): + """Change gt_label to one_hot encoding. + + If the shape has 2 or more + dimensions, return it without encoding. + Args: + gt (Tensor): The gt label with shape (N,) or shape (N, */). + num_classes (int): The number of classes. + Return: + Tensor: One hot gt label. + """ + if gt.ndim == 1: + # multi-class classification + return F.one_hot(gt, num_classes=num_classes) + else: + # binary classification + # example. [[0], [1], [1]] + # multi-label classification + # example. [[0, 1, 1], [1, 0, 0], [1, 1, 1]] + return gt diff --git a/mmcls/models/utils/inverted_residual.py b/mmcls/models/utils/inverted_residual.py index d2e9fba6e35..7c432943b5b 100644 --- a/mmcls/models/utils/inverted_residual.py +++ b/mmcls/models/utils/inverted_residual.py @@ -1,35 +1,35 @@ # Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn import torch.utils.checkpoint as cp from mmcv.cnn import ConvModule +from mmcv.cnn.bricks import DropPath from mmcv.runner import BaseModule from .se_layer import SELayer -# class InvertedResidual(nn.Module): class InvertedResidual(BaseModule): """Inverted Residual Block. Args: - in_channels (int): The input channels of this Module. - out_channels (int): The output channels of this Module. + in_channels (int): The input channels of this module. + out_channels (int): The output channels of this module. mid_channels (int): The input channels of the depthwise convolution. kernel_size (int): The kernel size of the depthwise convolution. - Default: 3. - stride (int): The stride of the depthwise convolution. Default: 1. - se_cfg (dict): Config dict for se layer. Default: None, which means no - se layer. - conv_cfg (dict): Config dict for convolution layer. Default: None, + Defaults to 3. + stride (int): The stride of the depthwise convolution. Defaults to 1. + se_cfg (dict, optional): Config dict for se layer. 
Defaults to None, + which means no se layer. + conv_cfg (dict): Config dict for convolution layer. Defaults to None, which means using conv2d. norm_cfg (dict): Config dict for normalization layer. - Default: dict(type='BN'). + Defaults to ``dict(type='BN')``. act_cfg (dict): Config dict for activation layer. - Default: dict(type='ReLU'). + Defaults to ``dict(type='ReLU')``. + drop_path_rate (float): stochastic depth rate. Defaults to 0. with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. Default: False. - - Returns: - Tensor: The output tensor. + memory while slowing down the training speed. Defaults to False. + init_cfg (dict | list[dict], optional): Initialization config dict. """ def __init__(self, @@ -42,12 +42,15 @@ def __init__(self, conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), + drop_path_rate=0., with_cp=False, init_cfg=None): super(InvertedResidual, self).__init__(init_cfg) self.with_res_shortcut = (stride == 1 and in_channels == out_channels) assert stride in [1, 2] self.with_cp = with_cp + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate > 0 else nn.Identity() self.with_se = se_cfg is not None self.with_expand_conv = (mid_channels != in_channels) @@ -87,6 +90,14 @@ def __init__(self, act_cfg=None) def forward(self, x): + """Forward function. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: The output tensor. + """ def _inner_forward(x): out = x @@ -102,7 +113,7 @@ def _inner_forward(x): out = self.linear_conv(out) if self.with_res_shortcut: - return x + out + return x + self.drop_path(out) else: return out diff --git a/mmcls/models/utils/position_encoding.py b/mmcls/models/utils/position_encoding.py new file mode 100644 index 00000000000..0bd597ca0b5 --- /dev/null +++ b/mmcls/models/utils/position_encoding.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.runner.base_module import BaseModule + + +class ConditionalPositionEncoding(BaseModule): + """The Conditional Position Encoding (CPE) module. + + The CPE is the implementation of 'Conditional Positional Encodings + for Vision Transformers '_. + + Args: + in_channels (int): Number of input channels. + embed_dims (int): The feature dimension. Default: 768. + stride (int): Stride of conv layer. Default: 1. + """ + + def __init__(self, in_channels, embed_dims=768, stride=1, init_cfg=None): + super(ConditionalPositionEncoding, self).__init__(init_cfg=init_cfg) + self.proj = nn.Conv2d( + in_channels, + embed_dims, + kernel_size=3, + stride=stride, + padding=1, + bias=True, + groups=embed_dims) + self.stride = stride + + def forward(self, x, hw_shape): + B, N, C = x.shape + H, W = hw_shape + feat_token = x + # convert (B, N, C) to (B, C, H, W) + cnn_feat = feat_token.transpose(1, 2).view(B, C, H, W) + if self.stride == 1: + x = self.proj(cnn_feat) + cnn_feat + else: + x = self.proj(cnn_feat) + x = x.flatten(2).transpose(1, 2) + return x diff --git a/mmcls/utils/__init__.py b/mmcls/utils/__init__.py index 22ddae6439d..4afaf6ff4cc 100644 --- a/mmcls/utils/__init__.py +++ b/mmcls/utils/__init__.py @@ -1,5 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .collect_env import collect_env from .logger import get_root_logger, load_json_log +from .setup_env import setup_multi_processes -__all__ = ['collect_env', 'get_root_logger', 'load_json_log'] +__all__ = [ + 'collect_env', 'get_root_logger', 'load_json_log', 'setup_multi_processes' +] diff --git a/mmcls/utils/setup_env.py b/mmcls/utils/setup_env.py new file mode 100644 index 00000000000..21def2f0809 --- /dev/null +++ b/mmcls/utils/setup_env.py @@ -0,0 +1,47 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import platform +import warnings + +import cv2 +import torch.multiprocessing as mp + + +def setup_multi_processes(cfg): + """Setup multi-processing environment variables.""" + # set multi-process start method as `fork` to speed up the training + if platform.system() != 'Windows': + mp_start_method = cfg.get('mp_start_method', 'fork') + current_method = mp.get_start_method(allow_none=True) + if current_method is not None and current_method != mp_start_method: + warnings.warn( + f'Multi-processing start method `{mp_start_method}` is ' + f'different from the previous setting `{current_method}`.' + f'It will be force set to `{mp_start_method}`. You can change ' + f'this behavior by changing `mp_start_method` in your config.') + mp.set_start_method(mp_start_method, force=True) + + # disable opencv multithreading to avoid system being overloaded + opencv_num_threads = cfg.get('opencv_num_threads', 0) + cv2.setNumThreads(opencv_num_threads) + + # setup OMP threads + # This code is referred from https://github.com/pytorch/pytorch/blob/master/torch/distributed/run.py # noqa + if 'OMP_NUM_THREADS' not in os.environ and cfg.data.workers_per_gpu > 1: + omp_num_threads = 1 + warnings.warn( + f'Setting OMP_NUM_THREADS environment variable for each process ' + f'to be {omp_num_threads} in default, to avoid your system being ' + f'overloaded, please further tune the variable for optimal ' + f'performance in your application as needed.') + os.environ['OMP_NUM_THREADS'] = str(omp_num_threads) + + # setup MKL threads + if 'MKL_NUM_THREADS' not in os.environ and cfg.data.workers_per_gpu > 1: + mkl_num_threads = 1 + warnings.warn( + f'Setting MKL_NUM_THREADS environment variable for each process ' + f'to be {mkl_num_threads} in default, to avoid your system being ' + f'overloaded, please further tune the variable for optimal ' + f'performance in your application as needed.') + os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads) diff --git a/mmcls/version.py b/mmcls/version.py index 4d28f23da5b..5f1c47e4ea1 100644 --- a/mmcls/version.py +++ b/mmcls/version.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. 
All rights reserved -__version__ = '0.19.0' +__version__ = '0.20.0' def parse_version_info(version_str): diff --git a/model-index.yml b/model-index.yml index f6e167baa42..b8e05151de9 100644 --- a/model-index.yml +++ b/model-index.yml @@ -16,3 +16,7 @@ Import: - configs/conformer/metafile.yml - configs/regnet/metafile.yml - configs/deit/metafile.yml + - configs/twins/metafile.yml + - configs/efficientnet/metafile.yml + - configs/convnext/metafile.yml + - configs/hrnet/metafile.yml diff --git a/requirements/optional.txt b/requirements/optional.txt index ca3c4e779e9..5cf26773c70 100644 --- a/requirements/optional.txt +++ b/requirements/optional.txt @@ -1,2 +1,2 @@ -albumentations>=0.3.2 --no-binary imgaug,albumentations +albumentations>=0.3.2 requests diff --git a/setup.cfg b/setup.cfg index 54d18d66f3f..f15ce980ae3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -14,7 +14,7 @@ line_length = 79 multi_line_output = 0 known_standard_library = pkg_resources,setuptools known_first_party = mmcls -known_third_party = PIL,matplotlib,mmcv,mmdet,modelindex,numpy,onnxruntime,packaging,pytest,pytorch_sphinx_theme,requests,rich,sphinx,torch,torchvision,ts +known_third_party = PIL,cv2,matplotlib,mmcv,mmdet,modelindex,numpy,onnxruntime,packaging,pytest,pytorch_sphinx_theme,requests,rich,sphinx,tensorflow,torch,torchvision,ts no_lines_before = STDLIB,LOCALFOLDER default_section = THIRDPARTY diff --git a/setup.py b/setup.py index 3f9368a2d56..0ed33fb5c92 100644 --- a/setup.py +++ b/setup.py @@ -163,10 +163,7 @@ def add_mim_extension(): description='OpenMMLab Image Classification Toolbox and Benchmark', long_description=readme(), long_description_content_type='text/markdown', - author='MMClassification Contributors', - author_email='openmmlab@gmail.com', keywords='computer vision, image classification', - url='https://github.com/open-mmlab/mmclassification', packages=find_packages(exclude=('configs', 'tools', 'demo')), include_package_data=True, classifiers=[ @@ -179,8 +176,16 @@ def add_mim_extension(): 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', ], + url='https://github.com/open-mmlab/mmclassification', + author='MMClassification Contributors', + author_email='openmmlab@gmail.com', license='Apache License 2.0', - tests_require=parse_requirements('requirements/tests.txt'), install_requires=parse_requirements('requirements/runtime.txt'), + extras_require={ + 'all': parse_requirements('requirements.txt'), + 'tests': parse_requirements('requirements/tests.txt'), + 'optional': parse_requirements('requirements/optional.txt'), + }, zip_safe=False) diff --git a/tests/data/retinanet.py b/tests/data/retinanet.py index 2c38ae52796..e00120915b5 100644 --- a/tests/data/retinanet.py +++ b/tests/data/retinanet.py @@ -1,5 +1,5 @@ # small RetinaNet -num_classes=3 +num_classes = 3 # model settings model = dict( diff --git a/tests/test_data/test_builder.py b/tests/test_data/test_builder.py index 534a52e47a1..44d34890074 100644 --- a/tests/test_data/test_builder.py +++ b/tests/test_data/test_builder.py @@ -1,9 +1,14 @@ +import os.path as osp +from copy import deepcopy from unittest.mock import patch import torch from mmcv.utils import digit_version -from mmcls.datasets import build_dataloader +from mmcls.datasets import ImageNet, build_dataloader, build_dataset +from mmcls.datasets.dataset_wrappers import (ClassBalancedDataset, + ConcatDataset, KFoldDataset, + RepeatDataset) class 
TestDataloaderBuilder(): @@ -119,3 +124,148 @@ def test_distributed(self, _): expect = torch.tensor( [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6][1::2]) assert all(torch.cat(list(iter(dataloader))) == expect) + + +class TestDatasetBuilder(): + + @classmethod + def setup_class(cls): + data_prefix = osp.join(osp.dirname(__file__), '../data/dataset') + cls.dataset_cfg = dict( + type='ImageNet', + data_prefix=data_prefix, + ann_file=osp.join(data_prefix, 'ann.txt'), + pipeline=[], + test_mode=False, + ) + + def test_normal_dataset(self): + # Test build + dataset = build_dataset(self.dataset_cfg) + assert isinstance(dataset, ImageNet) + assert dataset.test_mode == self.dataset_cfg['test_mode'] + + # Test default_args + dataset = build_dataset(self.dataset_cfg, {'test_mode': True}) + assert dataset.test_mode == self.dataset_cfg['test_mode'] + + cp_cfg = deepcopy(self.dataset_cfg) + cp_cfg.pop('test_mode') + dataset = build_dataset(cp_cfg, {'test_mode': True}) + assert dataset.test_mode + + def test_concat_dataset(self): + # Test build + dataset = build_dataset([self.dataset_cfg, self.dataset_cfg]) + assert isinstance(dataset, ConcatDataset) + assert dataset.datasets[0].test_mode == self.dataset_cfg['test_mode'] + + # Test default_args + dataset = build_dataset([self.dataset_cfg, self.dataset_cfg], + {'test_mode': True}) + assert dataset.datasets[0].test_mode == self.dataset_cfg['test_mode'] + + cp_cfg = deepcopy(self.dataset_cfg) + cp_cfg.pop('test_mode') + dataset = build_dataset([cp_cfg, cp_cfg], {'test_mode': True}) + assert dataset.datasets[0].test_mode + + def test_repeat_dataset(self): + # Test build + dataset = build_dataset( + dict(type='RepeatDataset', dataset=self.dataset_cfg, times=3)) + assert isinstance(dataset, RepeatDataset) + assert dataset.dataset.test_mode == self.dataset_cfg['test_mode'] + + # Test default_args + dataset = build_dataset( + dict(type='RepeatDataset', dataset=self.dataset_cfg, times=3), + {'test_mode': True}) + assert dataset.dataset.test_mode == self.dataset_cfg['test_mode'] + + cp_cfg = deepcopy(self.dataset_cfg) + cp_cfg.pop('test_mode') + dataset = build_dataset( + dict(type='RepeatDataset', dataset=cp_cfg, times=3), + {'test_mode': True}) + assert dataset.dataset.test_mode + + def test_class_balance_dataset(self): + # Test build + dataset = build_dataset( + dict( + type='ClassBalancedDataset', + dataset=self.dataset_cfg, + oversample_thr=1., + )) + assert isinstance(dataset, ClassBalancedDataset) + assert dataset.dataset.test_mode == self.dataset_cfg['test_mode'] + + # Test default_args + dataset = build_dataset( + dict( + type='ClassBalancedDataset', + dataset=self.dataset_cfg, + oversample_thr=1., + ), {'test_mode': True}) + assert dataset.dataset.test_mode == self.dataset_cfg['test_mode'] + + cp_cfg = deepcopy(self.dataset_cfg) + cp_cfg.pop('test_mode') + dataset = build_dataset( + dict( + type='ClassBalancedDataset', + dataset=cp_cfg, + oversample_thr=1., + ), {'test_mode': True}) + assert dataset.dataset.test_mode + + def test_kfold_dataset(self): + # Test build + dataset = build_dataset( + dict( + type='KFoldDataset', + dataset=self.dataset_cfg, + fold=0, + num_splits=5, + test_mode=False, + )) + assert isinstance(dataset, KFoldDataset) + assert not dataset.test_mode + assert dataset.dataset.test_mode == self.dataset_cfg['test_mode'] + + # Test default_args + dataset = build_dataset( + dict( + type='KFoldDataset', + dataset=self.dataset_cfg, + fold=0, + num_splits=5, + test_mode=False, + ), + default_args={ + 'test_mode': True, + 
'classes': [1, 2, 3] + }) + assert not dataset.test_mode + assert dataset.dataset.test_mode == self.dataset_cfg['test_mode'] + assert dataset.dataset.CLASSES == [1, 2, 3] + + cp_cfg = deepcopy(self.dataset_cfg) + cp_cfg.pop('test_mode') + dataset = build_dataset( + dict( + type='KFoldDataset', + dataset=self.dataset_cfg, + fold=0, + num_splits=5, + ), + default_args={ + 'test_mode': True, + 'classes': [1, 2, 3] + }) + # The test_mode in default_args will be passed to KFoldDataset + assert dataset.test_mode + assert not dataset.dataset.test_mode + # Other default_args will be passed to child dataset. + assert dataset.dataset.CLASSES == [1, 2, 3] diff --git a/tests/test_data/test_datasets/test_dataset_wrapper.py b/tests/test_data/test_datasets/test_dataset_wrapper.py index 27a18dcae72..2798e1fbbaf 100644 --- a/tests/test_data/test_datasets/test_dataset_wrapper.py +++ b/tests/test_data/test_datasets/test_dataset_wrapper.py @@ -8,7 +8,20 @@ import pytest from mmcls.datasets import (BaseDataset, ClassBalancedDataset, ConcatDataset, - RepeatDataset) + KFoldDataset, RepeatDataset) + + +def mock_evaluate(results, + metric='accuracy', + metric_options=None, + indices=None, + logger=None): + return dict( + results=results, + metric=metric, + metric_options=metric_options, + indices=indices, + logger=logger) @patch.multiple(BaseDataset, __abstractmethods__=set()) @@ -23,6 +36,8 @@ def construct_toy_multi_label_dataset(length): dataset.data_infos = MagicMock() dataset.data_infos.__len__.return_value = length dataset.get_cat_ids = MagicMock(side_effect=lambda idx: cat_ids_list[idx]) + + dataset.evaluate = MagicMock(side_effect=mock_evaluate) return dataset, cat_ids_list @@ -35,6 +50,7 @@ def construct_toy_single_label_dataset(length): dataset.data_infos = MagicMock() dataset.data_infos.__len__.return_value = length dataset.get_cat_ids = MagicMock(side_effect=lambda idx: cat_ids_list[idx]) + dataset.evaluate = MagicMock(side_effect=mock_evaluate) return dataset, cat_ids_list @@ -107,3 +123,49 @@ def test_class_balanced_dataset(construct_dataset): for idx in np.random.randint(0, len(repeat_factor_dataset), 3): assert repeat_factor_dataset[idx] == bisect.bisect_right( repeat_factors_cumsum, idx) + + +@pytest.mark.parametrize('construct_dataset', [ + 'construct_toy_multi_label_dataset', 'construct_toy_single_label_dataset' +]) +def test_kfold_dataset(construct_dataset): + construct_toy_dataset = eval(construct_dataset) + dataset, _ = construct_toy_dataset(10) + + # test without random seed + train_datasets = [ + KFoldDataset(dataset, fold=i, num_splits=3, test_mode=False) + for i in range(5) + ] + test_datasets = [ + KFoldDataset(dataset, fold=i, num_splits=3, test_mode=True) + for i in range(5) + ] + + assert sum([i.indices for i in test_datasets], []) == list(range(10)) + for train_set, test_set in zip(train_datasets, test_datasets): + train_samples = [train_set[i] for i in range(len(train_set))] + test_samples = [test_set[i] for i in range(len(test_set))] + assert set(train_samples + test_samples) == set(range(10)) + + # test with random seed + train_datasets = [ + KFoldDataset(dataset, fold=i, num_splits=3, test_mode=False, seed=1) + for i in range(5) + ] + test_datasets = [ + KFoldDataset(dataset, fold=i, num_splits=3, test_mode=True, seed=1) + for i in range(5) + ] + + assert sum([i.indices for i in test_datasets], []) != list(range(10)) + assert set(sum([i.indices for i in test_datasets], [])) == set(range(10)) + for train_set, test_set in zip(train_datasets, test_datasets): + train_samples = 
[train_set[i] for i in range(len(train_set))] + test_samples = [test_set[i] for i in range(len(test_set))] + assert set(train_samples + test_samples) == set(range(10)) + + # test evaluate + for test_set in test_datasets: + eval_inputs = test_set.evaluate(None) + assert eval_inputs['indices'] == test_set.indices diff --git a/tests/test_downstream/test_mmdet_inference.py b/tests/test_downstream/test_mmdet_inference.py index ba431136807..6da3ba16d29 100644 --- a/tests/test_downstream/test_mmdet_inference.py +++ b/tests/test_downstream/test_mmdet_inference.py @@ -5,7 +5,9 @@ from mmdet.models import build_detector from mmcls.models import (MobileNetV2, MobileNetV3, RegNet, ResNeSt, ResNet, - ResNeXt, SEResNet, SEResNeXt, SwinTransformer) + ResNeXt, SEResNet, SEResNeXt, SwinTransformer, + TIMMBackbone) +from mmcls.models.backbones.timm_backbone import timm backbone_configs = dict( mobilenetv2=dict( @@ -52,7 +54,23 @@ img_size=800, out_indices=(2, 3), auto_pad=True), - out_channels=[384, 768])) + out_channels=[384, 768]), + timm_efficientnet=dict( + backbone=dict( + type='mmcls.TIMMBackbone', + model_name='efficientnet_b1', + features_only=True, + pretrained=False, + out_indices=(1, 2, 3, 4)), + out_channels=[24, 40, 112, 320]), + timm_resnet=dict( + backbone=dict( + type='mmcls.TIMMBackbone', + model_name='resnet50', + features_only=True, + pretrained=False, + out_indices=(1, 2, 3, 4)), + out_channels=[256, 512, 1024, 2048])) module_mapping = { 'mobilenetv2': MobileNetV2, @@ -63,7 +81,9 @@ 'seresnext': SEResNeXt, 'seresnet': SEResNet, 'resnest': ResNeSt, - 'swin': SwinTransformer + 'swin': SwinTransformer, + 'timm_efficientnet': TIMMBackbone, + 'timm_resnet': TIMMBackbone } @@ -73,6 +93,11 @@ def test_mmdet_inference(): img1 = rng.rand(100, 100, 3) for module_name, backbone_config in backbone_configs.items(): + module = module_mapping[module_name] + if module is TIMMBackbone and timm is None: + print(f'skip {module_name} because timm is not available') + continue + print(f'test {module_name}') config = Config.fromfile(config_path) config.model.backbone = backbone_config['backbone'] out_channels = backbone_config['out_channels'] @@ -85,12 +110,10 @@ def test_mmdet_inference(): config.model.neck.in_channels = out_channels model = build_detector(config.model) - module = module_mapping[module_name] assert isinstance(model.backbone, module) model.cfg = config model.eval() - print(module_name) result = inference_detector(model, img1) assert len(result) == config.num_classes diff --git a/tests/test_metrics/test_losses.py b/tests/test_metrics/test_losses.py index 8b67fd29ce4..74eec620548 100644 --- a/tests/test_metrics/test_losses.py +++ b/tests/test_metrics/test_losses.py @@ -36,6 +36,46 @@ def test_asymmetric_loss(): loss = build_loss(loss_cfg) assert torch.allclose(loss(cls_score, label), torch.tensor(5.1186 / 3)) + # test asymmetric_loss with softmax for single label task + cls_score = torch.Tensor([[5, -5, 0], [5, -5, 0]]) + label = torch.Tensor([0, 1]) + weight = torch.tensor([0.5, 0.5]) + loss_cfg = dict( + type='AsymmetricLoss', + gamma_pos=0.0, + gamma_neg=0.0, + clip=None, + reduction='mean', + loss_weight=1.0, + use_sigmoid=False, + eps=1e-8) + loss = build_loss(loss_cfg) + # test asymmetric_loss for single label task without weight + assert torch.allclose(loss(cls_score, label), torch.tensor(2.5045)) + # test asymmetric_loss for single label task with weight + assert torch.allclose( + loss(cls_score, label, weight=weight), torch.tensor(2.5045 * 0.5)) + + # test soft asymmetric_loss with 
softmax + cls_score = torch.Tensor([[5, -5, 0], [5, -5, 0]]) + label = torch.Tensor([[1, 0, 0], [0, 1, 0]]) + weight = torch.tensor([0.5, 0.5]) + loss_cfg = dict( + type='AsymmetricLoss', + gamma_pos=0.0, + gamma_neg=0.0, + clip=None, + reduction='mean', + loss_weight=1.0, + use_sigmoid=False, + eps=1e-8) + loss = build_loss(loss_cfg) + # test soft asymmetric_loss with softmax without weight + assert torch.allclose(loss(cls_score, label), torch.tensor(2.5045)) + # test soft asymmetric_loss with softmax with weight + assert torch.allclose( + loss(cls_score, label, weight=weight), torch.tensor(2.5045 * 0.5)) + def test_cross_entropy_loss(): with pytest.raises(AssertionError): diff --git a/tests/test_models/test_backbones/test_convnext.py b/tests/test_models/test_backbones/test_convnext.py new file mode 100644 index 00000000000..35448b458b1 --- /dev/null +++ b/tests/test_models/test_backbones/test_convnext.py @@ -0,0 +1,86 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch + +from mmcls.models.backbones import ConvNeXt + + +def test_assertion(): + with pytest.raises(AssertionError): + ConvNeXt(arch='unknown') + + with pytest.raises(AssertionError): + # ConvNeXt arch dict should include 'embed_dims', + ConvNeXt(arch=dict(channels=[2, 3, 4, 5])) + + with pytest.raises(AssertionError): + # ConvNeXt arch dict should include 'embed_dims', + ConvNeXt(arch=dict(depths=[2, 3, 4], channels=[2, 3, 4, 5])) + + +def test_convnext(): + + # Test forward + model = ConvNeXt(arch='tiny', out_indices=-1) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 1 + assert feat[0].shape == torch.Size([1, 768]) + + # Test forward with multiple outputs + model = ConvNeXt(arch='small', out_indices=(0, 1, 2, 3)) + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 4 + assert feat[0].shape == torch.Size([1, 96]) + assert feat[1].shape == torch.Size([1, 192]) + assert feat[2].shape == torch.Size([1, 384]) + assert feat[3].shape == torch.Size([1, 768]) + + # Test with custom arch + model = ConvNeXt( + arch={ + 'depths': [2, 3, 4, 5, 6], + 'channels': [16, 32, 64, 128, 256] + }, + out_indices=(0, 1, 2, 3, 4)) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 5 + assert feat[0].shape == torch.Size([1, 16]) + assert feat[1].shape == torch.Size([1, 32]) + assert feat[2].shape == torch.Size([1, 64]) + assert feat[3].shape == torch.Size([1, 128]) + assert feat[4].shape == torch.Size([1, 256]) + + # Test without gap before final norm + model = ConvNeXt( + arch='small', out_indices=(0, 1, 2, 3), gap_before_final_norm=False) + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 4 + assert feat[0].shape == torch.Size([1, 96, 56, 56]) + assert feat[1].shape == torch.Size([1, 192, 28, 28]) + assert feat[2].shape == torch.Size([1, 384, 14, 14]) + assert feat[3].shape == torch.Size([1, 768, 7, 7]) + + # Test frozen_stages + model = ConvNeXt(arch='small', out_indices=(0, 1, 2, 3), frozen_stages=2) + model.init_weights() + model.train() + + for i in range(2): + assert not model.downsample_layers[i].training + assert not model.stages[i].training + + for i in range(2, 4): + assert model.downsample_layers[i].training + assert model.stages[i].training diff --git a/tests/test_models/test_backbones/test_efficientnet.py b/tests/test_models/test_backbones/test_efficientnet.py new file mode 100644 index 
00000000000..3d3a70d160c --- /dev/null +++ b/tests/test_models/test_backbones/test_efficientnet.py @@ -0,0 +1,143 @@ +import pytest +import torch +from torch.nn.modules import GroupNorm +from torch.nn.modules.batchnorm import _BatchNorm + +from mmcls.models.backbones import EfficientNet + + +def is_norm(modules): + """Check if is one of the norms.""" + if isinstance(modules, (GroupNorm, _BatchNorm)): + return True + return False + + +def check_norm_state(modules, train_state): + """Check if norm layer is in correct train state.""" + for mod in modules: + if isinstance(mod, _BatchNorm): + if mod.training != train_state: + return False + return True + + +def test_efficientnet_backbone(): + archs = ['b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b7', 'b8', 'es', 'em', 'el'] + with pytest.raises(TypeError): + # pretrained must be a string path + model = EfficientNet() + model.init_weights(pretrained=0) + + with pytest.raises(AssertionError): + # arch must in arc_settings + EfficientNet(arch='others') + + for arch in archs: + with pytest.raises(ValueError): + # frozen_stages must less than 7 + EfficientNet(arch=arch, frozen_stages=12) + + # Test EfficientNet + model = EfficientNet() + model.init_weights() + model.train() + + # Test EfficientNet with first stage frozen + frozen_stages = 7 + model = EfficientNet(arch='b0', frozen_stages=frozen_stages) + model.init_weights() + model.train() + for i in range(frozen_stages): + layer = model.layers[i] + for mod in layer.modules(): + if isinstance(mod, _BatchNorm): + assert mod.training is False + for param in layer.parameters(): + assert param.requires_grad is False + + # Test EfficientNet with norm eval + model = EfficientNet(norm_eval=True) + model.init_weights() + model.train() + assert check_norm_state(model.modules(), False) + + # Test EfficientNet forward with 'b0' arch + out_channels = [32, 16, 24, 40, 112, 320, 1280] + model = EfficientNet(arch='b0', out_indices=(0, 1, 2, 3, 4, 5, 6)) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 7 + assert feat[0].shape == torch.Size([1, out_channels[0], 112, 112]) + assert feat[1].shape == torch.Size([1, out_channels[1], 112, 112]) + assert feat[2].shape == torch.Size([1, out_channels[2], 56, 56]) + assert feat[3].shape == torch.Size([1, out_channels[3], 28, 28]) + assert feat[4].shape == torch.Size([1, out_channels[4], 14, 14]) + assert feat[5].shape == torch.Size([1, out_channels[5], 7, 7]) + assert feat[6].shape == torch.Size([1, out_channels[6], 7, 7]) + + # Test EfficientNet forward with 'b0' arch and GroupNorm + out_channels = [32, 16, 24, 40, 112, 320, 1280] + model = EfficientNet( + arch='b0', + out_indices=(0, 1, 2, 3, 4, 5, 6), + norm_cfg=dict(type='GN', num_groups=2, requires_grad=True)) + for m in model.modules(): + if is_norm(m): + assert isinstance(m, GroupNorm) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 7 + assert feat[0].shape == torch.Size([1, out_channels[0], 112, 112]) + assert feat[1].shape == torch.Size([1, out_channels[1], 112, 112]) + assert feat[2].shape == torch.Size([1, out_channels[2], 56, 56]) + assert feat[3].shape == torch.Size([1, out_channels[3], 28, 28]) + assert feat[4].shape == torch.Size([1, out_channels[4], 14, 14]) + assert feat[5].shape == torch.Size([1, out_channels[5], 7, 7]) + assert feat[6].shape == torch.Size([1, out_channels[6], 7, 7]) + + # Test EfficientNet forward with 'es' arch + out_channels = [32, 24, 32, 48, 144, 
192, 1280] + model = EfficientNet(arch='es', out_indices=(0, 1, 2, 3, 4, 5, 6)) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 7 + assert feat[0].shape == torch.Size([1, out_channels[0], 112, 112]) + assert feat[1].shape == torch.Size([1, out_channels[1], 112, 112]) + assert feat[2].shape == torch.Size([1, out_channels[2], 56, 56]) + assert feat[3].shape == torch.Size([1, out_channels[3], 28, 28]) + assert feat[4].shape == torch.Size([1, out_channels[4], 14, 14]) + assert feat[5].shape == torch.Size([1, out_channels[5], 7, 7]) + assert feat[6].shape == torch.Size([1, out_channels[6], 7, 7]) + + # Test EfficientNet forward with 'es' arch and GroupNorm + out_channels = [32, 24, 32, 48, 144, 192, 1280] + model = EfficientNet( + arch='es', + out_indices=(0, 1, 2, 3, 4, 5, 6), + norm_cfg=dict(type='GN', num_groups=2, requires_grad=True)) + for m in model.modules(): + if is_norm(m): + assert isinstance(m, GroupNorm) + model.init_weights() + model.train() + + imgs = torch.randn(1, 3, 224, 224) + feat = model(imgs) + assert len(feat) == 7 + assert feat[0].shape == torch.Size([1, out_channels[0], 112, 112]) + assert feat[1].shape == torch.Size([1, out_channels[1], 112, 112]) + assert feat[2].shape == torch.Size([1, out_channels[2], 56, 56]) + assert feat[3].shape == torch.Size([1, out_channels[3], 28, 28]) + assert feat[4].shape == torch.Size([1, out_channels[4], 14, 14]) + assert feat[5].shape == torch.Size([1, out_channels[5], 7, 7]) + assert feat[6].shape == torch.Size([1, out_channels[6], 7, 7]) diff --git a/tests/test_models/test_backbones/test_hrnet.py b/tests/test_models/test_backbones/test_hrnet.py new file mode 100644 index 00000000000..cb9909a8923 --- /dev/null +++ b/tests/test_models/test_backbones/test_hrnet.py @@ -0,0 +1,93 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
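+# For the predefined archs ('w18' ... 'w64'), stage i of HRNet outputs
+# base_channels * 2**i channels and the spatial size halves per stage
+# (56 -> 28 -> 14 -> 7 for 224x224 inputs); the parametrized test below
+# asserts exactly this channel and resolution progression.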
+import pytest +import torch +from torch.nn.modules import GroupNorm +from torch.nn.modules.batchnorm import _BatchNorm + +from mmcls.models.backbones import HRNet + + +def is_norm(modules): + """Check if is one of the norms.""" + if isinstance(modules, (GroupNorm, _BatchNorm)): + return True + return False + + +def check_norm_state(modules, train_state): + """Check if norm layer is in correct train state.""" + for mod in modules: + if isinstance(mod, _BatchNorm): + if mod.training != train_state: + return False + return True + + +@pytest.mark.parametrize('base_channels', [18, 30, 32, 40, 44, 48, 64]) +def test_hrnet_arch_zoo(base_channels): + + cfg_ori = dict(arch=f'w{base_channels}') + + # Test HRNet model with input size of 224 + model = HRNet(**cfg_ori) + model.init_weights() + model.train() + + assert check_norm_state(model.modules(), True) + + imgs = torch.randn(3, 3, 224, 224) + outs = model(imgs) + out_channels = base_channels + out_size = 56 + assert isinstance(outs, tuple) + for out in outs: + assert out.shape == (3, out_channels, out_size, out_size) + out_channels = out_channels * 2 + out_size = out_size // 2 + + +def test_hrnet_custom_arch(): + + cfg_ori = dict( + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BOTTLENECK', + num_blocks=(4, 4, 2), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 3, 4, 4), + num_channels=(32, 64, 152, 256)), + ), ) + + # Test HRNet model with input size of 224 + model = HRNet(**cfg_ori) + model.init_weights() + model.train() + + assert check_norm_state(model.modules(), True) + + imgs = torch.randn(3, 3, 224, 224) + outs = model(imgs) + out_channels = (32, 64, 152, 256) + out_size = 56 + assert isinstance(outs, tuple) + for out, out_channel in zip(outs, out_channels): + assert out.shape == (3, out_channel, out_size, out_size) + out_size = out_size // 2 diff --git a/tests/test_models/test_backbones/test_timm_backbone.py b/tests/test_models/test_backbones/test_timm_backbone.py index 1ab06879c6b..4c6ae925dbe 100644 --- a/tests/test_models/test_backbones/test_timm_backbone.py +++ b/tests/test_models/test_backbones/test_timm_backbone.py @@ -17,10 +17,14 @@ def check_norm_state(modules, train_state): def test_timm_backbone(): + """Test timm backbones, features_only=False (default).""" with pytest.raises(TypeError): - # pretrained must be a string path - model = TIMMBackbone() - model.init_weights(pretrained=0) + # TIMMBackbone has 1 required positional argument: 'model_name' + model = TIMMBackbone(pretrained=True) + + with pytest.raises(TypeError): + # pretrained must be bool + model = TIMMBackbone(model_name='resnet18', pretrained='model.pth') # Test resnet18 from timm model = TIMMBackbone(model_name='resnet18') @@ -57,3 +61,143 @@ def test_timm_backbone(): feat = model(imgs) assert len(feat) == 1 assert feat[0].shape == torch.Size((1, 192)) + + +def test_timm_backbone_features_only(): + """Test timm backbones, features_only=True.""" + # Test different norm_layer, can be: 'SyncBN', 'BN2d', 'GN', 'LN', 'IN' + # Test resnet18 from timm, norm_layer='BN2d' + model = TIMMBackbone( + model_name='resnet18', + features_only=True, + pretrained=False, + output_stride=32, + norm_layer='BN2d') + + # Test resnet18 from timm, norm_layer='SyncBN' + 
model = TIMMBackbone( + model_name='resnet18', + features_only=True, + pretrained=False, + output_stride=32, + norm_layer='SyncBN') + + # Test resnet18 from timm, output_stride=32 + model = TIMMBackbone( + model_name='resnet18', + features_only=True, + pretrained=False, + output_stride=32) + model.init_weights() + model.train() + assert check_norm_state(model.modules(), True) + + imgs = torch.randn(1, 3, 224, 224) + feats = model(imgs) + assert len(feats) == 5 + assert feats[0].shape == torch.Size((1, 64, 112, 112)) + assert feats[1].shape == torch.Size((1, 64, 56, 56)) + assert feats[2].shape == torch.Size((1, 128, 28, 28)) + assert feats[3].shape == torch.Size((1, 256, 14, 14)) + assert feats[4].shape == torch.Size((1, 512, 7, 7)) + + # Test resnet18 from timm, output_stride=32, out_indices=(1, 2, 3) + model = TIMMBackbone( + model_name='resnet18', + features_only=True, + pretrained=False, + output_stride=32, + out_indices=(1, 2, 3)) + imgs = torch.randn(1, 3, 224, 224) + feats = model(imgs) + assert len(feats) == 3 + assert feats[0].shape == torch.Size((1, 64, 56, 56)) + assert feats[1].shape == torch.Size((1, 128, 28, 28)) + assert feats[2].shape == torch.Size((1, 256, 14, 14)) + + # Test resnet18 from timm, output_stride=16 + model = TIMMBackbone( + model_name='resnet18', + features_only=True, + pretrained=False, + output_stride=16) + imgs = torch.randn(1, 3, 224, 224) + feats = model(imgs) + assert len(feats) == 5 + assert feats[0].shape == torch.Size((1, 64, 112, 112)) + assert feats[1].shape == torch.Size((1, 64, 56, 56)) + assert feats[2].shape == torch.Size((1, 128, 28, 28)) + assert feats[3].shape == torch.Size((1, 256, 14, 14)) + assert feats[4].shape == torch.Size((1, 512, 14, 14)) + + # Test resnet18 from timm, output_stride=8 + model = TIMMBackbone( + model_name='resnet18', + features_only=True, + pretrained=False, + output_stride=8) + imgs = torch.randn(1, 3, 224, 224) + feats = model(imgs) + assert len(feats) == 5 + assert feats[0].shape == torch.Size((1, 64, 112, 112)) + assert feats[1].shape == torch.Size((1, 64, 56, 56)) + assert feats[2].shape == torch.Size((1, 128, 28, 28)) + assert feats[3].shape == torch.Size((1, 256, 28, 28)) + assert feats[4].shape == torch.Size((1, 512, 28, 28)) + + # Test efficientnet_b1 with pretrained weights + model = TIMMBackbone( + model_name='efficientnet_b1', features_only=True, pretrained=True) + imgs = torch.randn(1, 3, 64, 64) + feats = model(imgs) + assert len(feats) == 5 + assert feats[0].shape == torch.Size((1, 16, 32, 32)) + assert feats[1].shape == torch.Size((1, 24, 16, 16)) + assert feats[2].shape == torch.Size((1, 40, 8, 8)) + assert feats[3].shape == torch.Size((1, 112, 4, 4)) + assert feats[4].shape == torch.Size((1, 320, 2, 2)) + + # Test resnetv2_50x1_bitm from timm, output_stride=8 + model = TIMMBackbone( + model_name='resnetv2_50x1_bitm', + features_only=True, + pretrained=False, + output_stride=8) + imgs = torch.randn(1, 3, 8, 8) + feats = model(imgs) + assert len(feats) == 5 + assert feats[0].shape == torch.Size((1, 64, 4, 4)) + assert feats[1].shape == torch.Size((1, 256, 2, 2)) + assert feats[2].shape == torch.Size((1, 512, 1, 1)) + assert feats[3].shape == torch.Size((1, 1024, 1, 1)) + assert feats[4].shape == torch.Size((1, 2048, 1, 1)) + + # Test resnetv2_50x3_bitm from timm, output_stride=8 + model = TIMMBackbone( + model_name='resnetv2_50x3_bitm', + features_only=True, + pretrained=False, + output_stride=8) + imgs = torch.randn(1, 3, 8, 8) + feats = model(imgs) + assert len(feats) == 5 + assert feats[0].shape == 
torch.Size((1, 192, 4, 4)) + assert feats[1].shape == torch.Size((1, 768, 2, 2)) + assert feats[2].shape == torch.Size((1, 1536, 1, 1)) + assert feats[3].shape == torch.Size((1, 3072, 1, 1)) + assert feats[4].shape == torch.Size((1, 6144, 1, 1)) + + # Test resnetv2_101x1_bitm from timm, output_stride=8 + model = TIMMBackbone( + model_name='resnetv2_101x1_bitm', + features_only=True, + pretrained=False, + output_stride=8) + imgs = torch.randn(1, 3, 8, 8) + feats = model(imgs) + assert len(feats) == 5 + assert feats[0].shape == torch.Size((1, 64, 4, 4)) + assert feats[1].shape == torch.Size((1, 256, 2, 2)) + assert feats[2].shape == torch.Size((1, 512, 1, 1)) + assert feats[3].shape == torch.Size((1, 1024, 1, 1)) + assert feats[4].shape == torch.Size((1, 2048, 1, 1)) diff --git a/tests/test_models/test_backbones/test_twins.py b/tests/test_models/test_backbones/test_twins.py new file mode 100644 index 00000000000..b692584315e --- /dev/null +++ b/tests/test_models/test_backbones/test_twins.py @@ -0,0 +1,243 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import pytest +import torch +import torch.nn as nn + +from mmcls.models.backbones.twins import (PCPVT, SVT, + GlobalSubsampledAttention, + LocallyGroupedSelfAttention) + + +def test_LSA_module(): + lsa = LocallyGroupedSelfAttention(embed_dims=32, window_size=3) + outs = lsa(torch.randn(1, 3136, 32), (56, 56)) + assert outs.shape == torch.Size([1, 3136, 32]) + + +def test_GSA_module(): + gsa = GlobalSubsampledAttention(embed_dims=32, num_heads=8) + outs = gsa(torch.randn(1, 3136, 32), (56, 56)) + assert outs.shape == torch.Size([1, 3136, 32]) + + +def test_pcpvt(): + # test init + path = 'PATH_THAT_DO_NOT_EXIST' + + # init_cfg loads pretrain from an non-existent file + model = PCPVT('s', init_cfg=dict(type='Pretrained', checkpoint=path)) + assert model.init_cfg == dict(type='Pretrained', checkpoint=path) + + # Test loading a checkpoint from an non-existent file + with pytest.raises(OSError): + model.init_weights() + + # init_cfg=123, whose type is unsupported + model = PCPVT('s', init_cfg=123) + with pytest.raises(TypeError): + model.init_weights() + + H, W = (64, 64) + temp = torch.randn((1, 3, H, W)) + + # test output last feat + model = PCPVT('small') + model.init_weights() + outs = model(temp) + assert len(outs) == 1 + assert outs[-1].shape == (1, 512, H // 32, W // 32) + + # test with mutil outputs + model = PCPVT('small', out_indices=(0, 1, 2, 3)) + model.init_weights() + outs = model(temp) + assert len(outs) == 4 + assert outs[0].shape == (1, 64, H // 4, W // 4) + assert outs[1].shape == (1, 128, H // 8, W // 8) + assert outs[2].shape == (1, 320, H // 16, W // 16) + assert outs[3].shape == (1, 512, H // 32, W // 32) + + # test with arch of dict + arch = { + 'embed_dims': [64, 128, 320, 512], + 'depths': [3, 4, 18, 3], + 'num_heads': [1, 2, 5, 8], + 'patch_sizes': [4, 2, 2, 2], + 'strides': [4, 2, 2, 2], + 'mlp_ratios': [8, 8, 4, 4], + 'sr_ratios': [8, 4, 2, 1] + } + + pcpvt_arch = copy.deepcopy(arch) + model = PCPVT(pcpvt_arch, out_indices=(0, 1, 2, 3)) + model.init_weights() + outs = model(temp) + assert len(outs) == 4 + assert outs[0].shape == (1, 64, H // 4, W // 4) + assert outs[1].shape == (1, 128, H // 8, W // 8) + assert outs[2].shape == (1, 320, H // 16, W // 16) + assert outs[3].shape == (1, 512, H // 32, W // 32) + + # assert length of arch value not equal + pcpvt_arch = copy.deepcopy(arch) + pcpvt_arch['sr_ratios'] = [8, 4, 2] + with pytest.raises(AssertionError): + model = PCPVT(pcpvt_arch, out_indices=(0, 
1, 2, 3)) + + # assert lack arch essential_keys + pcpvt_arch = copy.deepcopy(arch) + del pcpvt_arch['sr_ratios'] + with pytest.raises(AssertionError): + model = PCPVT(pcpvt_arch, out_indices=(0, 1, 2, 3)) + + # assert arch value not list + pcpvt_arch = copy.deepcopy(arch) + pcpvt_arch['sr_ratios'] = 1 + with pytest.raises(AssertionError): + model = PCPVT(pcpvt_arch, out_indices=(0, 1, 2, 3)) + + pcpvt_arch = copy.deepcopy(arch) + pcpvt_arch['sr_ratios'] = '1, 2, 3, 4' + with pytest.raises(AssertionError): + model = PCPVT(pcpvt_arch, out_indices=(0, 1, 2, 3)) + + # test norm_after_stage is bool True + model = PCPVT('small', norm_after_stage=True, norm_cfg=dict(type='LN')) + for i in range(model.num_stage): + assert hasattr(model, f'norm_after_stage{i}') + assert isinstance(getattr(model, f'norm_after_stage{i}'), nn.LayerNorm) + + # test norm_after_stage is bool Flase + model = PCPVT('small', norm_after_stage=False) + for i in range(model.num_stage): + assert hasattr(model, f'norm_after_stage{i}') + assert isinstance(getattr(model, f'norm_after_stage{i}'), nn.Identity) + + # test norm_after_stage is bool list + norm_after_stage = [False, True, False, True] + model = PCPVT('small', norm_after_stage=norm_after_stage) + assert len(norm_after_stage) == model.num_stage + for i in range(model.num_stage): + assert hasattr(model, f'norm_after_stage{i}') + norm_layer = getattr(model, f'norm_after_stage{i}') + if norm_after_stage[i]: + assert isinstance(norm_layer, nn.LayerNorm) + else: + assert isinstance(norm_layer, nn.Identity) + + # test norm_after_stage is not bool list + norm_after_stage = [False, 'True', False, True] + with pytest.raises(AssertionError): + model = PCPVT('small', norm_after_stage=norm_after_stage) + + +def test_svt(): + # test init + path = 'PATH_THAT_DO_NOT_EXIST' + + # init_cfg loads pretrain from an non-existent file + model = SVT('s', init_cfg=dict(type='Pretrained', checkpoint=path)) + assert model.init_cfg == dict(type='Pretrained', checkpoint=path) + + # Test loading a checkpoint from an non-existent file + with pytest.raises(OSError): + model.init_weights() + + # init_cfg=123, whose type is unsupported + model = SVT('s', init_cfg=123) + with pytest.raises(TypeError): + model.init_weights() + + # Test feature map output + H, W = (64, 64) + temp = torch.randn((1, 3, H, W)) + + model = SVT('s') + model.init_weights() + outs = model(temp) + assert len(outs) == 1 + assert outs[-1].shape == (1, 512, H // 32, W // 32) + + # test with mutil outputs + model = SVT('small', out_indices=(0, 1, 2, 3)) + model.init_weights() + outs = model(temp) + assert len(outs) == 4 + assert outs[0].shape == (1, 64, H // 4, W // 4) + assert outs[1].shape == (1, 128, H // 8, W // 8) + assert outs[2].shape == (1, 256, H // 16, W // 16) + assert outs[3].shape == (1, 512, H // 32, W // 32) + + # test with arch of dict + arch = { + 'embed_dims': [96, 192, 384, 768], + 'depths': [2, 2, 18, 2], + 'num_heads': [3, 6, 12, 24], + 'patch_sizes': [4, 2, 2, 2], + 'strides': [4, 2, 2, 2], + 'mlp_ratios': [4, 4, 4, 4], + 'sr_ratios': [8, 4, 2, 1], + 'window_sizes': [7, 7, 7, 7] + } + model = SVT(arch, out_indices=(0, 1, 2, 3)) + model.init_weights() + outs = model(temp) + assert len(outs) == 4 + assert outs[0].shape == (1, 96, H // 4, W // 4) + assert outs[1].shape == (1, 192, H // 8, W // 8) + assert outs[2].shape == (1, 384, H // 16, W // 16) + assert outs[3].shape == (1, 768, H // 32, W // 32) + + # assert length of arch value not equal + svt_arch = copy.deepcopy(arch) + svt_arch['sr_ratios'] = [8, 4, 2] + 
with pytest.raises(AssertionError): + model = SVT(svt_arch, out_indices=(0, 1, 2, 3)) + + # assert lack arch essential_keys + svt_arch = copy.deepcopy(arch) + del svt_arch['window_sizes'] + with pytest.raises(AssertionError): + model = SVT(svt_arch, out_indices=(0, 1, 2, 3)) + + # assert arch value not list + svt_arch = copy.deepcopy(arch) + svt_arch['sr_ratios'] = 1 + with pytest.raises(AssertionError): + model = SVT(svt_arch, out_indices=(0, 1, 2, 3)) + + svt_arch = copy.deepcopy(arch) + svt_arch['sr_ratios'] = '1, 2, 3, 4' + with pytest.raises(AssertionError): + model = SVT(svt_arch, out_indices=(0, 1, 2, 3)) + + # test norm_after_stage is bool True + model = SVT('small', norm_after_stage=True, norm_cfg=dict(type='LN')) + for i in range(model.num_stage): + assert hasattr(model, f'norm_after_stage{i}') + assert isinstance(getattr(model, f'norm_after_stage{i}'), nn.LayerNorm) + + # test norm_after_stage is bool Flase + model = SVT('small', norm_after_stage=False) + for i in range(model.num_stage): + assert hasattr(model, f'norm_after_stage{i}') + assert isinstance(getattr(model, f'norm_after_stage{i}'), nn.Identity) + + # test norm_after_stage is bool list + norm_after_stage = [False, True, False, True] + model = SVT('small', norm_after_stage=norm_after_stage) + assert len(norm_after_stage) == model.num_stage + for i in range(model.num_stage): + assert hasattr(model, f'norm_after_stage{i}') + norm_layer = getattr(model, f'norm_after_stage{i}') + if norm_after_stage[i]: + assert isinstance(norm_layer, nn.LayerNorm) + else: + assert isinstance(norm_layer, nn.Identity) + + # test norm_after_stage is not bool list + norm_after_stage = [False, 'True', False, True] + with pytest.raises(AssertionError): + model = SVT('small', norm_after_stage=norm_after_stage) diff --git a/tests/test_models/test_neck.py b/tests/test_models/test_neck.py index c7c36443667..08e2e421927 100644 --- a/tests/test_models/test_neck.py +++ b/tests/test_models/test_neck.py @@ -2,7 +2,7 @@ import pytest import torch -from mmcls.models.necks import GlobalAveragePooling +from mmcls.models.necks import GlobalAveragePooling, HRFuseScales def test_gap_neck(): @@ -37,3 +37,24 @@ def test_gap_neck(): with pytest.raises(AssertionError): # dim must in [1, 2, 3] GlobalAveragePooling(dim='other') + + +def test_hr_fuse_scales(): + + in_channels = (18, 32, 64, 128) + neck = HRFuseScales(in_channels=in_channels, out_channels=1024) + + feat_size = 56 + inputs = [] + for in_channel in in_channels: + input_tensor = torch.rand(3, in_channel, feat_size, feat_size) + inputs.append(input_tensor) + feat_size = feat_size // 2 + + with pytest.raises(AssertionError): + neck(inputs) + + outs = neck(tuple(inputs)) + assert isinstance(outs, tuple) + assert len(outs) == 1 + assert outs[0].shape == (3, 1024, 7, 7) diff --git a/tests/test_models/test_utils/test_augment.py b/tests/test_models/test_utils/test_augment.py index dd7e1e0bbad..a037ad5c356 100644 --- a/tests/test_models/test_utils/test_augment.py +++ b/tests/test_models/test_utils/test_augment.py @@ -1,8 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. 
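+# The binary and multi-label augment tests added below rely on
+# one_hot_encoding returning (N, 1) and (N, C) label tensors unchanged,
+# so BatchMixup, BatchCutMix and Identity preserve the label shape
+# instead of re-encoding it.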
+import pytest import torch from mmcls.models.utils import Augments +augment_cfgs = [ + dict(type='BatchCutMix', alpha=1., prob=1.), + dict(type='BatchMixup', alpha=1., prob=1.), + dict(type='Identity', prob=1.), +] + def test_augments(): imgs = torch.randn(4, 3, 32, 32) @@ -50,3 +57,31 @@ def test_augments(): mixed_imgs, mixed_labels = augs(imgs, labels) assert mixed_imgs.shape == torch.Size((4, 3, 32, 32)) assert mixed_labels.shape == torch.Size((4, 10)) + + +@pytest.mark.parametrize('cfg', augment_cfgs) +def test_binary_augment(cfg): + + cfg_ = dict(num_classes=1, **cfg) + augs = Augments(cfg_) + + imgs = torch.randn(4, 3, 32, 32) + labels = torch.randint(0, 2, (4, 1)).float() + + mixed_imgs, mixed_labels = augs(imgs, labels) + assert mixed_imgs.shape == torch.Size((4, 3, 32, 32)) + assert mixed_labels.shape == torch.Size((4, 1)) + + +@pytest.mark.parametrize('cfg', augment_cfgs) +def test_multilabel_augment(cfg): + + cfg_ = dict(num_classes=10, **cfg) + augs = Augments(cfg_) + + imgs = torch.randn(4, 3, 32, 32) + labels = torch.randint(0, 2, (4, 10)).float() + + mixed_imgs, mixed_labels = augs(imgs, labels) + assert mixed_imgs.shape == torch.Size((4, 3, 32, 32)) + assert mixed_labels.shape == torch.Size((4, 10)) diff --git a/tests/test_models/test_utils/test_position_encoding.py b/tests/test_models/test_utils/test_position_encoding.py new file mode 100644 index 00000000000..feb171c2496 --- /dev/null +++ b/tests/test_models/test_utils/test_position_encoding.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmcls.models.utils import ConditionalPositionEncoding + + +def test_conditional_position_encoding_module(): + CPE = ConditionalPositionEncoding(in_channels=32, embed_dims=32, stride=2) + outs = CPE(torch.randn(1, 3136, 32), (56, 56)) + assert outs.shape == torch.Size([1, 784, 32]) diff --git a/tests/test_utils/test_setup_env.py b/tests/test_utils/test_setup_env.py new file mode 100644 index 00000000000..2679dbbf5e2 --- /dev/null +++ b/tests/test_utils/test_setup_env.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
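A quick note on the shapes in the ConditionalPositionEncoding test above: the module takes a flattened token sequence together with the spatial size it was flattened from, and its stride-2 projection halves each spatial dimension, which is where 3136 tokens in and 784 tokens out come from. The snippet below just restates that arithmetic with the same module and import as the test.

# Shape bookkeeping behind the CPE test: 56 * 56 = 3136 input tokens, and a
# stride-2 projection gives (56 // 2) * (56 // 2) = 784 output tokens.
import torch

from mmcls.models.utils import ConditionalPositionEncoding

H, W, stride = 56, 56, 2
tokens = torch.randn(1, H * W, 32)  # (batch, num_tokens, embed_dims)
cpe = ConditionalPositionEncoding(in_channels=32, embed_dims=32, stride=stride)
out = cpe(tokens, (H, W))
assert out.shape == (1, (H // stride) * (W // stride), 32)  # (1, 784, 32)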
+import multiprocessing as mp +import os +import platform + +import cv2 +from mmcv import Config + +from mmcls.utils import setup_multi_processes + + +def test_setup_multi_processes(): + # temp save system setting + sys_start_mehod = mp.get_start_method(allow_none=True) + sys_cv_threads = cv2.getNumThreads() + # pop and temp save system env vars + sys_omp_threads = os.environ.pop('OMP_NUM_THREADS', default=None) + sys_mkl_threads = os.environ.pop('MKL_NUM_THREADS', default=None) + + # test config without setting env + config = dict(data=dict(workers_per_gpu=2)) + cfg = Config(config) + setup_multi_processes(cfg) + assert os.getenv('OMP_NUM_THREADS') == '1' + assert os.getenv('MKL_NUM_THREADS') == '1' + # when set to 0, the num threads will be 1 + assert cv2.getNumThreads() == 1 + if platform.system() != 'Windows': + assert mp.get_start_method() == 'fork' + + # test num workers <= 1 + os.environ.pop('OMP_NUM_THREADS') + os.environ.pop('MKL_NUM_THREADS') + config = dict(data=dict(workers_per_gpu=0)) + cfg = Config(config) + setup_multi_processes(cfg) + assert 'OMP_NUM_THREADS' not in os.environ + assert 'MKL_NUM_THREADS' not in os.environ + + # test manually set env var + os.environ['OMP_NUM_THREADS'] = '4' + config = dict(data=dict(workers_per_gpu=2)) + cfg = Config(config) + setup_multi_processes(cfg) + assert os.getenv('OMP_NUM_THREADS') == '4' + + # test manually set opencv threads and mp start method + config = dict( + data=dict(workers_per_gpu=2), + opencv_num_threads=4, + mp_start_method='spawn') + cfg = Config(config) + setup_multi_processes(cfg) + assert cv2.getNumThreads() == 4 + assert mp.get_start_method() == 'spawn' + + # revert setting to avoid affecting other programs + if sys_start_mehod: + mp.set_start_method(sys_start_mehod, force=True) + cv2.setNumThreads(sys_cv_threads) + if sys_omp_threads: + os.environ['OMP_NUM_THREADS'] = sys_omp_threads + else: + os.environ.pop('OMP_NUM_THREADS') + if sys_mkl_threads: + os.environ['MKL_NUM_THREADS'] = sys_mkl_threads + else: + os.environ.pop('MKL_NUM_THREADS') diff --git a/tools/convert_models/efficientnet_to_mmcls.py b/tools/convert_models/efficientnet_to_mmcls.py new file mode 100644 index 00000000000..cf889960996 --- /dev/null +++ b/tools/convert_models/efficientnet_to_mmcls.py @@ -0,0 +1,214 @@ +import argparse +import os + +import numpy as np +import torch +from mmcv.runner import Sequential +from tensorflow.python.training import py_checkpoint_reader + +from mmcls.models.backbones.efficientnet import EfficientNet + + +def tf2pth(v): + if v.ndim == 4: + return np.ascontiguousarray(v.transpose(3, 2, 0, 1)) + elif v.ndim == 2: + return np.ascontiguousarray(v.transpose()) + return v + + +def read_ckpt(ckpt): + reader = py_checkpoint_reader.NewCheckpointReader(ckpt) + weights = { + n: torch.as_tensor(tf2pth(reader.get_tensor(n))) + for (n, _) in reader.get_variable_to_shape_map().items() + } + return weights + + +def map_key(weight): + m = dict() + has_expand_conv = set() + is_MBConv = set() + max_idx = 0 + name = None + for k, v in weight.items(): + seg = k.split('/') + if len(seg) == 1: + continue + if 'edgetpu' in seg[0]: + name = 'e' + seg[0][21:].lower() + else: + name = seg[0][13:] + if seg[2] == 'tpu_batch_normalization_2': + has_expand_conv.add(seg[1]) + if seg[1].startswith('blocks_'): + idx = int(seg[1][7:]) + 1 + max_idx = max(max_idx, idx) + if 'depthwise' in k: + is_MBConv.add(seg[1]) + + model = EfficientNet(name) + idx2key = [] + for idx, module in enumerate(model.layers): + if isinstance(module, Sequential): + for j 
in range(len(module)): + idx2key.append('{}.{}'.format(idx, j)) + else: + idx2key.append('{}'.format(idx)) + + for k, v in weight.items(): + + if 'Exponential' in k or 'RMS' in k: + continue + + seg = k.split('/') + if len(seg) == 1: + continue + if seg[2] == 'depthwise_conv2d': + v = v.transpose(1, 0) + + if seg[1] == 'stem': + prefix = 'backbone.layers.{}'.format(idx2key[0]) + mapping = { + 'conv2d/kernel': 'conv.weight', + 'tpu_batch_normalization/beta': 'bn.bias', + 'tpu_batch_normalization/gamma': 'bn.weight', + 'tpu_batch_normalization/moving_mean': 'bn.running_mean', + 'tpu_batch_normalization/moving_variance': 'bn.running_var', + } + suffix = mapping['/'.join(seg[2:])] + m[prefix + '.' + suffix] = v + + elif seg[1].startswith('blocks_'): + idx = int(seg[1][7:]) + 1 + prefix = '.'.join(['backbone', 'layers', idx2key[idx]]) + if seg[1] not in is_MBConv: + mapping = { + 'conv2d/kernel': + 'conv1.conv.weight', + 'tpu_batch_normalization/gamma': + 'conv1.bn.weight', + 'tpu_batch_normalization/beta': + 'conv1.bn.bias', + 'tpu_batch_normalization/moving_mean': + 'conv1.bn.running_mean', + 'tpu_batch_normalization/moving_variance': + 'conv1.bn.running_var', + 'conv2d_1/kernel': + 'conv2.conv.weight', + 'tpu_batch_normalization_1/gamma': + 'conv2.bn.weight', + 'tpu_batch_normalization_1/beta': + 'conv2.bn.bias', + 'tpu_batch_normalization_1/moving_mean': + 'conv2.bn.running_mean', + 'tpu_batch_normalization_1/moving_variance': + 'conv2.bn.running_var', + } + else: + + base_mapping = { + 'depthwise_conv2d/depthwise_kernel': + 'depthwise_conv.conv.weight', + 'se/conv2d/kernel': 'se.conv1.conv.weight', + 'se/conv2d/bias': 'se.conv1.conv.bias', + 'se/conv2d_1/kernel': 'se.conv2.conv.weight', + 'se/conv2d_1/bias': 'se.conv2.conv.bias' + } + + if seg[1] not in has_expand_conv: + mapping = { + 'conv2d/kernel': + 'linear_conv.conv.weight', + 'tpu_batch_normalization/beta': + 'depthwise_conv.bn.bias', + 'tpu_batch_normalization/gamma': + 'depthwise_conv.bn.weight', + 'tpu_batch_normalization/moving_mean': + 'depthwise_conv.bn.running_mean', + 'tpu_batch_normalization/moving_variance': + 'depthwise_conv.bn.running_var', + 'tpu_batch_normalization_1/beta': + 'linear_conv.bn.bias', + 'tpu_batch_normalization_1/gamma': + 'linear_conv.bn.weight', + 'tpu_batch_normalization_1/moving_mean': + 'linear_conv.bn.running_mean', + 'tpu_batch_normalization_1/moving_variance': + 'linear_conv.bn.running_var', + } + else: + mapping = { + 'depthwise_conv2d/depthwise_kernel': + 'depthwise_conv.conv.weight', + 'conv2d/kernel': + 'expand_conv.conv.weight', + 'conv2d_1/kernel': + 'linear_conv.conv.weight', + 'tpu_batch_normalization/beta': + 'expand_conv.bn.bias', + 'tpu_batch_normalization/gamma': + 'expand_conv.bn.weight', + 'tpu_batch_normalization/moving_mean': + 'expand_conv.bn.running_mean', + 'tpu_batch_normalization/moving_variance': + 'expand_conv.bn.running_var', + 'tpu_batch_normalization_1/beta': + 'depthwise_conv.bn.bias', + 'tpu_batch_normalization_1/gamma': + 'depthwise_conv.bn.weight', + 'tpu_batch_normalization_1/moving_mean': + 'depthwise_conv.bn.running_mean', + 'tpu_batch_normalization_1/moving_variance': + 'depthwise_conv.bn.running_var', + 'tpu_batch_normalization_2/beta': + 'linear_conv.bn.bias', + 'tpu_batch_normalization_2/gamma': + 'linear_conv.bn.weight', + 'tpu_batch_normalization_2/moving_mean': + 'linear_conv.bn.running_mean', + 'tpu_batch_normalization_2/moving_variance': + 'linear_conv.bn.running_var', + } + mapping.update(base_mapping) + suffix = mapping['/'.join(seg[2:])] + m[prefix + 
'.' + suffix] = v + elif seg[1] == 'head': + seq_key = idx2key[max_idx + 1] + mapping = { + 'conv2d/kernel': + 'backbone.layers.{}.conv.weight'.format(seq_key), + 'tpu_batch_normalization/beta': + 'backbone.layers.{}.bn.bias'.format(seq_key), + 'tpu_batch_normalization/gamma': + 'backbone.layers.{}.bn.weight'.format(seq_key), + 'tpu_batch_normalization/moving_mean': + 'backbone.layers.{}.bn.running_mean'.format(seq_key), + 'tpu_batch_normalization/moving_variance': + 'backbone.layers.{}.bn.running_var'.format(seq_key), + 'dense/kernel': + 'head.fc.weight', + 'dense/bias': + 'head.fc.bias' + } + key = mapping['/'.join(seg[2:])] + if name.startswith('e') and 'fc' in key: + v = v[1:] + m[key] = v + return m + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('infile', type=str, help='Path to the ckpt.') + parser.add_argument('outfile', type=str, help='Output file.') + args = parser.parse_args() + assert args.outfile + + outdir = os.path.dirname(os.path.abspath(args.outfile)) + if not os.path.exists(outdir): + os.makedirs(outdir) + weights = read_ckpt(args.infile) + weights = map_key(weights) + torch.save(weights, args.outfile) diff --git a/tools/convert_models/twins2mmcls.py b/tools/convert_models/twins2mmcls.py new file mode 100644 index 00000000000..e0ea04c27d4 --- /dev/null +++ b/tools/convert_models/twins2mmcls.py @@ -0,0 +1,73 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp +from collections import OrderedDict + +import mmcv +import torch +from mmcv.runner import CheckpointLoader + + +def convert_twins(args, ckpt): + + new_ckpt = OrderedDict() + + for k, v in list(ckpt.items()): + new_v = v + if k.startswith('head'): + new_k = k.replace('head.', 'head.fc.') + new_ckpt[new_k] = new_v + continue + elif k.startswith('patch_embeds'): + if 'proj.' in k: + new_k = k.replace('proj.', 'projection.') + else: + new_k = k + elif k.startswith('blocks'): + k = k.replace('blocks', 'stages') + # Union + if 'mlp.fc1' in k: + new_k = k.replace('mlp.fc1', 'ffn.layers.0.0') + elif 'mlp.fc2' in k: + new_k = k.replace('mlp.fc2', 'ffn.layers.1') + + else: + new_k = k + new_k = new_k.replace('blocks.', 'layers.') + elif k.startswith('pos_block'): + new_k = k.replace('pos_block', 'position_encodings') + if 'proj.0.' in new_k: + new_k = new_k.replace('proj.0.', 'proj.') + elif k.startswith('norm'): + new_k = k.replace('norm', 'norm_after_stage3') + else: + new_k = k + new_k = 'backbone.' + new_k + new_ckpt[new_k] = new_v + return new_ckpt + + +def main(): + parser = argparse.ArgumentParser( + description='Convert keys in timm pretrained vit models to ' + 'MMClassification style.') + parser.add_argument('src', help='src model path or url') + # The dst path must be a full path of the new checkpoint. + parser.add_argument('dst', help='save path') + args = parser.parse_args() + + checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu') + + if 'state_dict' in checkpoint: + # timm checkpoint + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + weight = convert_twins(args, state_dict) + mmcv.mkdir_or_exist(osp.dirname(args.dst)) + torch.save(weight, args.dst) + + +if __name__ == '__main__': + main() diff --git a/tools/deployment/pytorch2mlmodel.py b/tools/deployment/pytorch2mlmodel.py new file mode 100644 index 00000000000..814cbe94e75 --- /dev/null +++ b/tools/deployment/pytorch2mlmodel.py @@ -0,0 +1,160 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
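After running either of the converters above, a quick sanity check is to load the remapped weights into the corresponding MMClassification model with a non-strict load and inspect the leftovers. The config path and checkpoint name below are hypothetical placeholders; substitute the config that matches the converted model.

# Hedged sanity-check sketch; 'CONFIG_FILE' and 'CONVERTED_WEIGHTS.pth' are
# hypothetical placeholders, not files guaranteed to exist in the repository.
import torch
from mmcv import Config

from mmcls.models import build_classifier

cfg = Config.fromfile('CONFIG_FILE')  # e.g. an EfficientNet or Twins config
cfg.model.pretrained = None
model = build_classifier(cfg.model)

state_dict = torch.load('CONVERTED_WEIGHTS.pth', map_location='cpu')
missing, unexpected = model.load_state_dict(state_dict, strict=False)
print('missing keys:', missing)        # ideally empty
print('unexpected keys:', unexpected)  # ideally empty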
+import argparse +import os +import os.path as osp +import warnings +from functools import partial + +import mmcv +import numpy as np +import torch +from mmcv.runner import load_checkpoint +from torch import nn + +from mmcls.models import build_classifier + +torch.manual_seed(3) + +try: + import coremltools as ct +except ImportError: + raise ImportError('Please install coremltools to enable output file.') + + +def _demo_mm_inputs(input_shape: tuple, num_classes: int): + """Create a superset of inputs needed to run test or train batches. + + Args: + input_shape (tuple): + input batch dimensions + num_classes (int): + number of semantic classes + """ + (N, C, H, W) = input_shape + rng = np.random.RandomState(0) + imgs = rng.rand(*input_shape) + gt_labels = rng.randint( + low=0, high=num_classes, size=(N, 1)).astype(np.uint8) + mm_inputs = { + 'imgs': torch.FloatTensor(imgs).requires_grad_(False), + 'gt_labels': torch.LongTensor(gt_labels), + } + return mm_inputs + + +def pytorch2mlmodel(model: nn.Module, input_shape: tuple, output_file: str, + add_norm: bool, norm: dict): + """Export Pytorch model to mlmodel format that can be deployed in apple + devices through torch.jit.trace and the coremltools library. + + Optionally, embed the normalization step as a layer to the model. + + Args: + model (nn.Module): Pytorch model we want to export. + input_shape (tuple): Use this input shape to construct + the corresponding dummy input and execute the model. + show (bool): Whether print the computation graph. Default: False. + output_file (string): The path to where we store the output + TorchScript model. + add_norm (bool): Whether to embed the normalization layer to the + output model. + norm (dict): image normalization config for embedding it as a layer + to the output model. + """ + model.cpu().eval() + + num_classes = model.head.num_classes + mm_inputs = _demo_mm_inputs(input_shape, num_classes) + + imgs = mm_inputs.pop('imgs') + img_list = [img[None, :] for img in imgs] + model.forward = partial(model.forward, img_metas={}, return_loss=False) + + with torch.no_grad(): + trace_model = torch.jit.trace(model, img_list[0]) + save_dir, _ = osp.split(output_file) + if save_dir: + os.makedirs(save_dir, exist_ok=True) + + if add_norm: + means, stds = norm.mean, norm.std + if stds.count(stds[0]) != len(stds): + warnings.warn(f'Image std from config is {stds}. However, ' + 'current version of coremltools (5.1) uses a ' + 'global std rather than the channel-specific ' + 'values that torchvision uses. A mean will be ' + 'taken but this might tamper with the resulting ' + 'model\'s predictions. 
For more details refer ' + 'to the coreml docs on ImageType pre-processing') + scale = np.mean(stds) + else: + scale = stds[0] + + bias = [-mean / scale for mean in means] + image_input = ct.ImageType( + name='input_1', + shape=input_shape, + scale=1 / scale, + bias=bias, + color_layout='RGB', + channel_first=True) + + coreml_model = ct.convert(trace_model, inputs=[image_input]) + coreml_model.save(output_file) + else: + coreml_model = ct.convert( + trace_model, inputs=[ct.TensorType(shape=input_shape)]) + coreml_model.save(output_file) + + print(f'Successfully exported coreml model: {output_file}') + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert MMCls to MlModel format for apple devices') + parser.add_argument('config', help='test config file path') + parser.add_argument('--checkpoint', help='checkpoint file', type=str) + parser.add_argument('--output-file', type=str, default='model.mlmodel') + parser.add_argument( + '--shape', + type=int, + nargs='+', + default=[224, 224], + help='input image size') + parser.add_argument( + '--add-norm-layer', + action='store_true', + help='embed normalization layer to deployed model') + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + + if len(args.shape) == 1: + input_shape = (1, 3, args.shape[0], args.shape[0]) + elif len(args.shape) == 2: + input_shape = ( + 1, + 3, + ) + tuple(args.shape) + else: + raise ValueError('invalid input shape') + + cfg = mmcv.Config.fromfile(args.config) + cfg.model.pretrained = None + + # build the model and load checkpoint + classifier = build_classifier(cfg.model) + + if args.checkpoint: + load_checkpoint(classifier, args.checkpoint, map_location='cpu') + + # convert model to mlmodel file + pytorch2mlmodel( + classifier, + input_shape, + output_file=args.output_file, + add_norm=args.add_norm_layer, + norm=cfg.img_norm_cfg) diff --git a/tools/kfold-cross-valid.py b/tools/kfold-cross-valid.py new file mode 100644 index 00000000000..a881316f566 --- /dev/null +++ b/tools/kfold-cross-valid.py @@ -0,0 +1,355 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import copy +import os +import os.path as osp +import time +from datetime import datetime +from pathlib import Path + +import mmcv +import torch +from mmcv import Config, DictAction +from mmcv.runner import get_dist_info, init_dist + +from mmcls import __version__ +from mmcls.apis import init_random_seed, set_random_seed, train_model +from mmcls.datasets import build_dataset +from mmcls.models import build_classifier +from mmcls.utils import collect_env, get_root_logger, load_json_log + +TEST_METRICS = ('precision', 'recall', 'f1_score', 'support', 'mAP', 'CP', + 'CR', 'CF1', 'OP', 'OR', 'OF1', 'accuracy') + +prog_description = """K-Fold cross-validation. 
+ +To start a 5-fold cross-validation experiment: + python tools/kfold-cross-valid.py $CONFIG --num-splits 5 + +To resume a 5-fold cross-validation from an interrupted experiment: + python tools/kfold-cross-valid.py $CONFIG --num-splits 5 --resume-from work_dirs/fold2/latest.pth + +To summarize a 5-fold cross-validation: + python tools/kfold-cross-valid.py $CONFIG --num-splits 5 --summary +""" # noqa: E501 + + +def parse_args(): + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=prog_description) + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--num-splits', type=int, help='The number of all folds.') + parser.add_argument( + '--fold', + type=int, + help='The fold used to do validation. ' + 'If specify, only do an experiment of the specified fold.') + parser.add_argument( + '--summary', + action='store_true', + help='Summarize the k-fold cross-validation results.') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--resume-from', help='the checkpoint file to resume from') + parser.add_argument( + '--no-validate', + action='store_true', + help='whether not to evaluate the checkpoint during training') + group_gpus = parser.add_mutually_exclusive_group() + group_gpus.add_argument('--device', help='device used for training') + group_gpus.add_argument( + '--gpus', + type=int, + help='number of gpus to use ' + '(only applicable to non-distributed training)') + group_gpus.add_argument( + '--gpu-ids', + type=int, + nargs='+', + help='ids of gpus to use ' + '(only applicable to non-distributed training)') + parser.add_argument('--seed', type=int, default=None, help='random seed') + parser.add_argument( + '--deterministic', + action='store_true', + help='whether to set deterministic options for CUDNN backend.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + return args + + +def copy_config(old_cfg): + """deepcopy a Config object.""" + new_cfg = Config() + _cfg_dict = copy.deepcopy(old_cfg._cfg_dict) + _filename = copy.deepcopy(old_cfg._filename) + _text = copy.deepcopy(old_cfg._text) + super(Config, new_cfg).__setattr__('_cfg_dict', _cfg_dict) + super(Config, new_cfg).__setattr__('_filename', _filename) + super(Config, new_cfg).__setattr__('_text', _text) + return new_cfg + + +def train_single_fold(args, cfg, fold, distributed, seed): + # create the work_dir for the fold + work_dir = osp.join(cfg.work_dir, f'fold{fold}') + cfg.work_dir = work_dir + + # create work_dir + mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) + + # wrap the dataset cfg + train_dataset = dict( + type='KFoldDataset', + fold=fold, + dataset=cfg.data.train, + num_splits=args.num_splits, + seed=seed, + ) + val_dataset = dict( + type='KFoldDataset', + fold=fold, + # Use the same dataset with training. + dataset=copy.deepcopy(cfg.data.train), + num_splits=args.num_splits, + seed=seed, + test_mode=True, + ) + val_dataset['dataset']['pipeline'] = cfg.data.val.pipeline + cfg.data.train = train_dataset + cfg.data.val = val_dataset + cfg.data.test = val_dataset + + # dump config + stem, suffix = osp.basename(args.config).rsplit('.', 1) + cfg.dump(osp.join(cfg.work_dir, f'{stem}_fold{fold}.{suffix}')) + # init the logger before other steps + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + log_file = osp.join(cfg.work_dir, f'{timestamp}.log') + logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) + + # init the meta dict to record some important information such as + # environment info and seed, which will be logged + meta = dict() + # log env info + env_info_dict = collect_env() + env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) + dash_line = '-' * 60 + '\n' + logger.info('Environment info:\n' + dash_line + env_info + '\n' + + dash_line) + meta['env_info'] = env_info + + # log some basic info + logger.info(f'Distributed training: {distributed}') + logger.info(f'Config:\n{cfg.pretty_text}') + logger.info( + f'-------- Cross-validation: [{fold+1}/{args.num_splits}] -------- ') + + # set random seeds + # Use different seed in different folds + logger.info(f'Set random seed to {seed + fold}, ' + f'deterministic: {args.deterministic}') + set_random_seed(seed + fold, deterministic=args.deterministic) + cfg.seed = seed + fold + meta['seed'] = seed + fold + + model = build_classifier(cfg.model) + model.init_weights() + + datasets = [build_dataset(cfg.data.train)] + if len(cfg.workflow) == 2: + val_dataset = copy.deepcopy(cfg.data.val) + val_dataset.pipeline = cfg.data.train.pipeline + datasets.append(build_dataset(val_dataset)) + meta.update( + dict( + mmcls_version=__version__, + config=cfg.pretty_text, + CLASSES=datasets[0].CLASSES, + kfold=dict(fold=fold, num_splits=args.num_splits))) + # add an attribute for visualization convenience + train_model( + model, + datasets, + cfg, + distributed=distributed, + validate=(not args.no_validate), + timestamp=timestamp, + device='cpu' if args.device == 'cpu' else 'cuda', + meta=meta) 
+ + +def summary(args, cfg): + summary = dict() + for fold in range(args.num_splits): + work_dir = Path(cfg.work_dir) / f'fold{fold}' + + # Find the latest training log + log_files = list(work_dir.glob('*.log.json')) + if len(log_files) == 0: + continue + log_file = sorted(log_files)[-1] + + date = datetime.fromtimestamp(log_file.lstat().st_mtime) + summary[fold] = {'date': date.strftime('%Y-%m-%d %H:%M:%S')} + + # Find the latest eval log + json_log = load_json_log(log_file) + epochs = sorted(list(json_log.keys())) + eval_log = {} + + def is_metric_key(key): + for metric in TEST_METRICS: + if metric in key: + return True + return False + + for epoch in epochs[::-1]: + if any(is_metric_key(k) for k in json_log[epoch].keys()): + eval_log = json_log[epoch] + break + + summary[fold]['epoch'] = epoch + summary[fold]['metric'] = { + k: v[0] # the value is a list with only one item. + for k, v in eval_log.items() if is_metric_key(k) + } + show_summary(args, summary) + + +def show_summary(args, summary_data): + try: + from rich.console import Console + from rich.table import Table + except ImportError: + raise ImportError('Please run `pip install rich` to install ' + 'package `rich` to draw the table.') + + console = Console() + table = Table(title=f'{args.num_splits}-fold Cross-validation Summary') + table.add_column('Fold') + metrics = summary_data[0]['metric'].keys() + for metric in metrics: + table.add_column(metric) + table.add_column('Epoch') + table.add_column('Date') + + for fold in range(args.num_splits): + row = [f'{fold+1}'] + if fold not in summary_data: + table.add_row(*row) + continue + for metric in metrics: + metric_value = summary_data[fold]['metric'].get(metric, '') + + def format_value(value): + if isinstance(value, float): + return f'{value:.2f}' + if isinstance(value, (list, tuple)): + return str([format_value(i) for i in value]) + else: + return str(value) + + row.append(format_value(metric_value)) + row.append(str(summary_data[fold]['epoch'])) + row.append(summary_data[fold]['date']) + table.add_row(*row) + + console.print(table) + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + + if args.summary: + summary(args, cfg) + return + + # resume from the previous experiment + if args.resume_from is not None: + cfg.resume_from = args.resume_from + resume_kfold = torch.load(cfg.resume_from).get('meta', + {}).get('kfold', None) + if resume_kfold is None: + raise RuntimeError( + 'No "meta" key in checkpoints or no "kfold" in the meta dict. ' + 'Please check if the resume checkpoint from a k-fold ' + 'cross-valid experiment.') + resume_fold = resume_kfold['fold'] + assert args.num_splits == resume_kfold['num_splits'] + else: + resume_fold = 0 + + if args.gpu_ids is not None: + cfg.gpu_ids = args.gpu_ids + else: + cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) + + # init distributed env first, since logger depends on the dist info. 
+ if args.launcher == 'none': + distributed = False + else: + distributed = True + init_dist(args.launcher, **cfg.dist_params) + _, world_size = get_dist_info() + cfg.gpu_ids = range(world_size) + + # init a unified random seed + seed = init_random_seed(args.seed) + + # create work_dir + mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) + + if args.fold is not None: + folds = [args.fold] + else: + folds = range(resume_fold, args.num_splits) + + for fold in folds: + cfg_ = copy_config(cfg) + if fold != resume_fold: + cfg_.resume_from = None + train_single_fold(args, cfg_, fold, distributed, seed) + + if args.fold is None: + summary(args, cfg) + + +if __name__ == '__main__': + main() diff --git a/tools/test.py b/tools/test.py index 1eafc20470e..4851b6cbea4 100644 --- a/tools/test.py +++ b/tools/test.py @@ -14,6 +14,7 @@ from mmcls.apis import multi_gpu_test, single_gpu_test from mmcls.datasets import build_dataloader, build_dataset from mmcls.models import build_classifier +from mmcls.utils import setup_multi_processes # TODO import `wrap_fp16_model` from mmcv and delete them from mmcls try: @@ -87,17 +88,20 @@ def parse_args(): action=DictAction, help='custom options for show_result. key-value pair in xxx=yyy.' 'Check available options in `model.show_result`.') + parser.add_argument( + '--device', default=None, help='device used for testing. (Deprecated)') + parser.add_argument( + '--gpu-ids', + type=int, + nargs='+', + help='ids of gpus to use ' + '(only applicable to non-distributed testing)') parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) - parser.add_argument( - '--device', - choices=['cpu', 'cuda'], - default='cuda', - help='device used for testing') args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) @@ -110,6 +114,15 @@ def parse_args(): warnings.warn('--options is deprecated in favor of --cfg-options') args.cfg_options = args.options + if args.device: + warnings.warn( + '--device is deprecated. To use cpu to test, please ' + 'refers to https://mmclassification.readthedocs.io/en/latest/' + 'getting_started.html#inference-with-pretrained-models') + + assert args.metrics or args.out, \ + 'Please specify at least one of output path and evaluation metrics.' + return args @@ -119,18 +132,29 @@ def main(): cfg = mmcv.Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) + + # set multi-process settings + setup_multi_processes(cfg) + # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None cfg.data.test.test_mode = True - assert args.metrics or args.out, \ - 'Please specify at least one of output path and evaluation metrics.' + if args.gpu_ids is not None: + cfg.gpu_ids = args.gpu_ids + else: + cfg.gpu_ids = range(1) # init distributed env first, since logger depends on the dist info. 
if args.launcher == 'none': distributed = False + if len(cfg.gpu_ids) > 1: + warnings.warn(f'The gpu-ids is reset from {cfg.gpu_ids} to ' + f'{cfg.gpu_ids[0:1]} to avoid potential error in ' + 'non-distribute testing time.') + cfg.gpu_ids = cfg.gpu_ids[0:1] else: distributed = True init_dist(args.launcher, **cfg.dist_params) @@ -166,7 +190,11 @@ def main(): if args.device == 'cpu': model = model.cpu() else: - model = MMDataParallel(model, device_ids=[0]) + model = MMDataParallel(model, device_ids=cfg.gpu_ids) + if not model.device_ids: + assert mmcv.digit_version(mmcv.__version__) >= (1, 4, 4), \ + 'To test with CPU, please confirm your mmcv version ' \ + 'is not lower than v1.4.4' model.CLASSES = CLASSES show_kwargs = {} if args.show_options is None else args.show_options outputs = single_gpu_test(model, data_loader, args.show, args.show_dir, diff --git a/tools/train.py b/tools/train.py index 2880f8c77e2..1a574563326 100644 --- a/tools/train.py +++ b/tools/train.py @@ -15,7 +15,7 @@ from mmcls.apis import init_random_seed, set_random_seed, train_model from mmcls.datasets import build_dataset from mmcls.models import build_classifier -from mmcls.utils import collect_env, get_root_logger +from mmcls.utils import collect_env, get_root_logger, setup_multi_processes def parse_args(): @@ -29,7 +29,8 @@ def parse_args(): action='store_true', help='whether not to evaluate the checkpoint during training') group_gpus = parser.add_mutually_exclusive_group() - group_gpus.add_argument('--device', help='device used for training') + group_gpus.add_argument( + '--device', help='device used for training. (Deprecated)') group_gpus.add_argument( '--gpus', type=int, @@ -81,6 +82,12 @@ def parse_args(): warnings.warn('--options is deprecated in favor of --cfg-options') args.cfg_options = args.options + if args.device: + warnings.warn( + '--device is deprecated. To use cpu to train, please ' + 'refers to https://mmclassification.readthedocs.io/en/latest/' + 'getting_started.html#train-a-model') + return args @@ -90,6 +97,10 @@ def main(): cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) + + # set multi-process settings + setup_multi_processes(cfg) + # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True
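Both tools/train.py and tools/test.py now call setup_multi_processes(cfg) before doing any real work. The behaviour pinned down by tests/test_utils/test_setup_env.py roughly amounts to the sketch below; it is an approximation inferred from those assertions, not the actual mmcls.utils implementation, and details such as exactly when the start method is forced may differ.

# Approximation of the behaviour checked in test_setup_env.py; not the real
# mmcls.utils.setup_multi_processes source.
import multiprocessing as mp
import os
import platform

import cv2
from mmcv import Config


def setup_multi_processes_sketch(cfg):
    # Prefer 'fork' dataloader workers on non-Windows platforms unless overridden.
    if platform.system() != 'Windows':
        mp.set_start_method(cfg.get('mp_start_method', 'fork'), force=True)

    # Silence OpenCV's internal thread pool by default (0 ends up as 1 thread).
    cv2.setNumThreads(cfg.get('opencv_num_threads', 0))

    # Cap OMP/MKL threads when several dataloader workers are used, but never
    # override values the user has already exported.
    if cfg.data.get('workers_per_gpu', 1) > 1:
        os.environ.setdefault('OMP_NUM_THREADS', '1')
        os.environ.setdefault('MKL_NUM_THREADS', '1')


# Mirrors the first case in the test: OMP/MKL become '1', OpenCV uses 1 thread.
setup_multi_processes_sketch(Config(dict(data=dict(workers_per_gpu=2))))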