diff --git a/.circleci/test.yml b/.circleci/test.yml
index 3984767a12..efa9342303 100644
--- a/.circleci/test.yml
+++ b/.circleci/test.yml
@@ -44,7 +44,6 @@ jobs:
- run:
name: Install Libraries
command: |
- sudo add-apt-repository ppa:savoury1/ffmpeg4
sudo apt-get update
sudo apt-get upgrade
sudo apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libturbojpeg pkg-config
@@ -64,7 +63,7 @@ jobs:
command: |
pip install git+ssh://git@github.com/open-mmlab/mmengine.git@main
pip install -U openmim
- mim install 'mmcv >= 2.0.0rc1'
+ mim install 'mmcv >= 2.0.0'
pip install git+ssh://git@github.com/open-mmlab/mmdetection.git@dev-3.x
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
pip install -r requirements.txt
@@ -99,10 +98,10 @@ jobs:
type: string
cuda:
type: enum
- enum: ["10.1", "10.2", "11.1"]
+ enum: ["11.0"]
cudnn:
type: integer
- default: 7
+ default: 8
machine:
image: ubuntu-2004-cuda-11.4:202110-01
# docker_layer_caching: true
@@ -115,33 +114,24 @@ jobs:
docker build .circleci/docker -t mmaction:gpu --build-arg PYTORCH=<< parameters.torch >> --build-arg CUDA=<< parameters.cuda >> --build-arg CUDNN=<< parameters.cudnn >>
docker run --gpus all -t -d -v /home/circleci/project:/mmaction -w /mmaction --name mmaction mmaction:gpu
docker exec mmaction apt-get update
+ docker exec mmaction pip install "numpy==1.23"
docker exec mmaction apt-get upgrade -y
docker exec mmaction apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libturbojpeg pkg-config
docker exec mmaction apt-get install -y libavdevice-dev libavfilter-dev libopus-dev libvpx-dev libsrtp2-dev libsndfile1
+ - run:
+ name: Install PytorchVideo and timm
+ command: |
+ docker exec mmaction pip install timm
+ docker exec mmaction python -m pip install pytorchvideo
- run:
name: Install mmaction dependencies
command: |
docker exec mmaction pip install git+https://git@github.com/open-mmlab/mmengine.git@main
docker exec mmaction pip install -U openmim
- docker exec mmaction mim install 'mmcv >= 2.0.0rc1'
+ docker exec mmaction mim install 'mmcv >= 2.0.0'
docker exec mmaction pip install git+https://git@github.com/open-mmlab/mmdetection.git@dev-3.x
docker exec mmaction pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
docker exec mmaction pip install -r requirements.txt
- - when:
- condition:
- equal: [ "1.8.1", << parameters.torch >> ]
- steps:
- - run: docker exec mmaction pip install timm
- - when:
- condition:
- equal: [ "1.6.0", << parameters.torch >> ]
- steps:
- - run: docker exec mmaction pip install timm==0.6.7
- - when:
- condition:
- equal: [ "10.2", << parameters.cuda >> ]
- steps:
- - run: docker exec mmaction python -m pip install pytorchvideo
- run:
name: Build and install
command: |
@@ -160,7 +150,7 @@ workflows:
branches:
ignore:
- dev-1.x
- - 1.x
+ - main
pr_stage_test:
when:
not:
@@ -172,7 +162,7 @@ workflows:
branches:
ignore:
- dev-1.x
- - 1.x
+ - main
- build_cpu:
name: minimum_version_cpu
torch: 1.6.0
@@ -196,7 +186,7 @@ workflows:
torch: 1.8.1
# Use double quotation mark to explicitly specify its type
# as string instead of number
- cuda: "10.2"
+ cuda: "11.0"
requires:
- hold
merge_stage_test:
@@ -206,11 +196,12 @@ workflows:
jobs:
- build_cuda:
name: minimum_version_gpu
- torch: 1.6.0
+ torch: 1.7.1
# Use double quotation mark to explicitly specify its type
# as string instead of number
- cuda: "10.1"
+ cuda: "11.0"
filters:
branches:
only:
- dev-1.x
+ - main
diff --git a/.github/ISSUE_TEMPLATE/1-bug-report.yml b/.github/ISSUE_TEMPLATE/1-bug-report.yml
new file mode 100644
index 0000000000..809a23e3c9
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/1-bug-report.yml
@@ -0,0 +1,100 @@
+name: "🐞 Bug Report"
+description: "Create a report to help us reproduce and fix the bug"
+labels: Bug
+title: "[Bug] "
+
+body:
+ - type: markdown
+ attributes:
+ value: |
+ If you have already identified the reason, we strongly appreciate you creating a new PR to fix it [here](https://github.com/open-mmlab/mmaction2/pulls)!
+ If this issue is about installing MMCV, please file an issue at [MMCV](https://github.com/open-mmlab/mmcv/issues/new/choose).
+ If you need our help, please fill in as much of the following form as you're able.
+
+ **The less clear the description, the longer it will take to solve it.**
+
+ - type: dropdown
+ id: version
+ attributes:
+ label: Branch
+ description: Which branch/version are you using?
+ options:
+ - master branch (0.x version, such as `v0.10.0`, or `dev` branch)
+ - 1.x branch (1.x version, such as `v1.0.0rc2`, or `dev-1.x` branch)
+ validations:
+ required: true
+
+ - type: checkboxes
+ attributes:
+ label: Prerequisite
+ description: Please check the following items before creating a new issue.
+ options:
+ - label: I have searched [Issues](https://github.com/open-mmlab/mmaction2/issues) and [Discussions](https://github.com/open-mmlab/mmaction2/discussions) but cannot get the expected help.
+ required: true
+ - label: I have read the [documentation](https://mmaction2.readthedocs.io/en/latest/) but cannot get the expected help.
+ required: true
+ - label: The bug has not been fixed in the [latest version](https://github.com/open-mmlab/mmaction2).
+ required: true
+
+ - type: textarea
+ attributes:
+ label: Environment
+ description: |
+ Please run `python mmaction/utils/collect_env.py` to collect necessary environment information and copy-paste it here.
+ You may add additional information that may be helpful for locating the problem, such as
+ - How you installed PyTorch \[e.g., pip, conda, source\]
+ - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.)
+ validations:
+ required: true
+
+ - type: textarea
+ id: description
+ validations:
+ required: true
+ attributes:
+ label: Describe the bug
+ description: |
+ Please provide a clear and concise description of what the bug is.
+ Preferably, provide a simple and minimal code snippet below so that we can reproduce the error by running it.
+ placeholder: |
+ A clear and concise description of what the bug is.
+
+ - type: textarea
+ attributes:
+ label: Reproduces the problem - code sample
+ description: |
+ Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
+ Did you make any modifications to the code or config? Are you clear about what you have modified?
+ placeholder: |
+ ```python
+ # Sample code to reproduce the problem
+ ```
+
+ - type: textarea
+ attributes:
+ label: Reproduces the problem - command or script
+ description: |
+ What command or script did you run?
+ placeholder: |
+ ```shell
+ The command or script you run.
+ ```
+
+ - type: textarea
+ attributes:
+ label: Reproduces the problem - error message
+ description: |
+ Please provide the error message or logs you got, with the full traceback.
+ placeholder: |
+ ```
+ The error message or logs you got, with the full traceback.
+ ```
+
+ - type: textarea
+ attributes:
+ label: Additional information
+ description: Tell us anything else you think we should know.
+ placeholder: |
+ 1. What's your expected result?
+ 2. What dataset did you use?
+ 3. What do you think might be the reason?
diff --git a/.github/ISSUE_TEMPLATE/2-feature-request.yml b/.github/ISSUE_TEMPLATE/2-feature-request.yml
new file mode 100644
index 0000000000..c32c477133
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/2-feature-request.yml
@@ -0,0 +1,33 @@
+name: 🚀 Feature Request
+description: Suggest an idea for this project
+labels: [Feature]
+title: "[Feature] "
+
+body:
+ - type: markdown
+ attributes:
+ value: |
+ We strongly appreciate you creating a PR to implement this feature [here](https://github.com/open-mmlab/mmaction2/pulls)!
+ If you need our help, please fill in as much of the following form as you're able.
+
+ **The less clear the description, the longer it will take to solve it.**
+
+ - type: textarea
+ attributes:
+ label: What is the problem this feature will solve?
+ placeholder: |
+ E.g., It is inconvenient when \[....\].
+ validations:
+ required: true
+
+ - type: textarea
+ attributes:
+ label: What is the feature?
+ validations:
+ required: true
+
+ - type: textarea
+ attributes:
+ label: What alternatives have you considered?
+ description: |
+ Add any other context or screenshots about the feature request here.
diff --git a/.github/ISSUE_TEMPLATE/3-documentation.yml b/.github/ISSUE_TEMPLATE/3-documentation.yml
new file mode 100644
index 0000000000..f47353edd4
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/3-documentation.yml
@@ -0,0 +1,23 @@
+name: 📚 Documentation Issue
+description: Report an issue related to https://mmaction2.readthedocs.io/en/latest/.
+labels: "Documentation"
+title: "[Docs] "
+
+body:
+- type: textarea
+ attributes:
+ label: The doc issue
+ description: >
+ A clear and concise description of what content in https://mmaction2.readthedocs.io/en/latest/ is an issue.
+ validations:
+ required: true
+
+- type: textarea
+ attributes:
+ label: Suggest a potential alternative/fix
+ description: >
+ Tell us how we could improve the documentation in this regard.
+- type: markdown
+ attributes:
+ value: >
+ Thanks for contributing 🎉!
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
index a772220430..d41e7bd45f 100644
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,9 +1,12 @@
blank_issues_enabled: false
contact_links:
- - name: Common Issues
- url: https://mmaction2.readthedocs.io/en/latest/faq.html
- about: Check if your issue already has solutions
- - name: MMAction2 Documentation
- url: https://mmaction2.readthedocs.io/en/latest/
+ - name: 📚 MMAction2 Documentation (官方文档)
+ url: https://mmaction2.readthedocs.io/en/latest
about: Check if your question is answered in docs
+ - name: 💬 Forum (寻求帮助)
+ url: https://github.com/open-mmlab/mmaction2/discussions
+ about: Ask general usage questions and discuss with other MMAction2 community members
+ - name: 🌐 Explore OpenMMLab (官网)
+ url: https://openmmlab.com/
+ about: Get to know more about OpenMMLab
diff --git a/.github/ISSUE_TEMPLATE/error-report.md b/.github/ISSUE_TEMPLATE/error-report.md
deleted file mode 100644
index 60206eaba2..0000000000
--- a/.github/ISSUE_TEMPLATE/error-report.md
+++ /dev/null
@@ -1,56 +0,0 @@
----
-name: Error report
-about: Create a report to help us improve
-title: ''
-labels: ''
-assignees: ''
----
-
-Thanks for your error report and we appreciate it a lot.
-If you feel we have helped you, give us a STAR! :satisfied:
-
-**Checklist**
-
-1. I have searched related issues but cannot get the expected help.
-2. The bug has not been fixed in the latest version.
-
-**Describe the bug**
-
-A clear and concise description of what the bug is.
-
-**Reproduction**
-
-- What command or script did you run?
-
-```
-A placeholder for the command.
-```
-
-- What config did you run?
-
-```
-A placeholder for the config.
-```
-
-- Did you make any modifications on the code or config? Did you understand what you have modified?
-- What dataset did you use?
-
-**Environment**
-
-1. Please run `PYTHONPATH=${PWD}:$PYTHONPATH python mmaction/utils/collect_env.py` to collect necessary environment information and paste it here.
-2. You may add addition that may be helpful for locating the problem, such as
-
-- How you installed PyTorch \[e.g., pip, conda, source\]
-- Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.)
-
-**Error traceback**
-
-If applicable, paste the error traceback here.
-
-```
-A placeholder for traceback.
-```
-
-**Bug fix**
-
-If you have already identified the reason, you can provide the information here. If you are willing to create a PR to fix it, please also leave a comment here and that would be much appreciated!
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
deleted file mode 100644
index 81ce7f60be..0000000000
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ /dev/null
@@ -1,33 +0,0 @@
----
-name: Feature request
-about: Suggest an idea for this project
-title: ''
-labels: ''
-assignees: ''
----
-
-Thanks for your feature request and we will review and plan for it when necessary.
-If you feel we have helped you, give us a STAR! :satisfied:
-
-**Steps**
-
-1. Check if the feature has been requested in the [meta issue](https://github.com/open-mmlab/mmaction2/issues/19), and if so, click thumb up button.
-2. Post the feature request in the [meta issue](https://github.com/open-mmlab/mmaction2/issues/19), if it is new.
-
-**Describe the feature**
-
-**Motivation**
-
-A clear and concise description of the motivation of the feature.
-
-1. Ex1. It is inconvenient when \[....\].
-2. Ex2. There is a recent paper \[....\], which is very helpful for \[....\].
-
-**Related resources**
-
-If there is an official code released or third-party implementations, please also provide the information here, which would be very helpful.
-
-**Additional context**
-
-Add any other context or screenshots about the feature request here.
-If you would like to implement the feature and create a PR, please leave a comment here and that would be much appreciated.
diff --git a/.github/ISSUE_TEMPLATE/general_questions.md b/.github/ISSUE_TEMPLATE/general_questions.md
deleted file mode 100644
index 5aa583cb1c..0000000000
--- a/.github/ISSUE_TEMPLATE/general_questions.md
+++ /dev/null
@@ -1,14 +0,0 @@
----
-name: General questions
-about: Ask general questions to get help
-title: ''
-labels: ''
-assignees: ''
----
-
-Before raising a question, you may need to check the following listed items.
-
-**Checklist**
-
-1. I have searched related issues but cannot get the expected help.
-2. I have read the [FAQ documentation](https://mmaction2.readthedocs.io/en/latest/faq.html) but cannot get the expected help.
diff --git a/.github/ISSUE_TEMPLATE/reimplementation_questions.md b/.github/ISSUE_TEMPLATE/reimplementation_questions.md
deleted file mode 100644
index 683e5d6fa4..0000000000
--- a/.github/ISSUE_TEMPLATE/reimplementation_questions.md
+++ /dev/null
@@ -1,70 +0,0 @@
----
-name: Reimplementation Questions
-about: Ask about questions during model reimplementation
-title: ''
-labels: reimplementation
-assignees: ''
----
-
-If you feel we have helped you, give us a STAR! :satisfied:
-
-**Notice**
-
-There are several common situations in the reimplementation issues as below
-
-1. Reimplement a model in the model zoo using the provided configs.
-2. Reimplement a model in the model zoo on other dataset (e.g., custom datasets).
-3. Reimplement a custom model but all the components are implemented in MMAction2.
-4. Reimplement a custom model with new modules implemented by yourself.
-
-There are several things to do for different cases as below.
-
-- For case 1 & 3, please follow the steps in the following sections thus we could help to quick identify the issue.
-- For case 2 & 4, please understand that we are not able to do much help here because we usually do not know the full code and the users should be responsible to the code they write.
-- One suggestion for case 2 & 4 is that the users should first check whether the bug lies in the self-implemented code or the original code. For example, users can first make sure that the same model runs well on supported datasets. If you still need help, please describe what you have done and what you obtain in the issue, and follow the steps in the following sections and try as clear as possible so that we can better help you.
-
-**Checklist**
-
-1. I have searched related issues but cannot get the expected help.
-2. The issue has not been fixed in the latest version.
-
-**Describe the issue**
-
-A clear and concise description of what the problem you meet and what have you done.
-
-**Reproduction**
-
-- What command or script did you run?
-
-```
-A placeholder for the command.
-```
-
-- What config dir you run?
-
-```
-A placeholder for the config.
-```
-
-- Did you make any modifications on the code or config? Did you understand what you have modified?
-- What dataset did you use?
-
-**Environment**
-
-1. Please run `PYTHONPATH=${PWD}:$PYTHONPATH python mmaction/utils/collect_env.py` to collect necessary environment information and paste it here.
-2. You may add addition that may be helpful for locating the problem, such as
-
-- How you installed PyTorch \[e.g., pip, conda, source\]
-- Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.)
-
-**Results**
-
-If applicable, paste the related results here, e.g., what you expect and what you get.
-
-```
-A placeholder for results comparison
-```
-
-**Issue fix**
-
-If you have already identified the reason, you can provide the information here. If you are willing to create a PR to fix it, please also leave a comment here and that would be much appreciated!
diff --git a/.github/workflows/merge_stage_test.yml b/.github/workflows/merge_stage_test.yml
index 8c9862d049..60df0a1245 100644
--- a/.github/workflows/merge_stage_test.yml
+++ b/.github/workflows/merge_stage_test.yml
@@ -18,7 +18,7 @@ concurrency:
jobs:
build_cpu_py:
- runs-on: ubuntu-18.04
+ runs-on: ubuntu-22.04
strategy:
matrix:
python-version: [3.8, 3.9]
@@ -27,16 +27,15 @@ jobs:
- torch: 1.8.1
torchvision: 0.9.1
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v2
+ uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Upgrade pip
run: pip install pip --upgrade
- name: Install Libraries
run: |
- sudo add-apt-repository ppa:savoury1/ffmpeg4
sudo apt-get update
sudo apt-get upgrade
sudo apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libxrender-dev libturbojpeg pkg-config
@@ -50,13 +49,13 @@ jobs:
- name: Install unittest dependencies
run: pip install -r requirements.txt
- name: Install PyTorch
- run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html
+ run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
- name: Install MMEngine
run: pip install git+https://github.com/open-mmlab/mmengine.git@main
- name: Install MMCV
run: |
pip install -U openmim
- mim install 'mmcv >= 2.0.0rc1'
+ mim install 'mmcv >= 2.0.0'
- name: Install MMDet
run: pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install MMCls
@@ -65,7 +64,7 @@ jobs:
run: pip install pytorchvideo
if: ${{matrix.torchvision == '0.10.0'}}
- name: Install timm
- run: python -m pip install timm
+ run: pip install timm
- name: Build and install
run: rm -rf .eggs && pip install -e .
- name: Run unittests and generate coverage report
@@ -75,7 +74,7 @@ jobs:
coverage report -m
build_cpu_pt:
- runs-on: ubuntu-18.04
+ runs-on: ubuntu-22.04
strategy:
matrix:
python-version: [3.7]
@@ -96,9 +95,9 @@ jobs:
- torch: 1.12.1
torchvision: 0.13.1
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v2
+ uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Upgrade pip
@@ -110,21 +109,21 @@ jobs:
- name: Install lmdb
run: pip install lmdb
- name: Install timm
- run: python -m pip install timm==0.6.7
+ run: pip install timm==0.6.7
if: ${{matrix.torch == '1.6.0'}}
- name: Install timm
- run: python -m pip install timm
+ run: pip install timm
if: ${{matrix.torch != '1.6.0'}}
- name: Install TurboJpeg lib
run: sudo apt-get install -y libturbojpeg
- name: Install PyTorch
- run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html
+ run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
- name: Install MMEngine
run: pip install git+https://github.com/open-mmlab/mmengine.git@main
- name: Install MMCV
run: |
pip install -U openmim
- mim install 'mmcv >= 2.0.0rc1'
+ mim install 'mmcv >= 2.0.0'
- name: Install MMDet
run: pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install MMCls
@@ -153,7 +152,7 @@ jobs:
fail_ci_if_error: false
build_cu102:
- runs-on: ubuntu-18.04
+ runs-on: ubuntu-22.04
container:
image: pytorch/pytorch:1.8.1-cuda10.2-cudnn7-devel
strategy:
@@ -163,9 +162,9 @@ jobs:
- torch: 1.8.1
cuda: 10.2
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v2
+ uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Upgrade pip
@@ -181,36 +180,37 @@ jobs:
run: |
apt-get update && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libturbojpeg libsndfile1 libsm6 libxrender-dev libxext6
- name: Install librosa and soundfile
- run: python -m pip install librosa soundfile
+ run: pip install librosa soundfile
- name: Install lmdb
- run: python -m pip install lmdb
+ run: pip install lmdb
- name: Install mmaction dependencies
run: |
pip install git+https://github.com/open-mmlab/mmengine.git@main
pip install -U openmim
- mim install 'mmcv >= 2.0.0rc1'
+ mim install 'mmcv >= 2.0.0'
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
pip install -r requirements.txt
- name: Install PytorchVideo
- run: python -m pip install pytorchvideo
+ run: pip install pytorchvideo
if: ${{matrix.cuda == '10.2'}}
- name: Build and install
run: |
- python setup.py check -m -s
- TORCH_CUDA_ARCH_LIST=7.0 pip install -e .
+ pip install -e .
build_windows:
- runs-on: ${{ matrix.os }}
+ runs-on: windows-2022
strategy:
matrix:
os: [windows-2022]
python: [3.7]
platform: [cpu, cu111]
+ torch: [1.8.1]
+ torchvision: [0.9.1]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python }}
- uses: actions/setup-python@v2
+ uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python }}
- name: Upgrade pip
@@ -220,12 +220,12 @@ jobs:
- name: Install lmdb
run: pip install lmdb
- name: Install PyTorch
- run: pip install torch==1.8.1+${{matrix.platform}} torchvision==0.9.1+${{matrix.platform}} -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
+ run: pip install torch==${{matrix.torch}}+${{matrix.platform}} torchvision==${{matrix.torchvision}}+${{matrix.platform}} -f https://download.pytorch.org/whl/${{matrix.platform}}/torch_stable.html
- name: Install mmaction dependencies
run: |
pip install git+https://github.com/open-mmlab/mmengine.git@main
pip install -U openmim
- mim install 'mmcv >= 2.0.0rc1'
+ mim install 'mmcv >= 2.0.0'
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
pip install -r requirements.txt
diff --git a/.github/workflows/pr_stage_test.yml b/.github/workflows/pr_stage_test.yml
index 74c1145b5c..a8b5c4c7a2 100644
--- a/.github/workflows/pr_stage_test.yml
+++ b/.github/workflows/pr_stage_test.yml
@@ -16,7 +16,7 @@ concurrency:
jobs:
build_cpu:
- runs-on: ubuntu-18.04
+ runs-on: ubuntu-22.04
strategy:
matrix:
python-version: [3.7]
@@ -24,9 +24,9 @@ jobs:
- torch: 1.8.1
torchvision: 0.9.1
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v2
+ uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Upgrade pip
@@ -40,13 +40,13 @@ jobs:
- name: Install TurboJpeg lib
run: sudo apt-get install -y libturbojpeg
- name: Install PyTorch
- run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html
+ run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
- name: Install MMEngine
run: pip install git+https://github.com/open-mmlab/mmengine.git@main
- name: Install MMCV
run: |
pip install -U openmim
- mim install 'mmcv >= 2.0.0rc1'
+ mim install 'mmcv >= 2.0.0'
- name: Install MMDet
run: pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
- name: Install MMCls
@@ -77,9 +77,11 @@ jobs:
fail_ci_if_error: false
build_cu102:
- runs-on: ubuntu-18.04
+ runs-on: ubuntu-22.04
container:
image: pytorch/pytorch:1.8.1-cuda10.2-cudnn7-devel
+ env:
+ MKL_THREADING_LAYER: GNU
strategy:
matrix:
python-version: [3.7]
@@ -87,9 +89,9 @@ jobs:
- torch: 1.8.1
cuda: 10.2
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v2
+ uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Upgrade pip
@@ -105,24 +107,23 @@ jobs:
run: |
apt-get update && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libturbojpeg libsndfile1 libsm6 libxrender-dev libxext6
- name: Install librosa and soundfile
- run: python -m pip install librosa soundfile
+ run: pip install librosa soundfile
- name: Install lmdb
- run: python -m pip install lmdb
+ run: pip install lmdb
- name: Install mmaction dependencies
run: |
pip install git+https://github.com/open-mmlab/mmengine.git@main
pip install -U openmim
- mim install 'mmcv >= 2.0.0rc1'
+ mim install 'mmcv >= 2.0.0'
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
pip install -r requirements.txt
- name: Install PytorchVideo
- run: python -m pip install pytorchvideo
+ run: pip install pytorchvideo
if: ${{matrix.cuda == '10.2'}}
- name: Build and install
run: |
- python setup.py check -m -s
- TORCH_CUDA_ARCH_LIST=7.0 pip install -e .
+ pip install -e . -v
- name: Run unittests and generate coverage report
run: |
coverage run --branch --source mmaction -m pytest tests/ -k 'not timm'
@@ -130,16 +131,18 @@ jobs:
coverage report -m
build_windows:
- runs-on: ${{ matrix.os }}
+ runs-on: windows-2022
strategy:
matrix:
os: [windows-2022]
- python: [3.7]
+ python: [3.7]
+ torch: [1.8.1]
+ torchvision: [0.9.1]
platform: [cpu, cu111]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python }}
- uses: actions/setup-python@v2
+ uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python }}
- name: Upgrade pip
@@ -151,14 +154,14 @@ jobs:
- name: Install lmdb
run: pip install lmdb
- name: Install PyTorch
- run: pip install torch==1.8.1+${{matrix.platform}} torchvision==0.9.1+${{matrix.platform}} -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
+ run: pip install torch==${{matrix.torch}}+${{matrix.platform}} torchvision==${{matrix.torchvision}}+${{matrix.platform}} -f https://download.pytorch.org/whl/${{matrix.platform}}/torch_stable.html
- name: Install timm
run: python -m pip install timm
- name: Install mmaction dependencies
run: |
pip install git+https://github.com/open-mmlab/mmengine.git@main
pip install -U openmim
- mim install 'mmcv >= 2.0.0rc1'
+ mim install 'mmcv >= 2.0.0'
pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
pip install -r requirements.txt
@@ -166,7 +169,7 @@ jobs:
run: python -m pip install pytorchvideo
- name: Build and install
run: |
- pip install -e .
+ pip install -e . -v
- name: Run unittests and generate coverage report
run: |
pytest tests/
diff --git a/.gitignore b/.gitignore
index b2c1be8fa6..1d637fa156 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,7 +65,7 @@ instance/
.scrapy
# Sphinx documentation
-docs/_build/
+docs/*/_build/
# PyBuilder
target/
@@ -113,6 +113,8 @@ venv.bak/
*.log.json
benchlist.txt
work_dirs/
+/projects/*/work_dirs
+/projects/*/data
.DS_Store
# Pytorch
diff --git a/.readthedocs.yml b/.readthedocs.yml
index 6cfbf5d310..070c61832b 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -1,6 +1,7 @@
version: 2
-formats: all
+formats:
+ - epub
python:
version: 3.7
diff --git a/README.md b/README.md
index b380d548de..064d4526f5 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@
-[![Documentation](https://readthedocs.org/projects/mmaction2/badge/?version=latest)](https://mmaction2.readthedocs.io/en/1.x/)
+[![Documentation](https://readthedocs.org/projects/mmaction2/badge/?version=latest)](https://mmaction2.readthedocs.io/en/latest/)
[![actions](https://github.com/open-mmlab/mmaction2/workflows/build/badge.svg)](https://github.com/open-mmlab/mmaction2/actions)
[![codecov](https://codecov.io/gh/open-mmlab/mmaction2/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmaction2)
[![PyPI](https://img.shields.io/pypi/v/mmaction2)](https://pypi.org/project/mmaction2/)
@@ -25,42 +25,84 @@
[![Average time to resolve an issue](https://isitmaintained.com/badge/resolution/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues)
[![Percentage of issues still open](https://isitmaintained.com/badge/open/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues)
-[📘Documentation](https://mmaction2.readthedocs.io/en/1.x/) |
-[🛠️Installation](https://mmaction2.readthedocs.io/en/1.x/get_started.html) |
-[👀Model Zoo](https://mmaction2.readthedocs.io/en/1.x/modelzoo.html) |
-[🆕Update News](https://mmaction2.readthedocs.io/en/1.x/notes/changelog.html) |
+[📘Documentation](https://mmaction2.readthedocs.io/en/latest/) |
+[🛠️Installation](https://mmaction2.readthedocs.io/en/latest/get_started.html) |
+[👀Model Zoo](https://mmaction2.readthedocs.io/en/latest/modelzoo.html) |
+[🆕Update News](https://mmaction2.readthedocs.io/en/latest/notes/changelog.html) |
[🚀Ongoing Projects](https://github.com/open-mmlab/mmaction2/projects) |
[🤔Reporting Issues](https://github.com/open-mmlab/mmaction2/issues/new/choose)
-## Introduction
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-MMAction2 is an open-source toolbox for video understanding based on PyTorch.
-It is a part of the [OpenMMLab](http://openmmlab.org/) project.
+English | [简体中文](/README_zh-CN.md)
+
+## 📄 Table of Contents
+
+- [📄 Table of Contents](#-table-of-contents)
+- [🥳 🚀 What's New](#--whats-new-)
+- [📖 Introduction](#-introduction-)
+- [🎁 Major Features](#-major-features-)
+- [🛠️ Installation](#️-installation-)
+- [👀 Model Zoo](#-model-zoo-)
+- [👨🏫 Get Started](#-get-started-)
+- [🎫 License](#-license-)
+- [🖊️ Citation](#️-citation-)
+- [🙌 Contributing](#-contributing-)
+- [🤝 Acknowledgement](#-acknowledgement-)
+- [🏗️ Projects in OpenMMLab](#️-projects-in-openmmlab-)
+
+## 🥳 🚀 What's New [🔝](#-table-of-contents)
-The 1.x branch works with **PyTorch 1.6+**.
+**The default branch has been switched to `1.x` from `master`, and we encourage users to migrate to the latest version with more supported models, stronger pre-training checkpoints and simpler coding. Please refer to [Migration Guide](https://mmaction2.readthedocs.io/en/latest/migration.html) for more details.**
+
+**Release (2023.04.06)**: v1.0.0 with the following new features:
+
+- Support RGB-PoseC3D (CVPR'2022).
+- Support training UniFormer V2 (arXiv'2022).
+- Support MSG3D (CVPR'2020) and CTRGCN (CVPR'2021) in projects.
+- Refactor and provide more user-friendly documentation.
+
+## 📖 Introduction [🔝](#-table-of-contents)
+
+MMAction2 is an open-source toolbox for video understanding based on PyTorch.
+It is a part of the [OpenMMLab](https://openmmlab.com/) project.
-
-
-
Action Recognition Results on Kinetics-400
-
-
-
-
Skeleton-based Action Recognition Results on NTU-RGB+D-120
-
+
+
+
Action Recognition on Kinetics-400 (left) and Skeleton-based Action Recognition on NTU-RGB+D-120 (right)
+
Skeleton-based Spatio-Temporal Action Detection and Action Recognition Results on Kinetics-400
-
+
Spatio-Temporal Action Detection Results on AVA-2.1
-## Major Features
+## 🎁 Major Features [🔝](#-table-of-contents)
- **Modular design**: We decompose a video understanding framework into different components. One can easily construct a customized video understanding framework by combining different modules.
@@ -68,19 +110,38 @@ The 1.x branch works with **PyTorch 1.6+**.
- **Well tested and documented**: We provide detailed documentation and API reference, as well as unit tests.
-## What's New
+## 🛠️ Installation [🔝](#-table-of-contents)
+
+MMAction2 depends on [PyTorch](https://pytorch.org/), [MMCV](https://github.com/open-mmlab/mmcv), [MMEngine](https://github.com/open-mmlab/mmengine), [MMDetection](https://github.com/open-mmlab/mmdetection) (optional, for spatio-temporal detection tasks) and [MMPose](https://github.com/open-mmlab/mmpose) (optional, for skeleton-based tasks).
-**Release (2022.02.10)**: v1.0.0rc3 with the following new features:
+Please refer to [install.md](https://mmaction2.readthedocs.io/en/latest/get_started.html) for detailed instructions.
+
+
+Quick instructions
+
+```shell
+conda create --name openmmlab python=3.8 -y
+conda activate openmmlab
+conda install pytorch torchvision -c pytorch # This command will automatically install the latest version of PyTorch and cudatoolkit; please check whether they match your environment.
+pip install -U openmim
+mim install mmengine 'mmcv>=2.0.0'
+mim install "mmdet>=3.0.0" # optional
+mim install "mmpose>=1.0.0" # optional
+git clone https://github.com/open-mmlab/mmaction2.git
+cd mmaction2
+git checkout 1.x
+pip3 install -e .
+```
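+
+If the installation succeeds, the one-liner below is a quick sanity check (it assumes the package exposes `__version__`, as other OpenMMLab packages do):
+
+```shell
+python -c "import mmaction; print(mmaction.__version__)"
+```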
-- Support Action Recognition model UniFormer V1(ICLR'2022), UniFormer V2(Arxiv'2022).
-- Support training MViT V2(CVPR'2022), and MaskFeat(CVPR'2022) fine-tuning.
-- Add a new handy interface for inference MMAction2 models ([demo](https://github.com/open-mmlab/mmaction2/blob/dev-1.x/demo/README.md#inferencer))
+
-## Installation
+## 👀 Model Zoo [🔝](#-table-of-contents)
-Please refer to [install.md](https://mmaction2.readthedocs.io/en/1.x/get_started.html) for more detailed instructions.
+Results and models are available in the [model zoo](https://mmaction2.readthedocs.io/en/latest/modelzoo.html).
-## Supported Methods
+
+
+Supported model
@@ -125,7 +186,6 @@ Please refer to [install.md](https://mmaction2.readthedocs.io/en/1.x/get_started
Action Localization |
- SSN (ICCV'2017) |
BSN (ECCV'2018) |
BMN (ICCV'2019) |
|
@@ -149,17 +209,19 @@ Please refer to [install.md](https://mmaction2.readthedocs.io/en/1.x/get_started
2s-AGCN (CVPR'2019) |
PoseC3D (CVPR'2022) |
STGCN++ (ArXiv'2022) |
+ CTRGCN (CVPR'2021) |
+
+
+ MSG3D (CVPR'2020) |
|
-Results and models are available in the *README.md* of each method's config directory.
-A summary can be found on the [**model zoo**](https://mmaction2.readthedocs.io/en/1.x/modelzoo.html) page.
+
-We will keep up with the latest progress of the community and support more popular algorithms and frameworks.
-If you have any feature requests, please feel free to leave a comment in [Issues](https://github.com/open-mmlab/mmaction2/issues/19).
+
-## Supported Datasets
+Supported dataset
@@ -218,31 +280,32 @@ If you have any feature requests, please feel free to leave a comment in [Issues
-Datasets marked with * are not fully supported yet, but related dataset preparation steps are provided. A summary can be found on the [**Supported Datasets**](https://mmaction2.readthedocs.io/en/latest/supported_datasets.html) page.
-
-## Data Preparation
-
-Please refer to [data_preparation.md](docs/en/user_guides/2_data_prepare.md) for a general knowledge of data preparation.
+
-## FAQ
+## 👨🏫 Get Started [🔝](#-table-of-contents)
-Please refer to [FAQ](docs/en/notes/faq.md) for frequently asked questions.
+For tutorials, we provide the following user guides for basic usage:
-## Projects built on MMAction2
+- [Migration from MMAction2 0.X](https://mmaction2.readthedocs.io/en/latest/migration.html)
+- [Learn about Configs](https://mmaction2.readthedocs.io/en/latest/user_guides/1_config.html#)
+- [Prepare Datasets](https://mmaction2.readthedocs.io/en/latest/user_guides/2_data_prepare.html)
+- [Inference with Existing Models](https://mmaction2.readthedocs.io/en/latest/user_guides/3_inference.html)
+- [Training and Testing](https://mmaction2.readthedocs.io/en/latest/user_guides/4_train_test.html)
-Currently, there are many research works and projects built on MMAction2 by users from community, such as:
+
+Research works built on MMAction2 by users from the community
- Video Swin Transformer. [\[paper\]](https://arxiv.org/abs/2106.13230)[\[github\]](https://github.com/SwinTransformer/Video-Swin-Transformer)
- Evidential Deep Learning for Open Set Action Recognition, ICCV 2021 **Oral**. [\[paper\]](https://arxiv.org/abs/2107.10161)[\[github\]](https://github.com/Cogito2012/DEAR)
- Rethinking Self-supervised Correspondence Learning: A Video Frame-level Similarity Perspective, ICCV 2021 **Oral**. [\[paper\]](https://arxiv.org/abs/2103.17263)[\[github\]](https://github.com/xvjiarui/VFS)
-etc., check [projects.md](docs/en/notes/projects.md) to see all related projects.
+
-## License
+## 🎫 License [🔝](#-table-of-contents)
This project is released under the [Apache 2.0 license](LICENSE).
-## Citation
+## 🖊️ Citation [🔝](#-table-of-contents)
If you find this project useful in your research, please consider citing it:
@@ -255,17 +318,17 @@ If you find this project useful in your research, please consider cite:
}
```
-## Contributing
+## 🙌 Contributing [🔝](#-table-of-contents)
We appreciate all contributions to improve MMAction2. Please refer to [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/2.x/CONTRIBUTING.md) in MMCV for more details about the contributing guideline.
-## Acknowledgement
+## 🤝 Acknowledgement [🔝](#-table-of-contents)
MMAction2 is an open-source project that is contributed by researchers and engineers from various colleges and companies.
We appreciate all the contributors who implement their methods or add new features and users who give valuable feedback.
We wish that the toolbox and benchmark could serve the growing research community by providing a flexible toolkit to reimplement existing methods and develop their new models.
-## Projects in OpenMMLab
+## 🏗️ Projects in OpenMMLab [🔝](#-table-of-contents)
- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models.
- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision.
@@ -273,6 +336,7 @@ We wish that the toolbox and benchmark could serve the growing research communit
- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark.
- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark.
- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.
+- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark.
- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark.
- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark.
- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox.
diff --git a/README_zh-CN.md b/README_zh-CN.md
new file mode 100644
index 0000000000..493d2da15e
--- /dev/null
+++ b/README_zh-CN.md
@@ -0,0 +1,324 @@
+
+
+
+
+
+[![Documentation](https://readthedocs.org/projects/mmaction2/badge/?version=latest)](https://mmaction2.readthedocs.io/en/latest/)
+[![actions](https://github.com/open-mmlab/mmaction2/workflows/build/badge.svg)](https://github.com/open-mmlab/mmaction2/actions)
+[![codecov](https://codecov.io/gh/open-mmlab/mmaction2/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmaction2)
+[![PyPI](https://img.shields.io/pypi/v/mmaction2)](https://pypi.org/project/mmaction2/)
+[![LICENSE](https://img.shields.io/github/license/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/blob/master/LICENSE)
+[![Average time to resolve an issue](https://isitmaintained.com/badge/resolution/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues)
+[![Percentage of issues still open](https://isitmaintained.com/badge/open/open-mmlab/mmaction2.svg)](https://github.com/open-mmlab/mmaction2/issues)
+
+[📘文档](https://mmaction2.readthedocs.io/zh_CN/1.x/) |
+[🛠️安装指南](https://mmaction2.readthedocs.io/zh_CN/1.x/get_started.html) |
+[👀模型库](https://mmaction2.readthedocs.io/zh_CN/1.x/modelzoo.html) |
+[🆕更新](https://mmaction2.readthedocs.io/zh_CN/1.x/notes/changelog.html) |
+[🚀进行中项目](https://github.com/open-mmlab/mmaction2/projects) |
+[🤔问题反馈](https://github.com/open-mmlab/mmaction2/issues/new/choose)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+[English](/README.md) | 简体中文
+
+## 简介
+
+MMAction2 是一款基于 PyTorch 的视频理解开源工具箱,是 [OpenMMLab](https://openmmlab.com/) 项目的成员之一
+
+1.x 分支代码目前支持 **PyTorch 1.6以上** 的版本
+
+
+
+
+
Kinetics-400 上的动作识别
+
+
+
+
NTURGB+D-120 上的基于人体姿态的动作识别
+
+
+
+
+
Kinetics-400 上的基于 skeleton 的时空动作检测和动作识别
+
+
+
+
AVA-2.1 上的时空动作检测
+
+
+## 主要特性
+
+- **模块设计**:MMAction2 将统一的视频理解框架解耦成不同的模块组件,通过组合不同的模块组件,用户可以便捷地构建自定义的视频理解模型
+
+- **支持多种任务和数据集**:MMAction2 支持多种视频理解任务,包括动作识别,时序动作检测,时空动作检测以及基于人体姿态的动作识别
+
+- **详尽的单元测试和文档**:MMAction2 提供了详尽的说明文档,API 接口说明,全面的单元测试,以供社区参考
+
+## 更新记录
+
+**v1.0.0 版本 (2023.04.06)**:
+
+- 支持骨骼动作识别模型 RGB-PoseC3D (CVPR'2022).
+- 在 Projects 中支持 MSG3D (CVPR'2020) 和 CTRGCN (CVPR'2021).
+- 支持训练 UniFormer V2 (arXiv'2022).
+- 重构升级用户文档
+
+## 安装
+
+MMAction2 依赖 [PyTorch](https://pytorch.org/), [MMCV](https://github.com/open-mmlab/mmcv), [MMEngine](https://github.com/open-mmlab/mmengine), [MMDetection](https://github.com/open-mmlab/mmdetection) (可选), [MMPose](https://github.com/open-mmlab/mmpose) (可选),以下是安装的简要步骤。
+更详细的安装指南请参考 [install.md](https://mmaction2.readthedocs.io/zh_CN/1.x/get_started.html) 。
+
+```shell
+conda create --name openmmlab python=3.8 -y
+conda activate openmmlab
+conda install pytorch torchvision -c pytorch # 以上命令将自动安装最新版本的 PyTorch 和 cudatoolkit,请检查它们是否和你的环境匹配
+pip install -U openmim
+mim install mmengine 'mmcv>=2.0.0'
+mim install "mmdet>=3.0.0" # 可选
+mim install "mmpose>=1.0.0" # 可选
+git clone https://github.com/open-mmlab/mmaction2.git
+cd mmaction2
+git checkout 1.x
+pip3 install -e .
+```
+
+## 模型库
+
+
+
+各个模型的结果和设置都可以在对应的 config 目录下的 *README_zh-CN.md* 中查看。整体的概况也可以在 [**模型库**](https://mmaction2.readthedocs.io/zh_CN/1.x/modelzoo.html) 页面中查看。
+
+MMAction2 将跟进学界的最新进展,并支持更多算法和框架。如果您对 MMAction2 有任何功能需求,请随时在 [问题](https://github.com/open-mmlab/mmaction2/issues/19) 中留言。
+
+## 数据集
+
+
+
+标记 * 代表对应数据集并未被完全支持,但提供相应的数据准备步骤。整体的概况也可以在 [**数据集**](https://mmaction2.readthedocs.io/zh_CN/latest/supported_datasets.html) 页面中查看。
+
+## 数据集准备
+
+请参考 [数据准备](https://mmaction2.readthedocs.io/en/latest/user_guides/2_data_prepare.html) 了解数据集准备概况。所有支持的数据集都列于 [数据集清单](https://mmaction2.readthedocs.io/zh_CN/latest/supported_datasets.html) 中。
+
+## FAQ
+
+请参考 [FAQ](docs/zh_cn/notes/faq.md) 了解其他用户的常见问题。
+
+## 相关工作
+
+目前有许多研究工作或工程项目基于 MMAction2 搭建,例如:
+
+- Video Swin Transformer. [\[论文\]](https://arxiv.org/abs/2106.13230)[\[代码\]](https://github.com/SwinTransformer/Video-Swin-Transformer)
+- Evidential Deep Learning for Open Set Action Recognition, ICCV 2021 **Oral**. [\[论文\]](https://arxiv.org/abs/2107.10161)[\[代码\]](https://github.com/Cogito2012/DEAR)
+- Rethinking Self-supervised Correspondence Learning: A Video Frame-level Similarity Perspective, ICCV 2021 **Oral**. [\[论文\]](https://arxiv.org/abs/2103.17263)[\[代码\]](https://github.com/xvjiarui/VFS)
+
+更多详情可见 [相关工作](docs/en/notes/projects.md) 。
+
+## 许可
+
+该项目开源自 [Apache 2.0 license](LICENSE).
+
+## 引用
+
+如果你觉得 MMAction2 对你的研究有所帮助,可以考虑引用它:
+
+```BibTeX
+@misc{2020mmaction2,
+ title={OpenMMLab's Next Generation Video Understanding Toolbox and Benchmark},
+ author={MMAction2 Contributors},
+ howpublished = {\url{https://github.com/open-mmlab/mmaction2}},
+ year={2020}
+}
+```
+
+## 参与贡献
+
+我们非常欢迎用户对于 MMAction2 做出的任何贡献,可以参考 [贡献指南](https://github.com/open-mmlab/mmcv/blob/2.x/CONTRIBUTING_zh-CN.md) 文件了解更多细节。
+
+## 致谢
+
+MMAction2 是一款由不同学校和公司共同贡献的开源项目。我们感谢所有为项目提供算法复现和新功能支持的贡献者,以及提供宝贵反馈的用户。
+我们希望该工具箱和基准测试可以为社区提供灵活的代码工具,供用户复现现有算法并开发自己的新模型,从而不断为开源社区提供贡献。
+
+## OpenMMLab 的其他项目
+
+- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models.
+- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision.
+- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages.
+- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark.
+- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark.
+- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.
+- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO 系列工具箱和基准测试
+- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark.
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark.
+- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox.
+- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark.
+- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark.
+- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark.
+- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark.
+- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark.
+- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark.
+- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark.
+- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark.
+- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox.
+- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox.
+- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework.
diff --git a/configs/detection/_base_/models/slowonly_r50.py b/configs/detection/_base_/models/slowonly_r50.py
deleted file mode 100644
index 4a06a4ab53..0000000000
--- a/configs/detection/_base_/models/slowonly_r50.py
+++ /dev/null
@@ -1,54 +0,0 @@
-url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/'
- 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-'
- 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_'
- 'kinetics400-rgb_20220901-e7b65fad.pth')
-
-model = dict(
- type='FastRCNN',
- _scope_='mmdet',
- init_cfg=dict(type='Pretrained', checkpoint=url),
- backbone=dict(
- type='ResNet3dSlowOnly',
- depth=50,
- pretrained=None,
- pretrained2d=False,
- lateral=False,
- num_stages=4,
- conv1_kernel=(1, 7, 7),
- conv1_stride_t=1,
- pool1_stride_t=1,
- spatial_strides=(1, 2, 2, 1)),
- roi_head=dict(
- type='AVARoIHead',
- bbox_roi_extractor=dict(
- type='SingleRoIExtractor3D',
- roi_layer_type='RoIAlign',
- output_size=8,
- with_temporal_pool=True),
- bbox_head=dict(
- type='BBoxHeadAVA',
- in_channels=2048,
- num_classes=81,
- multilabel=True,
- dropout_ratio=0.5)),
- data_preprocessor=dict(
- type='ActionDataPreprocessor',
- _scope_='mmaction',
- mean=[123.675, 116.28, 103.53],
- std=[58.395, 57.12, 57.375],
- format_shape='NCTHW'),
- train_cfg=dict(
- rcnn=dict(
- assigner=dict(
- type='MaxIoUAssignerAVA',
- pos_iou_thr=0.9,
- neg_iou_thr=0.9,
- min_pos_iou=0.9),
- sampler=dict(
- type='RandomSampler',
- num=32,
- pos_fraction=1,
- neg_pos_ub=-1,
- add_gt_as_proposals=True),
- pos_weight=1.0)),
- test_cfg=dict(rcnn=None))
diff --git a/configs/detection/_base_/models/slowonly_r50_nl.py b/configs/detection/_base_/models/slowonly_r50_nl.py
deleted file mode 100644
index 6dcdc30bfc..0000000000
--- a/configs/detection/_base_/models/slowonly_r50_nl.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# model setting
-model = dict(
- type='mmdet.FastRCNN',
- backbone=dict(
- type='ResNet3dSlowOnly',
- depth=50,
- pretrained=None,
- pretrained2d=False,
- lateral=False,
- num_stages=4,
- conv1_kernel=(1, 7, 7),
- conv1_stride_t=1,
- pool1_stride_t=1,
- spatial_strides=(1, 2, 2, 1),
- norm_cfg=dict(type='BN3d', requires_grad=True),
- non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)),
- non_local_cfg=dict(
- sub_sample=True,
- use_scale=True,
- norm_cfg=dict(type='BN3d', requires_grad=True),
- mode='embedded_gaussian')),
- roi_head=dict(
- type='AVARoIHead',
- bbox_roi_extractor=dict(
- type='SingleRoIExtractor3D',
- roi_layer_type='RoIAlign',
- output_size=8,
- with_temporal_pool=True),
- bbox_head=dict(
- type='BBoxHeadAVA',
- in_channels=2048,
- num_classes=81,
- multilabel=True,
- dropout_ratio=0.5)),
- train_cfg=dict(
- rcnn=dict(
- assigner=dict(
- type='MaxIoUAssignerAVA',
- pos_iou_thr=0.9,
- neg_iou_thr=0.9,
- min_pos_iou=0.9,
- iou_calculator=dict(type='mmdet.BboxOverlaps2D')),
- sampler=dict(
- type='mmdet.RandomSampler',
- num=32,
- pos_fraction=1,
- neg_pos_ub=-1,
- add_gt_as_proposals=True),
- pos_weight=1.0,
- debug=False)),
- test_cfg=dict(rcnn=None))
diff --git a/configs/detection/acrn/README.md b/configs/detection/acrn/README.md
index a9af00da0c..054853c35a 100644
--- a/configs/detection/acrn/README.md
+++ b/configs/detection/acrn/README.md
@@ -20,23 +20,19 @@ Current state-of-the-art approaches for spatio-temporal action localization rely
### AVA2.1
-| frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | gpu_mem(M) | config | ckpt | log |
-| :---------------------: | :--------: | :--: | :---------------: | :----------: | :---: | :--------: | :---------------------------------------: | :-------------------------------------: | :-------------------------------------: |
-| 8x8x1 | raw | 8 | SlowFast ResNet50 | Kinetics-400 | 27.58 | 15263 | [config](/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb_20220906-0dae1a90.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.log) |
+| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log |
+| :---------------------: | :--: | :---------------: | :----------: | :---: | :----------------------------------------------: | :--------------------------------------------: | :-------------------------------------------: |
+| 8x8x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 27.65 | [config](/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb_20220906-0dae1a90.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.log) |
### AVA2.2
-| frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | gpu_mem(M) | config | ckpt | log |
-| :---------------------: | :--------: | :--: | :---------------: | :----------: | :---: | :--------: | :---------------------------------------: | :-------------------------------------: | :-------------------------------------: |
-| 8x8x1 | raw | 8 | SlowFast ResNet50 | Kinetics-400 | 27.63 | 15263 | [config](/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb_20220906-66ec24a2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.log) |
+| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log |
+| :---------------------: | :--: | :---------------: | :----------: | :---: | :----------------------------------------------: | :--------------------------------------------: | :-------------------------------------------: |
+| 8x8x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 27.71 | [config](/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb_20220906-66ec24a2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.log) |
-Note:
+1. The **gpus** indicates the number of GPUs we used to get the checkpoint. If you want to use a different number of GPUs or videos per GPU, the best way is to set `--auto-scale-lr` when calling `tools/train.py`; this parameter will auto-scale the learning rate according to the actual batch size and the original batch size.
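+
+For example (an illustrative invocation using the AVA2.1 config from the Train section below), the flag is simply appended to the usual training command:
+
+```shell
+python tools/train.py configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py \
+    --auto-scale-lr
+```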
-1. The **gpus** indicates the number of gpu we used to get the checkpoint.
- According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU,
- e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
-
-For more details on data preparation, you can refer to to [AVA Data Preparation](/tools/data/ava/README.md).
+For more details on data preparation, you can refer to [AVA](/tools/data/ava/README.md).
## Train
@@ -46,14 +42,14 @@ You can use the following command to train a model.
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
-Example: train ACRN with SlowFast backbone on AVA in a deterministic option.
+Example: train ACRN with SlowFast backbone on AVA2.1 in deterministic mode with periodic validation.
```shell
python tools/train.py configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py \
- --cfg-options randomness.seed=0 randomness.deterministic=True
+ --seed 0 --deterministic
```
-For more details and optional arguments infos, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -63,29 +59,17 @@ You can use the following command to test a model.
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
-Example: test ACRN with SlowFast backbone on AVA and dump the result to a pkl file.
+Example: test ACRN with SlowFast backbone on AVA2.1 and dump the result to a pkl file.
```shell
python tools/test.py configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py \
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
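+
+The dumped `result.pkl` can then be inspected offline; the one-liner below is a minimal sketch, assuming the `.pkl` dump is a plain pickle file:
+
+```shell
+python -c "import pickle; results = pickle.load(open('result.pkl', 'rb')); print(len(results))"
+```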
-For more details and optional arguments infos, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
-
-
-```BibTeX
-@inproceedings{gu2018ava,
- title={Ava: A video dataset of spatio-temporally localized atomic visual actions},
- author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others},
- booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
- pages={6047--6056},
- year={2018}
-}
-```
-
```BibTeX
@inproceedings{sun2018actor,
title={Actor-centric relation network},
diff --git a/configs/detection/acrn/metafile.yml b/configs/detection/acrn/metafile.yml
index 3212cb7dc8..9db11da474 100644
--- a/configs/detection/acrn/metafile.yml
+++ b/configs/detection/acrn/metafile.yml
@@ -1,9 +1,9 @@
Collections:
-- Name: ACRN
- README: configs/detection/acrn/README.md
- Paper:
- URL: https://arxiv.org/abs/1807.10982
- Title: "Actor-Centric Relation Network"
+ - Name: ACRN
+ README: configs/detection/acrn/README.md
+ Paper:
+ URL: https://arxiv.org/abs/1807.10982
+ Title: "Actor-Centric Relation Network"
Models:
- Name: slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb
@@ -14,7 +14,6 @@ Models:
Batch Size: 8
Epochs: 10
Pretrained: Kinetics-400
- Resolution: short-side 320
Training Data: AVA v2.1
Training Resources: 8 GPUs
Modality: RGB
@@ -22,7 +21,7 @@ Models:
- Dataset: AVA v2.1
Task: Action Detection
Metrics:
- mAP: 27.58
+ mAP: 27.65
Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.log
Weights: https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb_20220906-0dae1a90.pth
@@ -34,14 +33,13 @@ Models:
Batch Size: 8
Epochs: 10
Pretrained: Kinetics-400
- Resolution: short-side 320
Training Data: AVA v2.2
Training Resources: 8 GPUs
Modality: RGB
Results:
- - Dataset: AVA v2.1
+ - Dataset: AVA v2.2
Task: Action Detection
Metrics:
- mAP: 27.63
+ mAP: 27.71
Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.log
Weights: https://download.openmmlab.com/mmaction/v1.0/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb_20220906-66ec24a2.pth
diff --git a/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py b/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py
index 641364bcce..10928a96ee 100644
--- a/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py
+++ b/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py
@@ -1,16 +1,16 @@
-_base_ = [
- '../../_base_/default_runtime.py', '../_base_/models/slowonly_r50.py'
-]
+_base_ = '../../_base_/default_runtime.py'
+
+url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/'
+ 'slowfast_r50_8x8x1_256e_kinetics400_rgb/'
+ 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth')
model = dict(
+ type='FastRCNN',
+ _scope_='mmdet',
+ init_cfg=dict(type='Pretrained', checkpoint=url),
backbone=dict(
- _delete_=True,
- type='ResNet3dSlowFast',
- _scope_='mmaction',
- pretrained=(
- 'https://download.openmmlab.com/mmaction/recognition/slowfast/'
- 'slowfast_r50_8x8x1_256e_kinetics400_rgb/'
- 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth'),
+ type='mmaction.ResNet3dSlowFast',
+ pretrained=None,
resample_rate=4,
speed_ratio=4,
channel_ratio=8,
@@ -37,17 +37,44 @@
pool1_stride_t=1,
spatial_strides=(1, 2, 2, 1))),
roi_head=dict(
+ type='AVARoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor3D',
+ roi_layer_type='RoIAlign',
+ output_size=8,
+ with_temporal_pool=True),
shared_head=dict(type='ACRNHead', in_channels=4608, out_channels=2304),
- bbox_head=dict(in_channels=2304)))
+ bbox_head=dict(
+ type='BBoxHeadAVA',
+ in_channels=2304,
+ num_classes=81,
+ multilabel=True,
+ dropout_ratio=0.5)),
+ data_preprocessor=dict(
+ type='mmaction.ActionDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ train_cfg=dict(
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssignerAVA',
+ pos_iou_thr=0.9,
+ neg_iou_thr=0.9,
+ min_pos_iou=0.9),
+ sampler=dict(
+ type='RandomSampler',
+ num=32,
+ pos_fraction=1,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=1.0)),
+ test_cfg=dict(rcnn=None))
dataset_type = 'AVADataset'
data_root = 'data/ava/rawframes'
anno_root = 'data/ava/annotations'
-proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
- 'recall_93.9.pkl')
-proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
-
ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
@@ -56,9 +83,17 @@
label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
+proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
+ 'recall_93.9.pkl')
+proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
+
+file_client_args = dict(io_backend='disk')
+# Uncomment and adapt the lines below to read frames from a Ceph (petrel)
+# bucket instead of local disk; as written they would silently override the
+# disk backend above with an internal storage path.
+# file_client_args = dict(
+#     io_backend='petrel',
+#     path_mapping=dict({'data/ava': 's254:s3://openmmlab/datasets/action/ava'}))
train_pipeline = [
dict(type='SampleAVAFrames', clip_len=32, frame_interval=2),
- dict(type='RawFrameDecode'),
+ dict(type='RawFrameDecode', **file_client_args),
dict(type='RandomRescale', scale_range=(256, 320)),
dict(type='RandomCrop', size=256),
dict(type='Flip', flip_ratio=0.5),
@@ -69,7 +104,7 @@
val_pipeline = [
dict(
type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True),
- dict(type='RawFrameDecode'),
+ dict(type='RawFrameDecode', **file_client_args),
dict(type='Resize', scale=(-1, 256)),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
dict(type='PackActionInputs')
diff --git a/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py b/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py
index 02992c654a..4537d25cc7 100644
--- a/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py
+++ b/configs/detection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py
@@ -1,5 +1,75 @@
-_base_ = [('slowfast-acrn_kinetics400-pretrained-r50'
- '_8xb8-8x8x1-cosine-10e_ava21-rgb.py')]
+_base_ = '../../_base_/default_runtime.py'
+
+url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/'
+ 'slowfast_r50_8x8x1_256e_kinetics400_rgb/'
+ 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth')
+
+model = dict(
+ type='FastRCNN',
+ _scope_='mmdet',
+ init_cfg=dict(type='Pretrained', checkpoint=url),
+ backbone=dict(
+ type='mmaction.ResNet3dSlowFast',
+ pretrained=None,
+ resample_rate=4,
+ speed_ratio=4,
+ channel_ratio=8,
+ slow_pathway=dict(
+ type='resnet3d',
+ depth=50,
+ pretrained=None,
+ lateral=True,
+ fusion_kernel=7,
+ conv1_kernel=(1, 7, 7),
+ dilations=(1, 1, 1, 1),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ inflate=(0, 0, 1, 1),
+ spatial_strides=(1, 2, 2, 1)),
+ fast_pathway=dict(
+ type='resnet3d',
+ depth=50,
+ pretrained=None,
+ lateral=False,
+ base_channels=8,
+ conv1_kernel=(5, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ spatial_strides=(1, 2, 2, 1))),
+ roi_head=dict(
+ type='AVARoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor3D',
+ roi_layer_type='RoIAlign',
+ output_size=8,
+ with_temporal_pool=True),
+ shared_head=dict(type='ACRNHead', in_channels=4608, out_channels=2304),
+ bbox_head=dict(
+ type='BBoxHeadAVA',
+ in_channels=2304,
+ num_classes=81,
+ multilabel=True,
+ dropout_ratio=0.5)),
+ data_preprocessor=dict(
+ type='mmaction.ActionDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ train_cfg=dict(
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssignerAVA',
+ pos_iou_thr=0.9,
+ neg_iou_thr=0.9,
+ min_pos_iou=0.9),
+ sampler=dict(
+ type='RandomSampler',
+ num=32,
+ pos_fraction=1,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=1.0)),
+ test_cfg=dict(rcnn=None))
dataset_type = 'AVADataset'
data_root = 'data/ava/rawframes'
@@ -17,9 +87,13 @@
'recall_93.9.pkl')
proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
+file_client_args = dict(io_backend='disk')
+# Uncomment and adapt the lines below to read frames from a Ceph (petrel)
+# bucket instead of local disk; as written they would silently override the
+# disk backend above with an internal storage path.
+# file_client_args = dict(
+#     io_backend='petrel',
+#     path_mapping=dict({'data/ava': 's254:s3://openmmlab/datasets/action/ava'}))
train_pipeline = [
dict(type='SampleAVAFrames', clip_len=32, frame_interval=2),
- dict(type='RawFrameDecode'),
+ dict(type='RawFrameDecode', **file_client_args),
dict(type='RandomRescale', scale_range=(256, 320)),
dict(type='RandomCrop', size=256),
dict(type='Flip', flip_ratio=0.5),
@@ -30,7 +104,7 @@
val_pipeline = [
dict(
type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True),
- dict(type='RawFrameDecode'),
+ dict(type='RawFrameDecode', **file_client_args),
dict(type='Resize', scale=(-1, 256)),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
dict(type='PackActionInputs')
@@ -71,3 +145,30 @@
label_file=label_file,
exclude_file=exclude_file_val)
test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=10, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=0.1,
+ by_epoch=True,
+ begin=0,
+ end=2,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ T_max=8,
+ eta_min=0,
+ by_epoch=True,
+ begin=2,
+ end=10,
+ convert_to_iter_based=True)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00001),
+ clip_grad=dict(max_norm=40, norm_type=2))
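Since this AVA2.2 config no longer inherits from the AVA2.1 file, a quick way to sanity-check the assembled fields is to load it with MMEngine; a minimal sketch, assuming `mmengine` is installed and the command is run from the repository root:

```python
from mmengine.config import Config

cfg = Config.fromfile(
    'configs/detection/acrn/'
    'slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py')

print(cfg.model.type)  # FastRCNN
# The linear warmup ends exactly where cosine annealing begins (epoch 2).
print(cfg.param_scheduler[0]['end'], cfg.param_scheduler[1]['begin'])
print(cfg.optim_wrapper['optimizer']['lr'])  # 0.1
```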
diff --git a/configs/detection/ava/README.md b/configs/detection/ava/README.md
deleted file mode 100644
index 1f6354641b..0000000000
--- a/configs/detection/ava/README.md
+++ /dev/null
@@ -1,125 +0,0 @@
-# AVA
-
-[Ava: A video dataset of spatio-temporally localized atomic visual actions](https://openaccess.thecvf.com/content_cvpr_2018/html/Gu_AVA_A_Video_CVPR_2018_paper.html)
-
-
-
-
-
-
-
-## Abstract
-
-
-
-This paper introduces a video dataset of spatio-temporally localized Atomic Visual Actions (AVA). The AVA dataset densely annotates 80 atomic visual actions in 430 15-minute video clips, where actions are localized in space and time, resulting in 1.58M action labels with multiple labels per person occurring frequently. The key characteristics of our dataset are: (1) the definition of atomic visual actions, rather than composite actions; (2) precise spatio-temporal annotations with possibly multiple annotations for each person; (3) exhaustive annotation of these atomic actions over 15-minute video clips; (4) people temporally linked across consecutive segments; and (5) using movies to gather a varied set of action representations. This departs from existing datasets for spatio-temporal action recognition, which typically provide sparse annotations for composite actions in short video clips. We will release the dataset publicly.
-AVA, with its realistic scene and action complexity, exposes the intrinsic difficulty of action recognition. To benchmark this, we present a novel approach for action localization that builds upon the current state-of-the-art methods, and demonstrates better performance on JHMDB and UCF101-24 categories. While setting a new state of the art on existing datasets, the overall results on AVA are low at 15.6% mAP, underscoring the need for developing new approaches for video understanding.
-
-
-
-
-
-
-
-
-
-```BibTeX
-@inproceedings{feichtenhofer2019slowfast,
- title={Slowfast networks for video recognition},
- author={Feichtenhofer, Christoph and Fan, Haoqi and Malik, Jitendra and He, Kaiming},
- booktitle={Proceedings of the IEEE international conference on computer vision},
- pages={6202--6211},
- year={2019}
-}
-```
-
-## Results and Models
-
-### AVA2.1
-
-| frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | gpu_mem(M) | config | ckpt | log |
-| :---------------------: | :--------: | :--: | :----------------------------------: | :----------: | :---: | :--------: | :---------------------------------: | :-------------------------------: | :------------------------------: |
-| 4x16x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-400 | 20.76 | 8503 | [config](/configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-953ef5fe.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) |
-| 4x16x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 22.77 | 8503 | [config](/configs/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-b3b6d44e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) |
-| 4x16x1 | raw | 8 | SlowOnly ResNet50 (NonLocalEmbedGauss) | Kinetics-400 | 21.49 | 11870 | [config](/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb_20220906-5ae3f91b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.log) |
-| 8x8x1 | raw | 8 | SlowOnly ResNet50 (NonLocalEmbedGauss) | Kinetics-400 | 23.74 | 25375 | [config](/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb_20220906-9760eadb.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.log) |
-| 8x8x1 | raw | 8 | SlowOnly ResNet101 | Kinetics-400 | 24.82 | 23477 | [config](/configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb_20220906-43f16877.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.log) |
-| 4x16x1 | raw | 8 | SlowFast ResNet50 | Kinetics-400 | 24.27 | 18616 | [config](/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-5180ea3c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) |
-| 4x16x1 | raw | 8 | SlowFast ResNet50 (with context) | Kinetics-400 | 25.25 | 18616 | [config](/configs/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb_20220906-5bb4f6f2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.log) |
-| 8x8x1 | raw | 8 | SlowFast ResNet50 | Kinetics-400 | 25.73 | 13802 | [config](/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb_20220906-39133ec7.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.log) |
-
-### AVA2.2
-
-| frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | gpu_mem(M) | config | ckpt | log |
-| :---------------------: | :--------: | :--: | :----------------------------------: | :----------: | :---: | :--------: | :---------------------------------: | :-------------------------------: | :------------------------------: |
-| 8x8x1 | raw | 8 | SlowFast ResNet50 | Kinetics-400 | 25.82 | 10484 | [config](/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-d934a48f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.log) |
-| 8x8x1 | raw | 8 | SlowFast ResNet50 (temporal-max) | Kinetics-400 | 26.32 | 10484 | [config](/configs/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-13a9078e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.log) |
-| 8x8x1 | raw | 8 | SlowFast ResNet50 (temporal-max, focal loss) | Kinetics-400 | 26.58 | 10484 | [config](/configs/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-dd59e26f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.log) |
-
-Note:
-
-1. The **gpus** indicates the number of gpu we used to get the checkpoint.
- According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU,
- e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
-2. **With context** indicates that using both RoI feature and global pooled feature for classification, which leads to around 1% mAP improvement in general.
-
-:::
-
-For more details on data preparation, you can refer to [AVA Data Preparation](/tools/data/ava/README.md).
-
-## Train
-
-You can use the following command to train a model.
-
-```shell
-python tools/train.py ${CONFIG_FILE} [optional arguments]
-```
-
-Example: train the SlowOnly model on AVA in a deterministic option.
-
-```shell
-python tools/train.py configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \
- --cfg-options randomness.seed=0 randomness.deterministic=True
-```
-
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
-
-## Test
-
-You can use the following command to test a model.
-
-```shell
-python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
-```
-
-Example: test the SlowOnly model on AVA and dump the result to a pkl file.
-
-```shell
-python tools/test.py configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \
- checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
-```
-
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
-
-## Citation
-
-
-
-```BibTeX
-@inproceedings{gu2018ava,
- title={Ava: A video dataset of spatio-temporally localized atomic visual actions},
- author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others},
- booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
- pages={6047--6056},
- year={2018}
-}
-```
-
-```BibTeX
-@article{duan2020omni,
- title={Omni-sourced Webly-supervised Learning for Video Recognition},
- author={Duan, Haodong and Zhao, Yue and Xiong, Yuanjun and Liu, Wentao and Lin, Dahua},
- journal={arXiv preprint arXiv:2003.13042},
- year={2020}
-}
-```
diff --git a/configs/detection/ava/metafile.yml b/configs/detection/ava/metafile.yml
deleted file mode 100644
index ec745ad5c4..0000000000
--- a/configs/detection/ava/metafile.yml
+++ /dev/null
@@ -1,227 +0,0 @@
-Collections:
-- Name: AVA
- README: configs/detection/ava/README.md
- Paper:
- URL: https://arxiv.org/abs/1705.08421
- Title: "AVA: A Video Dataset of Spatio-temporally Localized Atomic Visual Actions"
-
-Models:
- - Name: slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb
- Config: configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
- In Collection: AVA
- Metadata:
- Architecture: ResNet50
- Batch Size: 16
- Epochs: 20
- Pretrained: Kinetics-400
- Resolution: short-side 320
- Training Data: AVA v2.1
- Training Resources: 8 GPUs
- Modality: RGB
- Results:
- - Dataset: AVA v2.1
- Task: Action Detection
- Metrics:
- mAP: 20.76
- Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log
- Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-953ef5fe.pth
-
- - Name: slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb
- Config: configs/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
- In Collection: AVA
- Metadata:
- Architecture: ResNet50
- Batch Size: 16
- Epochs: 20
- Pretrained: Kinetics-700
- Resolution: short-side 320
- Training Data: AVA v2.1
- Training Resources: 8 GPUs
- Modality: RGB
- Results:
- - Dataset: AVA v2.1
- Task: Action Detection
- Metrics:
- mAP: 22.77
- Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log
- Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-b3b6d44e.pth
-
- - Name: slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb
- Config: configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py
- In Collection: AVA
- Metadata:
- Architecture: ResNet50
- Batch Size: 16
- Epochs: 20
- Pretrained: Kinetics-400
- Resolution: short-side 320
- Training Data: AVA v2.1
- Training Resources: 8 GPUs
- Modality: RGB
- Results:
- - Dataset: AVA v2.1
- Task: Action Detection
- Metrics:
- mAP: 21.49
- Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.log
- Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb_20220906-5ae3f91b.pth
-
- - Name: slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb
- Config: configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py
- In Collection: AVA
- Metadata:
- Architecture: ResNet50
- Batch Size: 16
- Epochs: 20
- Pretrained: Kinetics-400
- Resolution: short-side 320
- Training Data: AVA v2.1
- Training Resources: 8 GPUs
- Modality: RGB
- Results:
- - Dataset: AVA v2.1
- Task: Action Detection
- Metrics:
- mAP: 23.47
- Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.log
- Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb_20220906-9760eadb.pth
-
- - Name: slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb
- Config: configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py
- In Collection: AVA
- Metadata:
- Architecture: ResNet101
- Batch Size: 16
- Epochs: 20
- Pretrained: Kinetics-400
- Resolution: short-side 320
- Training Data: AVA v2.1
- Training Resources: 8 GPUs
- Modality: RGB
- Results:
- - Dataset: AVA v2.1
- Task: Action Detection
- Metrics:
- mAP: 24.82
- Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.log
- Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb_20220906-43f16877.pth
-
- - Name: slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb
- Config: configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
- In Collection: AVA
- Metadata:
- Architecture: ResNet50
- Batch Size: 16
- Epochs: 20
- Pretrained: Kinetics-400
- Resolution: short-side 320
- Training Data: AVA v2.1
- Training Resources: 8 GPUs
- Modality: RGB
- Results:
- - Dataset: AVA v2.1
- Task: Action Detection
- Metrics:
- mAP: 24.27
- Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log
- Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-5180ea3c.pth
-
- - Name: slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb
- Config: configs/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py
- In Collection: AVA
- Metadata:
- Architecture: ResNet50
- Batch Size: 16
- Epochs: 20
- Pretrained: Kinetics-400
- Resolution: short-side 320
- Training Data: AVA v2.1
- Training Resources: 8 GPUs
- Modality: RGB
- Results:
- - Dataset: AVA v2.1
- Task: Action Detection
- Metrics:
- mAP: 25.25
- Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.log
- Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb_20220906-5bb4f6f2.pth
-
- - Name: slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb
- Config: configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py
- In Collection: AVA
- Metadata:
- Architecture: ResNet50
- Batch Size: 8
- Epochs: 20
- Pretrained: Kinetics-400
- Resolution: short-side 320
- Training Data: AVA v2.1
- Training Resources: 8 GPUs
- Modality: RGB
- Results:
- - Dataset: AVA v2.1
- Task: Action Detection
- Metrics:
- mAP: 25.73
- Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.log
- Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb_20220906-39133ec7.pth
-
- - Name: slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb
- Config: configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py
- In Collection: AVA
- Metadata:
- Architecture: ResNet50
- Batch Size: 6
- Epochs: 20
- Pretrained: Kinetics-400
- Resolution: short-side 320
- Training Data: AVA v2.2
- Training Resources: 8 GPUs
- Modality: RGB
- Results:
- - Dataset: AVA v2.2
- Task: Action Detection
- Metrics:
- mAP: 25.98
- Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.log
- Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-d934a48f.pth
-
- - Name: slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb
- Config: configs/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py
- In Collection: AVA
- Metadata:
- Architecture: ResNet50
- Batch Size: 6
- Epochs: 20
- Pretrained: Kinetics-400
- Resolution: short-side 320
- Training Data: AVA v2.2
- Training Resources: 8 GPUs
- Modality: RGB
- Results:
- - Dataset: AVA v2.2
- Task: Action Detection
- Metrics:
- mAP: 26.38
- Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.log
- Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-13a9078e.pth
-
- - Name: slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb
- Config: configs/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py
- In Collection: AVA
- Metadata:
- Architecture: ResNet50
- Batch Size: 6
- Epochs: 20
- Pretrained: Kinetics-400
- Resolution: short-side 320
- Training Data: AVA v2.2
- Training Resources: 8 GPUs
- Modality: RGB
- Results:
- - Dataset: AVA v2.2
- Task: Action Detection
- Metrics:
- mAP: 26.59
- Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.log
- Weights: https://download.openmmlab.com/mmaction/v1.0/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-dd59e26f.pth
diff --git a/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py b/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py
deleted file mode 100644
index 97e0197a6e..0000000000
--- a/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py
+++ /dev/null
@@ -1,50 +0,0 @@
-_base_ = ['slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py']
-
-model = dict(
- backbone=dict(
- resample_rate=4,
- speed_ratio=4,
- slow_pathway=dict(fusion_kernel=7),
- pretrained=(
- 'https://download.openmmlab.com/mmaction/recognition/slowfast/'
- 'slowfast_r50_8x8x1_256e_kinetics400_rgb/'
- 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth')))
-
-dataset_type = 'AVADataset'
-data_root = 'data/ava/rawframes'
-anno_root = 'data/ava/annotations'
-
-ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
-exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
-label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
-
-proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
- 'recall_93.9.pkl')
-
-train_pipeline = [
- dict(type='SampleAVAFrames', clip_len=32, frame_interval=2),
- dict(type='RawFrameDecode'),
- dict(type='RandomRescale', scale_range=(256, 320)),
- dict(type='RandomCrop', size=256),
- dict(type='Flip', flip_ratio=0.5),
- dict(type='FormatShape', input_format='NCTHW', collapse=True),
- dict(type='PackActionInputs')
-]
-
-train_dataloader = dict(
- batch_size=8,
- num_workers=8,
- persistent_workers=True,
- sampler=dict(type='DefaultSampler', shuffle=True),
- dataset=dict(
- type=dataset_type,
- ann_file=ann_file_train,
- exclude_file=exclude_file_train,
- pipeline=train_pipeline,
- label_file=label_file,
- proposal_file=proposal_file_train,
- data_prefix=dict(img=data_root)))
-
-optim_wrapper = dict(
- optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00001),
- clip_grad=dict(max_norm=40, norm_type=2))
diff --git a/configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py b/configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py
deleted file mode 100644
index 815e61c2fc..0000000000
--- a/configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py
+++ /dev/null
@@ -1,72 +0,0 @@
-_base_ = ['slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py']
-
-model = dict(
- backbone=dict(
- depth=101,
- pretrained=(
- 'https://download.openmmlab.com/mmaction/recognition/slowonly/'
- 'omni/slowonly_r101_without_omni_8x8x1_kinetics400_rgb_'
- '20200926-0c730aef.pth')))
-
-dataset_type = 'AVADataset'
-data_root = 'data/ava/rawframes'
-anno_root = 'data/ava/annotations'
-
-ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
-ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
-
-exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
-exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
-
-label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
-
-proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
- 'recall_93.9.pkl')
-proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
-
-train_pipeline = [
- dict(type='SampleAVAFrames', clip_len=8, frame_interval=8),
- dict(type='RawFrameDecode'),
- dict(type='RandomRescale', scale_range=(256, 320)),
- dict(type='RandomCrop', size=256),
- dict(type='Flip', flip_ratio=0.5),
- dict(type='FormatShape', input_format='NCTHW', collapse=True),
- dict(type='PackActionInputs')
-]
-# The testing is w/o. any cropping / flipping
-val_pipeline = [
- dict(type='SampleAVAFrames', clip_len=8, frame_interval=8, test_mode=True),
- dict(type='RawFrameDecode'),
- dict(type='Resize', scale=(-1, 256)),
- dict(type='FormatShape', input_format='NCTHW', collapse=True),
- dict(type='PackActionInputs')
-]
-
-train_dataloader = dict(
- batch_size=16,
- num_workers=8,
- persistent_workers=True,
- sampler=dict(type='DefaultSampler', shuffle=True),
- dataset=dict(
- type=dataset_type,
- ann_file=ann_file_train,
- exclude_file=exclude_file_train,
- pipeline=train_pipeline,
- label_file=label_file,
- proposal_file=proposal_file_train,
- data_prefix=dict(img=data_root)))
-val_dataloader = dict(
- batch_size=1,
- num_workers=8,
- persistent_workers=True,
- sampler=dict(type='DefaultSampler', shuffle=False),
- dataset=dict(
- type=dataset_type,
- ann_file=ann_file_val,
- exclude_file=exclude_file_val,
- pipeline=val_pipeline,
- label_file=label_file,
- proposal_file=proposal_file_val,
- data_prefix=dict(img=data_root),
- test_mode=True))
-test_dataloader = val_dataloader
diff --git a/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py
deleted file mode 100644
index 43b0fa1a28..0000000000
--- a/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py
+++ /dev/null
@@ -1,16 +0,0 @@
-_base_ = ['slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py']
-
-model = dict(
- backbone=dict(
- pretrained=(
- 'https://download.openmmlab.com/mmaction/recognition/slowonly/'
- 'slowonly_nl_embedded_gaussian_r50_4x16x1_150e_kinetics400_rgb/'
- 'slowonly_nl_embedded_gaussian_r50_4x16x1_150e_kinetics400_rgb_'
- '20210308-0d6e5a69.pth'),
- norm_cfg=dict(type='BN3d', requires_grad=True),
- non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)),
- non_local_cfg=dict(
- sub_sample=True,
- use_scale=True,
- norm_cfg=dict(type='BN3d', requires_grad=True),
- mode='embedded_gaussian')))
diff --git a/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py b/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py
deleted file mode 100644
index a962f10c11..0000000000
--- a/configs/detection/ava/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py
+++ /dev/null
@@ -1,74 +0,0 @@
-_base_ = [
- 'slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py'
-]
-
-model = dict(
- backbone=dict(
- pretrained=(
- 'https://download.openmmlab.com/mmaction/recognition/slowonly/'
- 'slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb/'
- 'slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb_'
- '20210308-e8dd9e82.pth')))
-
-dataset_type = 'AVADataset'
-data_root = 'data/ava/rawframes'
-anno_root = 'data/ava/annotations'
-
-ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
-ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
-
-exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
-exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
-
-label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
-
-proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
- 'recall_93.9.pkl')
-proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
-
-train_pipeline = [
- dict(type='SampleAVAFrames', clip_len=8, frame_interval=8),
- dict(type='RawFrameDecode'),
- dict(type='RandomRescale', scale_range=(256, 320)),
- dict(type='RandomCrop', size=256),
- dict(type='Flip', flip_ratio=0.5),
- dict(type='FormatShape', input_format='NCTHW', collapse=True),
- dict(type='PackActionInputs')
-]
-# The testing is w/o. any cropping / flipping
-val_pipeline = [
- dict(type='SampleAVAFrames', clip_len=8, frame_interval=8, test_mode=True),
- dict(type='RawFrameDecode'),
- dict(type='Resize', scale=(-1, 256)),
- dict(type='FormatShape', input_format='NCTHW', collapse=True),
- dict(type='PackActionInputs')
-]
-
-train_dataloader = dict(
- batch_size=16,
- num_workers=8,
- persistent_workers=True,
- sampler=dict(type='DefaultSampler', shuffle=True),
- dataset=dict(
- type=dataset_type,
- ann_file=ann_file_train,
- exclude_file=exclude_file_train,
- pipeline=train_pipeline,
- label_file=label_file,
- proposal_file=proposal_file_train,
- data_prefix=dict(img=data_root)))
-val_dataloader = dict(
- batch_size=1,
- num_workers=8,
- persistent_workers=True,
- sampler=dict(type='DefaultSampler', shuffle=False),
- dataset=dict(
- type=dataset_type,
- ann_file=ann_file_val,
- exclude_file=exclude_file_val,
- pipeline=val_pipeline,
- label_file=label_file,
- proposal_file=proposal_file_val,
- data_prefix=dict(img=data_root),
- test_mode=True))
-test_dataloader = val_dataloader
diff --git a/configs/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
deleted file mode 100644
index c9e10def96..0000000000
--- a/configs/detection/ava/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
+++ /dev/null
@@ -1,9 +0,0 @@
-_base_ = ['slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py']
-
-model = dict(
- backbone=dict(
- pretrained=(
- 'https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly'
- '/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_'
- 'kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-'
- 'steplr-150e_kinetics700-rgb_20220901-f73b3e89.pth')))
diff --git a/configs/detection/ava_kinetics/README.md b/configs/detection/ava_kinetics/README.md
deleted file mode 100644
index 59ec345c43..0000000000
--- a/configs/detection/ava_kinetics/README.md
+++ /dev/null
@@ -1,103 +0,0 @@
-# AVA
-
-[The AVA-Kinetics Localized Human Actions Video Dataset](https://arxiv.org/abs/2005.00214)
-
-
-
-
-
-
-
-## Abstract
-
-
-
-This paper describes the AVA-Kinetics localized human actions video dataset. The dataset is collected by annotating videos from the Kinetics-700 dataset using the AVA annotation protocol, and extending the original AVA dataset with these new AVA annotated Kinetics clips. The dataset contains over 230k clips annotated with the 80 AVA action classes for each of the humans in key-frames. We describe the annotation process and provide statistics about the new dataset. We also include a baseline evaluation using the Video Action Transformer Network on the AVA-Kinetics dataset, demonstrating improved performance for action classification on the AVA test set.
-
-```BibTeX
-@article{li2020ava,
- title={The ava-kinetics localized human actions video dataset},
- author={Li, Ang and Thotakuri, Meghana and Ross, David A and Carreira, Jo{\~a}o and Vostrikov, Alexander and Zisserman, Andrew},
- journal={arXiv preprint arXiv:2005.00214},
- year={2020}
-}
-```
-
-## Results and Models
-
-### AVA2.2
-
-Currently, we only use the training set of AVA-Kinetics and evaluate on the AVA2.2 validation dataset. The AVA-Kinetics validation dataset will be supported soon.
-
-| frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | config | ckpt | log |
-| :---------------------: | :--------: | :--: | :---------------: | :----------: | :---: | :------------------------------------------: | :-----------------------------------------: | :----------------------------------------: |
-| 4x16x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-400 | 24.53 | [config](/configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb_20221205-33e3ca7c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.log) |
-| 4x16x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 25.87 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb_20221205-a07e8c15.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.log) |
-| 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-400 | 26.10 | [config](/configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-8f8dff3b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) |
-| 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 27.82 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-16a01c37.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) |
-
-### Training with tricks
-
-We conduct ablation studies to show the improvements of training tricks using SlowOnly8x8 pretrained on the Kinetics700 dataset. The baseline is the last raw in [AVA2.2](https://github.com/hukkai/mmaction2/tree/ava-kinetics-exp/configs/detection/ava_kinetics#ava22).
-
-| method | frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | config | ckpt | log |
-| :--------------------: | :---------------------: | :--------: | :--: | :---------------: | :----------: | :---: | :-----------------------------------: | :---------------------------------: | :---------------------------------: |
-| baseline | 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 27.82 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-16a01c37.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) |
-| + context | 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 28.31 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-5d514f8c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.log) |
-| + temporal max pooling | 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 28.48 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-5b5e71eb.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.log) |
-| + nonlinear head | 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 29.83 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-87624265.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.log) |
-| + focal loss | 8x8x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 30.33 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb_20221205-37aa8395.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.log) |
-| + more frames | 16x4x1 | raw | 8 | SlowOnly ResNet50 | Kinetics-700 | 31.29 | [config](/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb_20221205-dd652f81.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.log) |
-
-Note:
-
-The **gpus** indicates the number of gpu we used to get the checkpoint; **+ context** indicates that using both RoI feature and global pooled feature for classification; **+ temporal max pooling** indicates that using max pooling in the temporal dimension for the feature; **nonlinear head** indicates that using a 2-layer mlp instead of a linear classifier.
-
-For more details on data preparation, you can refer to [AVA-Kinetics Data Preparation](/tools/data/ava_kinetics/README.md).
-
-## Train
-
-You can use the following command to train a model.
-
-```shell
-python tools/train.py ${CONFIG_FILE} [optional arguments]
-```
-
-Example: train the SlowOnly model on AVA-Kinetics in a deterministic option.
-
-```shell
-python tools/train.py configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py \
- --cfg-options randomness.seed=0 randomness.deterministic=True
-```
-
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
-
-## Test
-
-You can use the following command to test a model.
-
-```shell
-python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
-```
-
-Example: test the SlowOnly model on AVA-Kinetics and dump the result to a pkl file.
-
-```shell
-python tools/test.py configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py \
- checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
-```
-
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
-
-## Citation
-
-
-
-```BibTeX
-@article{li2020ava,
- title={The ava-kinetics localized human actions video dataset},
- author={Li, Ang and Thotakuri, Meghana and Ross, David A and Carreira, Jo{\~a}o and Vostrikov, Alexander and Zisserman, Andrew},
- journal={arXiv preprint arXiv:2005.00214},
- year={2020}
-}
-```
diff --git a/configs/detection/lfb/README.md b/configs/detection/lfb/README.md
index 1d33a7d7e9..51af1377c8 100644
--- a/configs/detection/lfb/README.md
+++ b/configs/detection/lfb/README.md
@@ -22,7 +22,7 @@ To understand the world, we humans constantly need to relate the present to the
| frame sampling strategy | resolution | gpus | backbone | pretrain | mAP | gpu_mem(M) | config | ckpt | log |
| :---------------------: | :--------: | :--: | :----------------------------------: | :----------: | :---: | :--------: | :---------------------------------: | :-------------------------------: | :------------------------------: |
-| 4x16x1 | raw | 8 | SlowOnly ResNet50 (with Nonlocal LFB) | Kinetics-400 | 24.05 | 8620 | [config](/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb_20220906-4c5b9f25.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.log) |
+| 4x16x1 | raw | 8 | SlowOnly ResNet50 (with Nonlocal LFB) | Kinetics-400 | 24.11 | 8620 | [config](/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb_20220906-4c5b9f25.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.log) |
| 4x16x1 | raw | 8 | SlowOnly ResNet50 (with Max LFB) | Kinetics-400 | 22.15 | 8425 | [config](/configs/detection/lfb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb_20220906-4963135b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-max_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.log) |
Note:
@@ -33,8 +33,7 @@ Note:
2. We use `slowonly_r50_4x16x1` instead of `I3D-R50-NL` in the original paper as the backbone of LFB, but we have achieved the similar improvement: (ours: 20.1 -> 24.05 vs. author: 22.1 -> 25.8).
3. Because the long-term features are randomly sampled in testing, the test accuracy may have some differences.
4. Before train or test lfb, you need to infer feature bank with the [slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py](/configs/detection/lfb/slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py). For more details on infer feature bank, you can refer to [Train](#Train) part.
-5. You can also dowonload long-term feature bank from [AVA_train_val_float32_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float32_lfb.rar) or [AVA_train_val_float16_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float16_lfb.rar), and then put them on `lfb_prefix_path`.
-6. The ROIHead now supports single-label classification (i.e. the network outputs at most
+5. The ROIHead now supports single-label classification (i.e. the network outputs at most
one-label per actor). This can be done by (a) setting multilabel=False during training and
the test_cfg.rcnn.action_thr for testing.
@@ -42,7 +41,7 @@ Note:
### a. Infer long-term feature bank for training
-Before train or test lfb, you need to infer long-term feature bank first.
+Before training or testing LFB, you need to infer the long-term feature bank first. Alternatively, you can download a pre-computed long-term feature bank from [AVA_train_val_float32_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float32_lfb.rar) or [AVA_train_val_float16_lfb](https://download.openmmlab.com/mmaction/detection/lfb/AVA_train_val_float16_lfb.rar) and put it under `lfb_prefix_path`; in that case, you can skip this step.
Specifically, run the test on the training and validation datasets with the config file [slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py](/configs/detection/lfb/slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py) (by default the config only infers the feature bank of the training dataset; set `dataset_mode = 'val'` in the config file to infer the feature bank of the validation dataset), and the shared head [LFBInferHead](/mmaction/models/roi_heads/shared_heads/lfb_infer_head.py) will generate the feature bank.
@@ -52,12 +51,12 @@ You can use the following command to infer feature bank of AVA training and vali
```shell
# set `dataset_mode = 'train'` in slowonly-lfb-infer_r50_ava21-rgb.py
-python tools/test.py slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py \
- checkpoints/YOUR_BASELINE_CHECKPOINT.pth --eval mAP
+python tools/test.py configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py \
+ checkpoints/YOUR_BASELINE_CHECKPOINT.pth
# set `dataset_mode = 'val'` in slowonly-lfb-infer_r50_ava21-rgb.py
-python tools/test.py slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py \
- checkpoints/YOUR_BASELINE_CHECKPOINT.pth --eval mAP
+python tools/test.py configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py \
+ checkpoints/YOUR_BASELINE_CHECKPOINT.pth
```
We use [slowonly_r50_4x16x1 checkpoint](https://download.openmmlab.com/mmaction/detection/ava/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb/slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb_20201217-40061d5f.pth) from [slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb](/configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) to infer feature bank.
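
If you have not downloaded the baseline checkpoint yet, one way to fetch it into the `checkpoints/` directory used by the commands above is sketched below (the target filename is just a placeholder):

```python
# Download the baseline SlowOnly checkpoint used to infer the feature bank.
import os

from torch.hub import download_url_to_file

url = ('https://download.openmmlab.com/mmaction/detection/ava/'
       'slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb/'
       'slowonly_kinetics_pretrained_r50_4x16x1_20e_ava_rgb_20201217-40061d5f.pth')
os.makedirs('checkpoints', exist_ok=True)
download_url_to_file(url, 'checkpoints/YOUR_BASELINE_CHECKPOINT.pth')
```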
@@ -77,7 +76,7 @@ python tools/train.py configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrain
    --seed 0 --deterministic
```
-For more details and optional arguments infos, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details and optional arguments, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -102,7 +101,7 @@ python tools/test.py configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretraine
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/detection/lfb/metafile.yml b/configs/detection/lfb/metafile.yml
index 055032ad18..c1de15768f 100644
--- a/configs/detection/lfb/metafile.yml
+++ b/configs/detection/lfb/metafile.yml
@@ -22,7 +22,7 @@ Models:
- Dataset: AVA v2.1
Task: Action Detection
Metrics:
- mAP: 24.05
+ mAP: 24.11
Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.log
Weights: https://download.openmmlab.com/mmaction/v1.0/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb_20220906-4c5b9f25.pth
diff --git a/configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py b/configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py
new file mode 100644
index 0000000000..278d87c1e1
--- /dev/null
+++ b/configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py
@@ -0,0 +1,114 @@
+# This config is used to generate the long-term feature bank.
+_base_ = '../../_base_/default_runtime.py'
+
+# model settings
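+# The LFBInferHead dumps the extracted features under `lfb_prefix_path` (in half
+# precision, see `use_half_precision` below); `dataset_mode` selects which split
+# the feature bank is generated for.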
+lfb_prefix_path = 'data/ava/lfb_half'
+dataset_mode = 'train' # ['train', 'val', 'test']
+
+url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/'
+ 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-'
+ 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_'
+ 'kinetics400-rgb_20220901-e7b65fad.pth')
+
+model = dict(
+ type='FastRCNN',
+ _scope_='mmdet',
+ init_cfg=dict(type='Pretrained', checkpoint=url),
+ backbone=dict(
+ type='mmaction.ResNet3dSlowOnly',
+ depth=50,
+ pretrained=None,
+ pretrained2d=False,
+ lateral=False,
+ num_stages=4,
+ conv1_kernel=(1, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ spatial_strides=(1, 2, 2, 1)),
+ roi_head=dict(
+ type='AVARoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor3D',
+ roi_layer_type='RoIAlign',
+ output_size=8,
+ with_temporal_pool=True),
+ bbox_head=dict(
+ type='BBoxHeadAVA',
+ in_channels=2048,
+ num_classes=81,
+ multilabel=True,
+ dropout_ratio=0.5),
+ shared_head=dict(
+ type='LFBInferHead',
+ lfb_prefix_path=lfb_prefix_path,
+ dataset_mode=dataset_mode,
+ use_half_precision=True)),
+ data_preprocessor=dict(
+ type='ActionDataPreprocessor',
+ _scope_='mmaction',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ train_cfg=dict(
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssignerAVA',
+ pos_iou_thr=0.9,
+ neg_iou_thr=0.9,
+ min_pos_iou=0.9),
+ sampler=dict(
+ type='RandomSampler',
+ num=32,
+ pos_fraction=1,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=1.0)),
+ test_cfg=dict(rcnn=None))
+
+# dataset settings
+dataset_type = 'AVADataset'
+data_root = 'data/ava/rawframes'
+anno_root = 'data/ava/annotations'
+
+ann_file_infer = f'{anno_root}/ava_{dataset_mode}_v2.1.csv'
+
+exclude_file_infer = (
+ f'{anno_root}/ava_{dataset_mode}_excluded_timestamps_v2.1.csv')
+
+label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
+
+proposal_file_infer = (
+ f'{anno_root}/ava_dense_proposals_{dataset_mode}.FAIR.recall_93.9.pkl')
+
+infer_pipeline = [
+ dict(
+ type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True),
+ dict(type='RawFrameDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='FormatShape', input_format='NCTHW', collapse=True),
+ dict(type='PackActionInputs')
+]
+
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_infer,
+ exclude_file=exclude_file_infer,
+ pipeline=infer_pipeline,
+ label_file=label_file,
+ proposal_file=proposal_file_infer,
+ data_prefix=dict(img=data_root),
+ person_det_score_thr=0.9,
+ test_mode=True))
+
+test_cfg = dict(type='TestLoop')
+test_evaluator = dict(
+ type='AVAMetric',
+ ann_file=ann_file_infer,
+ label_file=label_file,
+ exclude_file=exclude_file_infer,
+ action_thr=0.0)
diff --git a/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py b/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py
index 2da2bd3a7c..9d323ad0e4 100644
--- a/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py
+++ b/configs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py
@@ -1,6 +1,4 @@
-_base_ = [
- '../../_base_/default_runtime.py', '../_base_/models/slowonly_r50.py'
-]
+_base_ = '../../_base_/default_runtime.py'
# model settings
lfb_prefix_path = 'data/ava/lfb_half'
@@ -10,8 +8,39 @@
lfb_channels = 2048
dataset_modes = ('train', 'val')
+url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/'
+ 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-'
+ 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_'
+ 'kinetics400-rgb_20220901-e7b65fad.pth')
+
model = dict(
+ type='FastRCNN',
+ _scope_='mmdet',
+ init_cfg=dict(type='Pretrained', checkpoint=url),
+ backbone=dict(
+ type='mmaction.ResNet3dSlowOnly',
+ depth=50,
+ pretrained=None,
+ pretrained2d=False,
+ lateral=False,
+ num_stages=4,
+ conv1_kernel=(1, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ spatial_strides=(1, 2, 2, 1)),
roi_head=dict(
+ type='AVARoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor3D',
+ roi_layer_type='RoIAlign',
+ output_size=8,
+ with_temporal_pool=True),
+ bbox_head=dict(
+ type='BBoxHeadAVA',
+ in_channels=2560,
+ num_classes=81,
+ multilabel=True,
+ dropout_ratio=0.5),
shared_head=dict(
type='FBOHead',
lfb_cfg=dict(
@@ -31,8 +60,28 @@
num_non_local_layers=2,
st_feat_dropout_ratio=0.2,
lt_feat_dropout_ratio=0.2,
- pre_activate=True)),
- bbox_head=dict(in_channels=2560)))
+ pre_activate=True))),
+ data_preprocessor=dict(
+ type='ActionDataPreprocessor',
+ _scope_='mmaction',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ train_cfg=dict(
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssignerAVA',
+ pos_iou_thr=0.9,
+ neg_iou_thr=0.9,
+ min_pos_iou=0.9),
+ sampler=dict(
+ type='RandomSampler',
+ num=32,
+ pos_fraction=1,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=1.0)),
+ test_cfg=dict(rcnn=None))
dataset_type = 'AVADataset'
data_root = 'data/ava/rawframes'
diff --git a/configs/detection/slowfast/README.md b/configs/detection/slowfast/README.md
new file mode 100644
index 0000000000..f82273adcc
--- /dev/null
+++ b/configs/detection/slowfast/README.md
@@ -0,0 +1,96 @@
+# SlowFast
+
+[Slowfast networks for video recognition](https://openaccess.thecvf.com/content_ICCV_2019/html/Feichtenhofer_SlowFast_Networks_for_Video_Recognition_ICCV_2019_paper.html)
+
+
+
+## Abstract
+
+
+
+We present SlowFast networks for video recognition. Our model involves (i) a Slow pathway, operating at low frame rate, to capture spatial semantics, and (ii) a Fast pathway, operating at high frame rate, to capture motion at fine temporal resolution. The Fast pathway can be made very lightweight by reducing its channel capacity, yet can learn useful temporal information for video recognition. Our models achieve strong performance for both action classification and detection in video, and large improvements are pin-pointed as contributions by our SlowFast concept. We report state-of-the-art accuracy on major video recognition benchmarks, Kinetics, Charades and AVA.
+
+
+
+
+
+
+
+## Results and Models
+
+### AVA2.1
+
+| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log |
+| :---------------------: | :--: | :------------------------------: | :----------: | :---: | :-----------------------------------------: | :---------------------------------------: | :--------------------------------------: |
+| 4x16x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 24.32 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-5180ea3c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) |
+| 4x16x1 | 8 | SlowFast ResNet50 (with context) | Kinetics-400 | 25.34 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb_20220906-5bb4f6f2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.log) |
+| 8x8x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 25.80 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb_20220906-39133ec7.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.log) |
+
+### AVA2.2
+
+| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log |
+| :---------------------: | :--: | :---------------------------------------: | :----------: | :---: | :--------------------------------------: | :------------------------------------: | :-----------------------------------: |
+| 8x8x1 | 8 | SlowFast ResNet50 | Kinetics-400 | 25.90 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-d934a48f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.log) |
+| 8x8x1 | 8 | SlowFast ResNet50 (temporal-max) | Kinetics-400 | 26.41 | [config](/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-13a9078e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.log) |
+| 8x8x1 | 8 | SlowFast ResNet50 (temporal-max, focal loss) | Kinetics-400 | 26.65 | [config](/configs/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-dd59e26f.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.log) |
+
+1. The **gpus** column indicates the number of GPUs we used to obtain the checkpoint. If you want to use a different number of GPUs or videos per GPU, the best way is to set `--auto-scale-lr` when calling `tools/train.py`; this option automatically scales the learning rate according to the ratio between the actual batch size and the original batch size (see the sketch after this list).
+2. **with context** means that both the RoI feature and the global pooled feature are used for classification; **temporal-max** means that max pooling is applied along the temporal dimension of the feature.
+
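+The `--auto-scale-lr` option applies the linear scaling rule. As a rough sketch, using the AVA2.1 8x8x1 config above (`lr=0.1`, base batch size of 8 GPUs x 8 samples per GPU = 64):
+
+```python
+# Sketch of the linear scaling rule applied by `--auto-scale-lr`.
+base_lr = 0.1           # optimizer lr in the config, tuned for base_batch_size
+base_batch_size = 64    # 8 GPUs x 8 samples per GPU
+actual_batch_size = 32  # e.g. 4 GPUs x 8 samples per GPU
+
+scaled_lr = base_lr * actual_batch_size / base_batch_size  # 0.05
+```
+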
+For more details on data preparation, you can refer to [AVA](/tools/data/ava/README.md).
+
+## Train
+
+You can use the following command to train a model.
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+Example: train the SlowFast model on AVA2.1 in a deterministic manner with periodic validation.
+
+```shell
+python tools/train.py configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \
+ --seed 0 --deterministic
+```
+
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
+
+## Test
+
+You can use the following command to test a model.
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+Example: test the SlowFast model on AVA2.1 and dump the result to a pkl file.
+
+```shell
+python tools/test.py configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \
+ checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
+```
+
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
+
+## Citation
+
+```BibTeX
+@inproceedings{feichtenhofer2019slowfast,
+ title={Slowfast networks for video recognition},
+ author={Feichtenhofer, Christoph and Fan, Haoqi and Malik, Jitendra and He, Kaiming},
+ booktitle={ICCV},
+ pages={6202--6211},
+ year={2019}
+}
+```
+
+```BibTeX
+@inproceedings{gu2018ava,
+ title={Ava: A video dataset of spatio-temporally localized atomic visual actions},
+ author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others},
+ booktitle={CVPR},
+ pages={6047--6056},
+ year={2018}
+}
+```
diff --git a/configs/detection/slowfast/metafile.yml b/configs/detection/slowfast/metafile.yml
new file mode 100644
index 0000000000..2ab6c44a45
--- /dev/null
+++ b/configs/detection/slowfast/metafile.yml
@@ -0,0 +1,121 @@
+Collections:
+ - Name: SlowFast
+ README: configs/detection/slowfast/README.md
+ Paper:
+ URL: https://arxiv.org/abs/1812.03982
+ Title: 'SlowFast Networks for Video Recognition'
+
+Models:
+ - Name: slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb
+ Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
+ In Collection: SlowFast
+ Metadata:
+ Architecture: ResNet50
+ Batch Size: 16
+ Epochs: 20
+ Pretrained: Kinetics-400
+ Training Data: AVA v2.1
+ Training Resources: 8 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: AVA v2.1
+ Task: Action Detection
+ Metrics:
+ mAP: 24.32
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-5180ea3c.pth
+
+ - Name: slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb
+ Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py
+ In Collection: SlowFast
+ Metadata:
+ Architecture: ResNet50
+ Batch Size: 16
+ Epochs: 20
+ Pretrained: Kinetics-400
+ Training Data: AVA v2.1
+ Training Resources: 8 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: AVA v2.1
+ Task: Action Detection
+ Metrics:
+ mAP: 25.34
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb_20220906-5bb4f6f2.pth
+
+ - Name: slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb
+ Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py
+ In Collection: SlowFast
+ Metadata:
+ Architecture: ResNet50
+ Batch Size: 8
+ Epochs: 20
+ Pretrained: Kinetics-400
+ Training Data: AVA v2.1
+ Training Resources: 8 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: AVA v2.1
+ Task: Action Detection
+ Metrics:
+ mAP: 25.80
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb_20220906-39133ec7.pth
+
+ - Name: slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb
+ Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py
+ In Collection: SlowFast
+ Metadata:
+ Architecture: ResNet50
+ Batch Size: 6
+ Epochs: 10
+ Pretrained: Kinetics-400
+ Training Data: AVA v2.2
+ Training Resources: 8 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: AVA v2.2
+ Task: Action Detection
+ Metrics:
+ mAP: 25.90
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-d934a48f.pth
+
+ - Name: slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb
+ Config: configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py
+ In Collection: SlowFast
+ Metadata:
+ Architecture: ResNet50
+ Batch Size: 6
+ Epochs: 10
+ Pretrained: Kinetics-400
+ Training Data: AVA v2.2
+ Training Resources: 8 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: AVA v2.2
+ Task: Action Detection
+ Metrics:
+ mAP: 26.41
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-13a9078e.pth
+
+ - Name: slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb
+ Config: configs/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py
+ In Collection: SlowFast
+ Metadata:
+ Architecture: ResNet50
+ Batch Size: 6
+ Epochs: 10
+ Pretrained: Kinetics-400
+ Training Data: AVA v2.2
+ Training Resources: 8 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: AVA v2.2
+ Task: Action Detection
+ Metrics:
+ mAP: 26.65
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb_20220906-dd59e26f.pth
diff --git a/configs/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py
similarity index 100%
rename from configs/detection/ava/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py
rename to configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-context_8xb16-4x16x1-20e_ava21-rgb.py
diff --git a/configs/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py
similarity index 100%
rename from configs/detection/ava/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py
rename to configs/detection/slowfast/slowfast_kinetics400-pretrained-r50-temporal-max_8xb6-8x8x1-cosine-10e_ava22-rgb.py
diff --git a/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
similarity index 52%
rename from configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
rename to configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
index 8b5550aec0..0eb0e501e3 100644
--- a/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
+++ b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
@@ -1,14 +1,16 @@
-_base_ = ['slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py']
+_base_ = '../../_base_/default_runtime.py'
+
+url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/'
+ 'slowfast_r50_4x16x1_256e_kinetics400_rgb/'
+ 'slowfast_r50_4x16x1_256e_kinetics400_rgb_20200704-bcde7ed7.pth')
model = dict(
+ type='FastRCNN',
+ _scope_='mmdet',
+ init_cfg=dict(type='Pretrained', checkpoint=url),
backbone=dict(
- _delete_=True,
- type='ResNet3dSlowFast',
- _scope_='mmaction',
- pretrained=(
- 'https://download.openmmlab.com/mmaction/recognition/slowfast/'
- 'slowfast_r50_4x16x1_256e_kinetics400_rgb/'
- 'slowfast_r50_4x16x1_256e_kinetics400_rgb_20200704-bcde7ed7.pth'),
+ type='mmaction.ResNet3dSlowFast',
+ pretrained=None,
resample_rate=8,
speed_ratio=8,
channel_ratio=8,
@@ -33,7 +35,39 @@
conv1_stride_t=1,
pool1_stride_t=1,
spatial_strides=(1, 2, 2, 1))),
- roi_head=dict(bbox_head=dict(in_channels=2304)))
+ roi_head=dict(
+ type='AVARoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor3D',
+ roi_layer_type='RoIAlign',
+ output_size=8,
+ with_temporal_pool=True),
+ bbox_head=dict(
+ type='BBoxHeadAVA',
+ in_channels=2304,
+ num_classes=81,
+ multilabel=True,
+ dropout_ratio=0.5)),
+ data_preprocessor=dict(
+ type='mmaction.ActionDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ train_cfg=dict(
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssignerAVA',
+ pos_iou_thr=0.9,
+ neg_iou_thr=0.9,
+ min_pos_iou=0.9),
+ sampler=dict(
+ type='RandomSampler',
+ num=32,
+ pos_fraction=1,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=1.0)),
+ test_cfg=dict(rcnn=None))
dataset_type = 'AVADataset'
data_root = 'data/ava/rawframes'
@@ -51,9 +85,10 @@
'recall_93.9.pkl')
proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
+file_client_args = dict(io_backend='disk')
train_pipeline = [
dict(type='SampleAVAFrames', clip_len=32, frame_interval=2),
- dict(type='RawFrameDecode'),
+ dict(type='RawFrameDecode', **file_client_args),
dict(type='RandomRescale', scale_range=(256, 320)),
dict(type='RandomCrop', size=256),
dict(type='Flip', flip_ratio=0.5),
@@ -65,7 +100,7 @@
val_pipeline = [
dict(
type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True),
- dict(type='RawFrameDecode'),
+ dict(type='RawFrameDecode', **file_client_args),
dict(type='Resize', scale=(-1, 256)),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
dict(type='PackActionInputs')
@@ -99,3 +134,36 @@
data_prefix=dict(img=data_root),
test_mode=True))
test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='AVAMetric',
+ ann_file=ann_file_val,
+ label_file=label_file,
+ exclude_file=exclude_file_val)
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+ dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5),
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=20,
+ by_epoch=True,
+ milestones=[10, 15],
+ gamma=0.1)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001),
+ clip_grad=dict(max_norm=40, norm_type=2))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
diff --git a/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py
similarity index 52%
rename from configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py
rename to configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py
index a7f4c09ed1..debeb5c7fd 100644
--- a/configs/detection/ava/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py
+++ b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py
@@ -1,11 +1,74 @@
-_base_ = ['slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py']
+_base_ = '../../_base_/default_runtime.py'
+
+url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/'
+ 'slowfast_r50_8x8x1_256e_kinetics400_rgb/'
+ 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth')
model = dict(
+ type='FastRCNN',
+ _scope_='mmdet',
+ init_cfg=dict(type='Pretrained', checkpoint=url),
backbone=dict(
- pretrained=(
- 'https://download.openmmlab.com/mmaction/recognition/slowfast/'
- 'slowfast_r50_8x8x1_256e_kinetics400_rgb/'
- 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth')))
+ type='mmaction.ResNet3dSlowFast',
+ resample_rate=4,
+ speed_ratio=4,
+ channel_ratio=8,
+ pretrained=None,
+ slow_pathway=dict(
+ type='resnet3d',
+ depth=50,
+ pretrained=None,
+ lateral=True,
+ conv1_kernel=(1, 7, 7),
+ dilations=(1, 1, 1, 1),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ inflate=(0, 0, 1, 1),
+ spatial_strides=(1, 2, 2, 1),
+ fusion_kernel=7),
+ fast_pathway=dict(
+ type='resnet3d',
+ depth=50,
+ pretrained=None,
+ lateral=False,
+ base_channels=8,
+ conv1_kernel=(5, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ spatial_strides=(1, 2, 2, 1))),
+ roi_head=dict(
+ type='AVARoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor3D',
+ roi_layer_type='RoIAlign',
+ output_size=8,
+ with_temporal_pool=True),
+ bbox_head=dict(
+ type='BBoxHeadAVA',
+ in_channels=2304,
+ num_classes=81,
+ multilabel=True,
+ dropout_ratio=0.5)),
+ data_preprocessor=dict(
+ type='mmaction.ActionDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ train_cfg=dict(
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssignerAVA',
+ pos_iou_thr=0.9,
+ neg_iou_thr=0.9,
+ min_pos_iou=0.9),
+ sampler=dict(
+ type='RandomSampler',
+ num=32,
+ pos_fraction=1,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=1.0)),
+ test_cfg=dict(rcnn=None))
dataset_type = 'AVADataset'
data_root = 'data/ava/rawframes'
@@ -23,9 +86,10 @@
'recall_93.9.pkl')
proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
+file_client_args = dict(io_backend='disk')
train_pipeline = [
dict(type='SampleAVAFrames', clip_len=32, frame_interval=2),
- dict(type='RawFrameDecode'),
+ dict(type='RawFrameDecode', **file_client_args),
dict(type='RandomRescale', scale_range=(256, 320)),
dict(type='RandomCrop', size=256),
dict(type='Flip', flip_ratio=0.5),
@@ -36,7 +100,7 @@
val_pipeline = [
dict(
type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True),
- dict(type='RawFrameDecode'),
+ dict(type='RawFrameDecode', **file_client_args),
dict(type='Resize', scale=(-1, 256)),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
dict(type='PackActionInputs')
@@ -80,6 +144,8 @@
train_cfg = dict(
type='EpochBasedTrainLoop', max_epochs=10, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
param_scheduler = [
dict(
@@ -102,3 +168,9 @@
optim_wrapper = dict(
optimizer=dict(type='SGD', lr=0.075, momentum=0.9, weight_decay=0.00001),
clip_grad=dict(max_norm=40, norm_type=2))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (6 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=48)
diff --git a/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py
new file mode 100644
index 0000000000..1e94a10960
--- /dev/null
+++ b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py
@@ -0,0 +1,171 @@
+_base_ = '../../_base_/default_runtime.py'
+
+url = ('https://download.openmmlab.com/mmaction/recognition/slowfast/'
+ 'slowfast_r50_8x8x1_256e_kinetics400_rgb/'
+ 'slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth')
+
+model = dict(
+ type='FastRCNN',
+ _scope_='mmdet',
+ init_cfg=dict(type='Pretrained', checkpoint=url),
+ backbone=dict(
+ type='mmaction.ResNet3dSlowFast',
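+        # With 32-frame clips (see `SampleAVAFrames` below), the slow pathway keeps
+        # every 4th frame (8 frames), while the fast pathway keeps all 32 frames
+        # with 1/8 of the channels.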
+ resample_rate=4,
+ speed_ratio=4,
+ channel_ratio=8,
+ pretrained=None,
+ slow_pathway=dict(
+ type='resnet3d',
+ depth=50,
+ pretrained=None,
+ lateral=True,
+ conv1_kernel=(1, 7, 7),
+ dilations=(1, 1, 1, 1),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ inflate=(0, 0, 1, 1),
+ spatial_strides=(1, 2, 2, 1),
+ fusion_kernel=7),
+ fast_pathway=dict(
+ type='resnet3d',
+ depth=50,
+ pretrained=None,
+ lateral=False,
+ base_channels=8,
+ conv1_kernel=(5, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ spatial_strides=(1, 2, 2, 1))),
+ roi_head=dict(
+ type='AVARoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor3D',
+ roi_layer_type='RoIAlign',
+ output_size=8,
+ with_temporal_pool=True),
+ bbox_head=dict(
+ type='BBoxHeadAVA',
+ in_channels=2304,
+ num_classes=81,
+ multilabel=True,
+ dropout_ratio=0.5)),
+ data_preprocessor=dict(
+ type='mmaction.ActionDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ train_cfg=dict(
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssignerAVA',
+ pos_iou_thr=0.9,
+ neg_iou_thr=0.9,
+ min_pos_iou=0.9),
+ sampler=dict(
+ type='RandomSampler',
+ num=32,
+ pos_fraction=1,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=1.0)),
+ test_cfg=dict(rcnn=None))
+
+dataset_type = 'AVADataset'
+data_root = 'data/ava/rawframes'
+anno_root = 'data/ava/annotations'
+
+ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
+ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
+
+exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
+exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
+
+label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
+
+proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
+ 'recall_93.9.pkl')
+proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
+
+file_client_args = dict(io_backend='disk')
+train_pipeline = [
+ dict(type='SampleAVAFrames', clip_len=32, frame_interval=2),
+ dict(type='RawFrameDecode', **file_client_args),
+ dict(type='RandomRescale', scale_range=(256, 320)),
+ dict(type='RandomCrop', size=256),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW', collapse=True),
+ dict(type='PackActionInputs')
+]
+
+# The testing is done without any cropping / flipping
+val_pipeline = [
+ dict(
+ type='SampleAVAFrames', clip_len=32, frame_interval=2, test_mode=True),
+ dict(type='RawFrameDecode', **file_client_args),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='FormatShape', input_format='NCTHW', collapse=True),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train,
+ exclude_file=exclude_file_train,
+ pipeline=train_pipeline,
+ label_file=label_file,
+ proposal_file=proposal_file_train,
+ data_prefix=dict(img=data_root)))
+
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ exclude_file=exclude_file_val,
+ pipeline=val_pipeline,
+ label_file=label_file,
+ proposal_file=proposal_file_val,
+ data_prefix=dict(img=data_root),
+ test_mode=True))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='AVAMetric',
+ ann_file=ann_file_val,
+ label_file=label_file,
+ exclude_file=exclude_file_val)
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+ dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5),
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=20,
+ by_epoch=True,
+ milestones=[10, 15],
+ gamma=0.1)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.00001),
+ clip_grad=dict(max_norm=40, norm_type=2))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (8 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=64)
diff --git a/configs/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py b/configs/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py
similarity index 100%
rename from configs/detection/ava/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py
rename to configs/detection/slowfast/slowfast_r50-k400-pre-temporal-max-focal-alpha3-gamma1_8xb6-8x8x1-cosine-10e_ava22-rgb.py
diff --git a/configs/detection/slowonly/README.md b/configs/detection/slowonly/README.md
new file mode 100644
index 0000000000..ff0f7bf641
--- /dev/null
+++ b/configs/detection/slowonly/README.md
@@ -0,0 +1,126 @@
+# SlowOnly
+
+[Slowfast networks for video recognition](https://openaccess.thecvf.com/content_ICCV_2019/html/Feichtenhofer_SlowFast_Networks_for_Video_Recognition_ICCV_2019_paper.html)
+
+
+
+## Abstract
+
+
+
+We present SlowFast networks for video recognition. Our model involves (i) a Slow pathway, operating at low frame rate, to capture spatial semantics, and (ii) a Fast pathway, operating at high frame rate, to capture motion at fine temporal resolution. The Fast pathway can be made very lightweight by reducing its channel capacity, yet can learn useful temporal information for video recognition. Our models achieve strong performance for both action classification and detection in video, and large improvements are pin-pointed as contributions by our SlowFast concept. We report state-of-the-art accuracy on major video recognition benchmarks, Kinetics, Charades and AVA.
+
+
+
+
+
+
+
+## Results and Models
+
+### AVA2.1
+
+| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log |
+| :---------------------: | :--: | :------------------------------------: | :----------: | :---: | :---------------------------------------: | :-------------------------------------: | :------------------------------------: |
+| 4x16x1 | 8 | SlowOnly ResNet50 | Kinetics-400 | 20.72 | [config](/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-953ef5fe.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) |
+| 4x16x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 22.77 | [config](/configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-b3b6d44e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log) |
+| 4x16x1 | 8 | SlowOnly ResNet50 (NonLocalEmbedGauss) | Kinetics-400 | 21.55 | [config](/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb_20220906-5ae3f91b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.log) |
+| 8x8x1 | 8 | SlowOnly ResNet50 (NonLocalEmbedGauss) | Kinetics-400 | 23.77 | [config](/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb_20220906-9760eadb.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.log) |
+| 8x8x1 | 8 | SlowOnly ResNet101 | Kinetics-400 | 24.83 | [config](/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb_20220906-43f16877.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.log) |
+
+### AVA2.2 (Trained on AVA-Kinetics)
+
+Currently, we only use the training set of AVA-Kinetics and evaluate on the AVA2.2 validation dataset. The AVA-Kinetics validation dataset will be supported soon.
+
+| frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log |
+| :---------------------: | :--: | :---------------: | :----------: | :---: | :----------------------------------------------: | :--------------------------------------------: | :-------------------------------------------: |
+| 4x16x1 | 8 | SlowOnly ResNet50 | Kinetics-400 | 24.53 | [config](/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb_20221205-33e3ca7c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.log) |
+| 4x16x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 25.87 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb_20221205-a07e8c15.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.log) |
+| 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-400 | 26.10 | [config](/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-8f8dff3b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) |
+| 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 27.82 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-16a01c37.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) |
+
+### AVA2.2 (Trained on AVA-Kinetics with tricks)
+
+We conduct ablation studies to show the improvement brought by each training trick, using SlowOnly 8x8x1 pretrained on the Kinetics-700 dataset. The baseline is the last row in **AVA2.2 (Trained on AVA-Kinetics)**.
+
+| method | frame sampling strategy | gpus | backbone | pretrain | mAP | config | ckpt | log |
+| :--------------------: | :---------------------: | :--: | :---------------: | :----------: | :---: | :--------------------------------------: | :-------------------------------------: | :------------------------------------: |
+| baseline | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 27.82 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-16a01c37.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.log) |
+| + context | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 28.31 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-5d514f8c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.log) |
+| + temporal max pooling | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 28.48 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-5b5e71eb.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.log) |
+| + nonlinear head | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 29.83 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb_20221205-87624265.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.log) |
+| + focal loss | 8x8x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 30.33 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb_20221205-37aa8395.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.log) |
+| + more frames | 16x4x1 | 8 | SlowOnly ResNet50 | Kinetics-700 | 31.29 | [config](/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb_20221205-dd652f81.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.log) |
+
+1. The **gpus** column indicates the number of GPUs we used to obtain the checkpoint. If you want to use a different number of GPUs or videos per GPU, the best way is to set `--auto-scale-lr` when calling `tools/train.py`; this option automatically scales the learning rate according to the ratio between the actual batch size and the original batch size.
+2. **+ context** means that both the RoI feature and the global pooled feature are used for classification; **+ temporal max pooling** means that max pooling is applied along the temporal dimension of the feature; **+ nonlinear head** means that a 2-layer MLP is used instead of a linear classifier (see the sketch after this list).
+
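+The tricks above map to a handful of model options. The snippet below is only a rough, hypothetical sketch of how they might be expressed as overrides; the option names are assumptions inferred from the config names, so treat the linked config files as the authoritative reference:
+
+```python
+# Hypothetical sketch; option names below are assumptions, not verified API.
+model = dict(
+    roi_head=dict(
+        bbox_roi_extractor=dict(with_global=True),  # '+ context': also use the global pooled feature
+        bbox_head=dict(
+            temporal_pool_type='max',  # '+ temporal max pooling'
+            mlp_head=True,             # '+ nonlinear head': 2-layer MLP classifier
+            focal_gamma=1.0,           # '+ focal loss' (gamma/alpha taken from the config name)
+            focal_alpha=3.0)))
+```
+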
+For more details on data preparation, you can refer to
+
+- [AVA](/tools/data/ava/README.md)
+- [AVA-Kinetics](/tools/data/ava_kinetics/README.md)
+
+## Train
+
+You can use the following command to train a model.
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+Example: train the SlowOnly model on AVA2.1 in a deterministic manner with periodic validation.
+
+```shell
+python tools/train.py configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \
+ --seed 0 --deterministic
+```
+
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
+
+## Test
+
+You can use the following command to test a model.
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+Example: test the SlowOnly model on AVA2.1 and dump the result to a pkl file.
+
+```shell
+python tools/test.py configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py \
+ checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
+```
+
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
+
+## Citation
+
+```BibTeX
+@inproceedings{feichtenhofer2019slowfast,
+ title={Slowfast networks for video recognition},
+ author={Feichtenhofer, Christoph and Fan, Haoqi and Malik, Jitendra and He, Kaiming},
+ booktitle={ICCV},
+ pages={6202--6211},
+ year={2019}
+}
+```
+
+```BibTeX
+@inproceedings{gu2018ava,
+ title={Ava: A video dataset of spatio-temporally localized atomic visual actions},
+ author={Gu, Chunhui and Sun, Chen and Ross, David A and Vondrick, Carl and Pantofaru, Caroline and Li, Yeqing and Vijayanarasimhan, Sudheendra and Toderici, George and Ricco, Susanna and Sukthankar, Rahul and others},
+ booktitle={CVPR},
+ pages={6047--6056},
+ year={2018}
+}
+```
+
+```BibTeX
+@article{li2020ava,
+ title={The ava-kinetics localized human actions video dataset},
+ author={Li, Ang and Thotakuri, Meghana and Ross, David A and Carreira, Jo{\~a}o and Vostrikov, Alexander and Zisserman, Andrew},
+ journal={arXiv preprint arXiv:2005.00214},
+ year={2020}
+}
+```
diff --git a/configs/detection/slowonly/metafile.yml b/configs/detection/slowonly/metafile.yml
new file mode 100644
index 0000000000..11ca749351
--- /dev/null
+++ b/configs/detection/slowonly/metafile.yml
@@ -0,0 +1,102 @@
+Collections:
+ - Name: SlowOnly
+ README: configs/detection/slowonly/README.md
+ Paper:
+ URL: https://arxiv.org/abs/1812.03982
+ Title: 'SlowFast Networks for Video Recognition'
+
+Models:
+ - Name: slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb
+ Config: configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
+ In Collection: SlowOnly
+ Metadata:
+ Architecture: ResNet50
+ Batch Size: 16
+ Epochs: 20
+ Pretrained: Kinetics-400
+ Training Data: AVA v2.1
+ Training Resources: 8 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: AVA v2.1
+ Task: Action Detection
+ Metrics:
+ mAP: 20.72
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-953ef5fe.pth
+
+ - Name: slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb
+ Config: configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
+ In Collection: SlowOnly
+ Metadata:
+ Architecture: ResNet50
+ Batch Size: 16
+ Epochs: 20
+ Pretrained: Kinetics-700
+ Training Data: AVA v2.1
+ Training Resources: 8 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: AVA v2.1
+ Task: Action Detection
+ Metrics:
+ mAP: 22.77
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb_20220906-b3b6d44e.pth
+
+ - Name: slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb
+ Config: configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py
+ In Collection: SlowOnly
+ Metadata:
+ Architecture: ResNet50
+ Batch Size: 16
+ Epochs: 20
+ Pretrained: Kinetics-400
+ Training Data: AVA v2.1
+ Training Resources: 8 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: AVA v2.1
+ Task: Action Detection
+ Metrics:
+ mAP: 21.55
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb_20220906-5ae3f91b.pth
+
+ - Name: slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb
+ Config: configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py
+ In Collection: SlowOnly
+ Metadata:
+ Architecture: ResNet50
+ Batch Size: 16
+ Epochs: 20
+ Pretrained: Kinetics-400
+ Training Data: AVA v2.1
+ Training Resources: 8 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: AVA v2.1
+ Task: Action Detection
+ Metrics:
+ mAP: 23.77
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb_20220906-9760eadb.pth
+
+ - Name: slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb
+ Config: configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py
+ In Collection: SlowOnly
+ Metadata:
+ Architecture: ResNet101
+ Batch Size: 16
+ Epochs: 20
+ Pretrained: Kinetics-400
+ Training Data: AVA v2.1
+ Training Resources: 8 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: AVA v2.1
+ Task: Action Detection
+ Metrics:
+ mAP: 24.83
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb_20220906-43f16877.pth
diff --git a/configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py
similarity index 65%
rename from configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py
rename to configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py
index 7407ec6978..fd44f336ac 100644
--- a/configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py
+++ b/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py
@@ -1,6 +1,58 @@
-_base_ = [
- '../../_base_/default_runtime.py', '../_base_/models/slowonly_r50.py'
-]
+_base_ = '../../_base_/default_runtime.py'
+
+url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/'
+ 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-'
+ 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_'
+ 'kinetics400-rgb_20220901-e7b65fad.pth')
+
+model = dict(
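+ # Fast R-CNN action detector on top of a SlowOnly 3D backbone. `_scope_`
+ # resolves the detector from the mmdet registry, so mmaction modules below
+ # are referenced with the explicit 'mmaction.' prefix.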
+ type='FastRCNN',
+ _scope_='mmdet',
+ init_cfg=dict(type='Pretrained', checkpoint=url),
+ backbone=dict(
+ type='mmaction.ResNet3dSlowOnly',
+ depth=50,
+ pretrained=None,
+ pretrained2d=False,
+ lateral=False,
+ num_stages=4,
+ conv1_kernel=(1, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ spatial_strides=(1, 2, 2, 1)),
+ roi_head=dict(
+ type='AVARoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor3D',
+ roi_layer_type='RoIAlign',
+ output_size=8,
+ with_temporal_pool=True),
+ bbox_head=dict(
+ type='BBoxHeadAVA',
+ in_channels=2048,
+ num_classes=81,
+ multilabel=True,
+ dropout_ratio=0.5)),
+ data_preprocessor=dict(
+ type='mmaction.ActionDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ train_cfg=dict(
+ rcnn=dict(
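+ # Boxes with IoU >= 0.9 against a ground truth are assigned as positives;
+ # the sampler then draws up to 32 proposals per frame, preferring positives
+ # (pos_fraction=1) and adding ground-truth boxes as extra proposals.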
+ assigner=dict(
+ type='MaxIoUAssignerAVA',
+ pos_iou_thr=0.9,
+ neg_iou_thr=0.9,
+ min_pos_iou=0.9),
+ sampler=dict(
+ type='RandomSampler',
+ num=32,
+ pos_fraction=1,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=1.0)),
+ test_cfg=dict(rcnn=None))
dataset_type = 'AVAKineticsDataset'
data_root = 'data/ava_kinetics/rawframes'
@@ -18,14 +70,7 @@
'recall_93.9.pkl')
proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
-# file_client_args = dict(
-# io_backend='petrel',
-# path_mapping=dict({
-# 'data/ava_kinetics/rawframes/':
-# 's3://openmmlab/datasets/action/ava/rawframes/'
-# }))
file_client_args = dict(io_backend='disk')
-
train_pipeline = [
dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),
dict(type='RawFrameDecode', **file_client_args),
diff --git a/configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py
similarity index 70%
rename from configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py
rename to configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py
index eb393d3a8c..4af750e8ad 100644
--- a/configs/detection/ava_kinetics/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py
+++ b/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py
@@ -1,13 +1,58 @@
-_base_ = [
- '../../_base_/default_runtime.py', '../_base_/models/slowonly_r50.py'
-]
+_base_ = '../../_base_/default_runtime.py'
url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/'
'slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_kinetics400-'
'rgb/slowonly_imagenet-pretrained-r50_8xb16-8x8x1-steplr-150e_'
'kinetics400-rgb_20220901-df42dc84.pth')
-model = dict(init_cfg=dict(type='Pretrained', checkpoint=url))
+model = dict(
+ type='FastRCNN',
+ _scope_='mmdet',
+ init_cfg=dict(type='Pretrained', checkpoint=url),
+ backbone=dict(
+ type='mmaction.ResNet3dSlowOnly',
+ depth=50,
+ pretrained=None,
+ pretrained2d=False,
+ lateral=False,
+ num_stages=4,
+ conv1_kernel=(1, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ spatial_strides=(1, 2, 2, 1)),
+ roi_head=dict(
+ type='AVARoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor3D',
+ roi_layer_type='RoIAlign',
+ output_size=8,
+ with_temporal_pool=True),
+ bbox_head=dict(
+ type='BBoxHeadAVA',
+ in_channels=2048,
+ num_classes=81,
+ multilabel=True,
+ dropout_ratio=0.5)),
+ data_preprocessor=dict(
+ type='mmaction.ActionDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ train_cfg=dict(
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssignerAVA',
+ pos_iou_thr=0.9,
+ neg_iou_thr=0.9,
+ min_pos_iou=0.9),
+ sampler=dict(
+ type='RandomSampler',
+ num=32,
+ pos_fraction=1,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=1.0)),
+ test_cfg=dict(rcnn=None))
dataset_type = 'AVAKineticsDataset'
data_root = 'data/ava_kinetics/rawframes'
@@ -25,14 +70,7 @@
'recall_93.9.pkl')
proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
-# file_client_args = dict(
-# io_backend='petrel',
-# path_mapping=dict({
-# 'data/ava_kinetics/rawframes/':
-# 's3://openmmlab/datasets/action/ava/rawframes/'
-# }))
file_client_args = dict(io_backend='disk')
-
train_pipeline = [
dict(type='SampleAVAFrames', clip_len=8, frame_interval=8),
dict(type='RawFrameDecode', **file_client_args),
diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py
similarity index 100%
rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py
rename to configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-10e_ava-kinetics-rgb.py
diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py
similarity index 100%
rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py
rename to configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max-nl-head_8xb8-8x8x1-focal-10e_ava-kinetics-rgb.py
diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py
similarity index 100%
rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py
rename to configs/detection/slowonly/slowonly_k700-pre-r50-context-temporal-max_8xb8-8x8x1-10e_ava-kinetics-rgb.py
diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py
similarity index 100%
rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py
rename to configs/detection/slowonly/slowonly_k700-pre-r50-context_8xb8-8x8x1-10e_ava-kinetics-rgb.py
diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py
similarity index 85%
rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py
rename to configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py
index 4d4a3dea6b..a757f731a4 100644
--- a/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py
+++ b/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-16x4x1-10e-tricks_ava-kinetics-rgb.py
@@ -1,14 +1,6 @@
-_base_ = [
- '../../_base_/default_runtime.py', '../_base_/models/slowonly_r50.py'
-]
-
-url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/'
- 'slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-'
- 'rgb/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_'
- 'kinetics700-rgb_20221013-15b93b10.pth')
+_base_ = ['slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py']
model = dict(
- init_cfg=dict(type='Pretrained', checkpoint=url),
roi_head=dict(
bbox_roi_extractor=dict(with_global=True, temporal_pool_mode='max'),
bbox_head=dict(in_channels=4096, mlp_head=True, focal_gamma=1.0)))
@@ -29,14 +21,7 @@
'recall_93.9.pkl')
proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
-# file_client_args = dict(
-# io_backend='petrel',
-# path_mapping=dict({
-# 'data/ava_kinetics/rawframes/':
-# 's3://openmmlab/datasets/action/ava/rawframes/'
-# }))
file_client_args = dict(io_backend='disk')
-
train_pipeline = [
dict(type='SampleAVAFrames', clip_len=16, frame_interval=4),
dict(type='RawFrameDecode', **file_client_args),
diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py
similarity index 100%
rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py
rename to configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py
diff --git a/configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py
similarity index 100%
rename from configs/detection/ava_kinetics/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py
rename to configs/detection/slowonly/slowonly_k700-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py
diff --git a/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py
new file mode 100644
index 0000000000..9bee13a25c
--- /dev/null
+++ b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py
@@ -0,0 +1,151 @@
+_base_ = '../../_base_/default_runtime.py'
+
+url = ('https://download.openmmlab.com/mmaction/recognition/slowonly/'
+ 'omni/slowonly_r101_without_omni_8x8x1_kinetics400_rgb_'
+ '20200926-0c730aef.pth')
+
+model = dict(
+ type='FastRCNN',
+ _scope_='mmdet',
+ init_cfg=dict(type='Pretrained', checkpoint=url),
+ backbone=dict(
+ type='mmaction.ResNet3dSlowOnly',
+ depth=101,
+ pretrained=None,
+ pretrained2d=False,
+ lateral=False,
+ num_stages=4,
+ conv1_kernel=(1, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ spatial_strides=(1, 2, 2, 1)),
+ roi_head=dict(
+ type='AVARoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor3D',
+ roi_layer_type='RoIAlign',
+ output_size=8,
+ with_temporal_pool=True),
+ bbox_head=dict(
+ type='BBoxHeadAVA',
+ in_channels=2048,
+ num_classes=81,
+ multilabel=True,
+ dropout_ratio=0.5)),
+ data_preprocessor=dict(
+ type='mmaction.ActionDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ train_cfg=dict(
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssignerAVA',
+ pos_iou_thr=0.9,
+ neg_iou_thr=0.9,
+ min_pos_iou=0.9),
+ sampler=dict(
+ type='RandomSampler',
+ num=32,
+ pos_fraction=1,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=1.0)),
+ test_cfg=dict(rcnn=None))
+
+dataset_type = 'AVADataset'
+data_root = 'data/ava/rawframes'
+anno_root = 'data/ava/annotations'
+
+ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
+ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
+
+exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
+exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
+
+label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
+
+proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
+ 'recall_93.9.pkl')
+proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
+
+file_client_args = dict(io_backend='disk')
+train_pipeline = [
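+ # Each clip samples 8 frames with a temporal stride of 8 around the keyframe;
+ # `collapse=True` squeezes out the single-clip dimension so each sample is
+ # packed as CTHW for the detector.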
+ dict(type='SampleAVAFrames', clip_len=8, frame_interval=8),
+ dict(type='RawFrameDecode', **file_client_args),
+ dict(type='RandomRescale', scale_range=(256, 320)),
+ dict(type='RandomCrop', size=256),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW', collapse=True),
+ dict(type='PackActionInputs')
+]
+# Testing is performed without any cropping / flipping
+val_pipeline = [
+ dict(type='SampleAVAFrames', clip_len=8, frame_interval=8, test_mode=True),
+ dict(type='RawFrameDecode', **file_client_args),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='FormatShape', input_format='NCTHW', collapse=True),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train,
+ exclude_file=exclude_file_train,
+ pipeline=train_pipeline,
+ label_file=label_file,
+ proposal_file=proposal_file_train,
+ data_prefix=dict(img=data_root)))
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ exclude_file=exclude_file_val,
+ pipeline=val_pipeline,
+ label_file=label_file,
+ proposal_file=proposal_file_val,
+ data_prefix=dict(img=data_root),
+ test_mode=True))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='AVAMetric',
+ ann_file=ann_file_val,
+ label_file=label_file,
+ exclude_file=exclude_file_val)
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
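+ # Linear warmup over the first 5 epochs, then step decay (x0.1) at epochs
+ # 10 and 15 of the 20-epoch schedule.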
+ dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5),
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=20,
+ by_epoch=True,
+ milestones=[10, 15],
+ gamma=0.1)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001),
+ clip_grad=dict(max_norm=40, norm_type=2))
+
+# Default setting for scaling LR automatically
+# - `enable` means whether to enable automatic LR scaling
+#   (disabled by default).
+# - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
diff --git a/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py
new file mode 100644
index 0000000000..cdc8ea8d98
--- /dev/null
+++ b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py
@@ -0,0 +1,160 @@
+_base_ = '../../_base_/default_runtime.py'
+
+url = ('https://download.openmmlab.com/mmaction/recognition/slowonly/'
+ 'slowonly_nl_embedded_gaussian_r50_4x16x1_150e_kinetics400_rgb/'
+ 'slowonly_nl_embedded_gaussian_r50_4x16x1_150e_kinetics400_rgb_'
+ '20210308-0d6e5a69.pth')
+
+model = dict(
+ type='FastRCNN',
+ _scope_='mmdet',
+ init_cfg=dict(type='Pretrained', checkpoint=url),
+ backbone=dict(
+ type='mmaction.ResNet3dSlowOnly',
+ depth=50,
+ pretrained=None,
+ pretrained2d=False,
+ lateral=False,
+ num_stages=4,
+ conv1_kernel=(1, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ spatial_strides=(1, 2, 2, 1),
+ norm_cfg=dict(type='BN3d', requires_grad=True),
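+ # The non_local tuples mark, per residual block in each of the four stages,
+ # where an embedded-Gaussian non-local block is inserted.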
+ non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)),
+ non_local_cfg=dict(
+ sub_sample=True,
+ use_scale=True,
+ norm_cfg=dict(type='BN3d', requires_grad=True),
+ mode='embedded_gaussian')),
+ roi_head=dict(
+ type='AVARoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor3D',
+ roi_layer_type='RoIAlign',
+ output_size=8,
+ with_temporal_pool=True),
+ bbox_head=dict(
+ type='BBoxHeadAVA',
+ in_channels=2048,
+ num_classes=81,
+ multilabel=True,
+ dropout_ratio=0.5)),
+ data_preprocessor=dict(
+ type='mmaction.ActionDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ train_cfg=dict(
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssignerAVA',
+ pos_iou_thr=0.9,
+ neg_iou_thr=0.9,
+ min_pos_iou=0.9),
+ sampler=dict(
+ type='RandomSampler',
+ num=32,
+ pos_fraction=1,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=1.0)),
+ test_cfg=dict(rcnn=None))
+
+dataset_type = 'AVADataset'
+data_root = 'data/ava/rawframes'
+anno_root = 'data/ava/annotations'
+
+ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
+ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
+
+exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
+exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
+
+label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
+
+proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
+ 'recall_93.9.pkl')
+proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
+
+file_client_args = dict(io_backend='disk')
+train_pipeline = [
+ dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),
+ dict(type='RawFrameDecode', **file_client_args),
+ dict(type='RandomRescale', scale_range=(256, 320)),
+ dict(type='RandomCrop', size=256),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW', collapse=True),
+ dict(type='PackActionInputs')
+]
+# Testing is performed without any cropping / flipping
+val_pipeline = [
+ dict(
+ type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True),
+ dict(type='RawFrameDecode', **file_client_args),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='FormatShape', input_format='NCTHW', collapse=True),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train,
+ exclude_file=exclude_file_train,
+ pipeline=train_pipeline,
+ label_file=label_file,
+ proposal_file=proposal_file_train,
+ data_prefix=dict(img=data_root)))
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ exclude_file=exclude_file_val,
+ pipeline=val_pipeline,
+ label_file=label_file,
+ proposal_file=proposal_file_val,
+ data_prefix=dict(img=data_root),
+ test_mode=True))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='AVAMetric',
+ ann_file=ann_file_val,
+ label_file=label_file,
+ exclude_file=exclude_file_val)
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+ dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5),
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=20,
+ by_epoch=True,
+ milestones=[10, 15],
+ gamma=0.1)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001),
+ clip_grad=dict(max_norm=40, norm_type=2))
+
+# Default setting for scaling LR automatically
+# - `enable` means whether to enable automatic LR scaling
+#   (disabled by default).
+# - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
diff --git a/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py
new file mode 100644
index 0000000000..9b6dd00fdb
--- /dev/null
+++ b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py
@@ -0,0 +1,159 @@
+_base_ = '../../_base_/default_runtime.py'
+
+url = ('https://download.openmmlab.com/mmaction/recognition/slowonly/'
+ 'slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb/'
+ 'slowonly_nl_embedded_gaussian_r50_8x8x1_150e_kinetics400_rgb_'
+ '20210308-e8dd9e82.pth')
+
+model = dict(
+ type='FastRCNN',
+ _scope_='mmdet',
+ init_cfg=dict(type='Pretrained', checkpoint=url),
+ backbone=dict(
+ type='mmaction.ResNet3dSlowOnly',
+ depth=50,
+ pretrained=None,
+ pretrained2d=False,
+ lateral=False,
+ num_stages=4,
+ conv1_kernel=(1, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ spatial_strides=(1, 2, 2, 1),
+ norm_cfg=dict(type='BN3d', requires_grad=True),
+ non_local=((0, 0, 0), (1, 0, 1, 0), (1, 0, 1, 0, 1, 0), (0, 0, 0)),
+ non_local_cfg=dict(
+ sub_sample=True,
+ use_scale=True,
+ norm_cfg=dict(type='BN3d', requires_grad=True),
+ mode='embedded_gaussian')),
+ roi_head=dict(
+ type='AVARoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor3D',
+ roi_layer_type='RoIAlign',
+ output_size=8,
+ with_temporal_pool=True),
+ bbox_head=dict(
+ type='BBoxHeadAVA',
+ in_channels=2048,
+ num_classes=81,
+ multilabel=True,
+ dropout_ratio=0.5)),
+ data_preprocessor=dict(
+ type='mmaction.ActionDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ train_cfg=dict(
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssignerAVA',
+ pos_iou_thr=0.9,
+ neg_iou_thr=0.9,
+ min_pos_iou=0.9),
+ sampler=dict(
+ type='RandomSampler',
+ num=32,
+ pos_fraction=1,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=1.0)),
+ test_cfg=dict(rcnn=None))
+
+dataset_type = 'AVADataset'
+data_root = 'data/ava/rawframes'
+anno_root = 'data/ava/annotations'
+
+ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
+ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
+
+exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
+exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
+
+label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
+
+proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
+ 'recall_93.9.pkl')
+proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
+
+file_client_args = dict(io_backend='disk')
+train_pipeline = [
+ dict(type='SampleAVAFrames', clip_len=8, frame_interval=8),
+ dict(type='RawFrameDecode', **file_client_args),
+ dict(type='RandomRescale', scale_range=(256, 320)),
+ dict(type='RandomCrop', size=256),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW', collapse=True),
+ dict(type='PackActionInputs')
+]
+# Testing is performed without any cropping / flipping
+val_pipeline = [
+ dict(type='SampleAVAFrames', clip_len=8, frame_interval=8, test_mode=True),
+ dict(type='RawFrameDecode', **file_client_args),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='FormatShape', input_format='NCTHW', collapse=True),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train,
+ exclude_file=exclude_file_train,
+ pipeline=train_pipeline,
+ label_file=label_file,
+ proposal_file=proposal_file_train,
+ data_prefix=dict(img=data_root)))
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ exclude_file=exclude_file_val,
+ pipeline=val_pipeline,
+ label_file=label_file,
+ proposal_file=proposal_file_val,
+ data_prefix=dict(img=data_root),
+ test_mode=True))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='AVAMetric',
+ ann_file=ann_file_val,
+ label_file=label_file,
+ exclude_file=exclude_file_val)
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+ dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5),
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=20,
+ by_epoch=True,
+ milestones=[10, 15],
+ gamma=0.1)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001),
+ clip_grad=dict(max_norm=40, norm_type=2))
+
+# Default setting for scaling LR automatically
+# - `enable` means whether to enable automatic LR scaling
+#   (disabled by default).
+# - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
diff --git a/configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
similarity index 56%
rename from configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
rename to configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
index ec107941b3..a83408c84a 100644
--- a/configs/detection/ava/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
+++ b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
@@ -1,6 +1,58 @@
-_base_ = [
- '../../_base_/default_runtime.py', '../_base_/models/slowonly_r50.py'
-]
+_base_ = '../../_base_/default_runtime.py'
+
+url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly/'
+ 'slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-'
+ 'rgb/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_'
+ 'kinetics400-rgb_20220901-e7b65fad.pth')
+
+model = dict(
+ type='FastRCNN',
+ _scope_='mmdet',
+ init_cfg=dict(type='Pretrained', checkpoint=url),
+ backbone=dict(
+ type='mmaction.ResNet3dSlowOnly',
+ depth=50,
+ pretrained=None,
+ pretrained2d=False,
+ lateral=False,
+ num_stages=4,
+ conv1_kernel=(1, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ spatial_strides=(1, 2, 2, 1)),
+ roi_head=dict(
+ type='AVARoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor3D',
+ roi_layer_type='RoIAlign',
+ output_size=8,
+ with_temporal_pool=True),
+ bbox_head=dict(
+ type='BBoxHeadAVA',
+ in_channels=2048,
+ num_classes=81,
+ multilabel=True,
+ dropout_ratio=0.5)),
+ data_preprocessor=dict(
+ type='mmaction.ActionDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ train_cfg=dict(
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssignerAVA',
+ pos_iou_thr=0.9,
+ neg_iou_thr=0.9,
+ min_pos_iou=0.9),
+ sampler=dict(
+ type='RandomSampler',
+ num=32,
+ pos_fraction=1,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=1.0)),
+ test_cfg=dict(rcnn=None))
dataset_type = 'AVADataset'
data_root = 'data/ava/rawframes'
@@ -18,9 +70,10 @@
'recall_93.9.pkl')
proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
+file_client_args = dict(io_backend='disk')
train_pipeline = [
dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),
- dict(type='RawFrameDecode'),
+ dict(type='RawFrameDecode', **file_client_args),
dict(type='RandomRescale', scale_range=(256, 320)),
dict(type='RandomCrop', size=256),
dict(type='Flip', flip_ratio=0.5),
@@ -31,7 +84,7 @@
val_pipeline = [
dict(
type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True),
- dict(type='RawFrameDecode'),
+ dict(type='RawFrameDecode', **file_client_args),
dict(type='Resize', scale=(-1, 256)),
dict(type='FormatShape', input_format='NCTHW', collapse=True),
dict(type='PackActionInputs')
@@ -92,3 +145,9 @@
optim_wrapper = dict(
optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001),
clip_grad=dict(max_norm=40, norm_type=2))
+
+# Default setting for scaling LR automatically
+# - `enable` means whether to enable automatic LR scaling
+#   (disabled by default).
+# - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
diff --git a/configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
new file mode 100644
index 0000000000..a68893a015
--- /dev/null
+++ b/configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
@@ -0,0 +1,153 @@
+_base_ = '../../_base_/default_runtime.py'
+
+url = ('https://download.openmmlab.com/mmaction/v1.0/recognition/slowonly'
+ '/slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_'
+ 'kinetics700-rgb/slowonly_imagenet-pretrained-r50_16xb16-4x16x1-'
+ 'steplr-150e_kinetics700-rgb_20220901-f73b3e89.pth')
+
+model = dict(
+ type='FastRCNN',
+ _scope_='mmdet',
+ init_cfg=dict(type='Pretrained', checkpoint=url),
+ backbone=dict(
+ type='mmaction.ResNet3dSlowOnly',
+ depth=50,
+ pretrained=None,
+ pretrained2d=False,
+ lateral=False,
+ num_stages=4,
+ conv1_kernel=(1, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ spatial_strides=(1, 2, 2, 1)),
+ roi_head=dict(
+ type='AVARoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor3D',
+ roi_layer_type='RoIAlign',
+ output_size=8,
+ with_temporal_pool=True),
+ bbox_head=dict(
+ type='BBoxHeadAVA',
+ in_channels=2048,
+ num_classes=81,
+ multilabel=True,
+ dropout_ratio=0.5)),
+ data_preprocessor=dict(
+ type='mmaction.ActionDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ train_cfg=dict(
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssignerAVA',
+ pos_iou_thr=0.9,
+ neg_iou_thr=0.9,
+ min_pos_iou=0.9),
+ sampler=dict(
+ type='RandomSampler',
+ num=32,
+ pos_fraction=1,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=1.0)),
+ test_cfg=dict(rcnn=None))
+
+dataset_type = 'AVADataset'
+data_root = 'data/ava/rawframes'
+anno_root = 'data/ava/annotations'
+
+ann_file_train = f'{anno_root}/ava_train_v2.1.csv'
+ann_file_val = f'{anno_root}/ava_val_v2.1.csv'
+
+exclude_file_train = f'{anno_root}/ava_train_excluded_timestamps_v2.1.csv'
+exclude_file_val = f'{anno_root}/ava_val_excluded_timestamps_v2.1.csv'
+
+label_file = f'{anno_root}/ava_action_list_v2.1_for_activitynet_2018.pbtxt'
+
+proposal_file_train = (f'{anno_root}/ava_dense_proposals_train.FAIR.'
+ 'recall_93.9.pkl')
+proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
+
+file_client_args = dict(io_backend='disk')
+train_pipeline = [
+ dict(type='SampleAVAFrames', clip_len=4, frame_interval=16),
+ dict(type='RawFrameDecode', **file_client_args),
+ dict(type='RandomRescale', scale_range=(256, 320)),
+ dict(type='RandomCrop', size=256),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW', collapse=True),
+ dict(type='PackActionInputs')
+]
+# Testing is performed without any cropping / flipping
+val_pipeline = [
+ dict(
+ type='SampleAVAFrames', clip_len=4, frame_interval=16, test_mode=True),
+ dict(type='RawFrameDecode', **file_client_args),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='FormatShape', input_format='NCTHW', collapse=True),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train,
+ exclude_file=exclude_file_train,
+ pipeline=train_pipeline,
+ label_file=label_file,
+ proposal_file=proposal_file_train,
+ data_prefix=dict(img=data_root)))
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ exclude_file=exclude_file_val,
+ pipeline=val_pipeline,
+ label_file=label_file,
+ proposal_file=proposal_file_val,
+ data_prefix=dict(img=data_root),
+ test_mode=True))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+ type='AVAMetric',
+ ann_file=ann_file_val,
+ label_file=label_file,
+ exclude_file=exclude_file_val)
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+ dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5),
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=20,
+ by_epoch=True,
+ milestones=[10, 15],
+ gamma=0.1)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.00001),
+ clip_grad=dict(max_norm=40, norm_type=2))
+
+# Default setting for scaling LR automatically
+# - `enable` means whether to enable automatic LR scaling
+#   (disabled by default).
+# - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
diff --git a/configs/localization/bmn/README.md b/configs/localization/bmn/README.md
index 834df03ad5..ec2f625a95 100644
--- a/configs/localization/bmn/README.md
+++ b/configs/localization/bmn/README.md
@@ -39,20 +39,20 @@ For more details on data preparation, you can refer to [ActivityNet Data Prepara
Train BMN model on ActivityNet features dataset.
```shell
-bash tools/dist_train.sh configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py 2
+bash tools/dist_train.sh configs/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.py 2
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
Test BMN on ActivityNet feature dataset.
```shell
-python3 tools/test.py configs/localization/bmn/bmn_400x100_2x8_9e_activitynet_feature.py CHECKPOINT.PTH
+python3 tools/test.py configs/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.py CHECKPOINT.PTH
```
-For more details, you can refer to the **Testing** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Testing** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/localization/bsn/README.md b/configs/localization/bsn/README.md
index 62c46f6782..efd2d2c0d0 100644
--- a/configs/localization/bsn/README.md
+++ b/configs/localization/bsn/README.md
@@ -42,7 +42,7 @@ python3 tools/train.py configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activi
After training use the TEM module to generate the probabilities sequence (actionness, starting, and ending) for the training and validation dataset:
```shell
-python tools/test.py configs/localization/bsn/bsn_tem_400x100_1xb16_20e_activitynet_feature.py \
+python tools/test.py configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py \
work_dirs/bsn_400x100_20e_1xb16_activitynet_feature/tem_epoch_20.pth
```
diff --git a/configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py b/configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py
index 28595bb786..285306f976 100644
--- a/configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py
+++ b/configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py
@@ -89,3 +89,5 @@
metric_type='TEM',
dump_config=dict(out=tem_results_dir, output_format='csv'))
val_evaluator = test_evaluator
+
+default_hooks = dict(checkpoint=dict(filename_tmpl='tem_epoch_{}.pth'))
diff --git a/configs/recognition/c2d/README.md b/configs/recognition/c2d/README.md
index 651193dad2..a1b58493f7 100644
--- a/configs/recognition/c2d/README.md
+++ b/configs/recognition/c2d/README.md
@@ -49,7 +49,7 @@ python tools/train.py configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_
--seed 0 --deterministic
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -66,7 +66,7 @@ python tools/test.py configs/recognition/c2d/c2d_r50-in1k-pre_8xb32-8x8x1-100e_k
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition/c3d/README.md b/configs/recognition/c3d/README.md
index 958119f048..9e2af4229e 100644
--- a/configs/recognition/c3d/README.md
+++ b/configs/recognition/c3d/README.md
@@ -44,7 +44,7 @@ python tools/train.py configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1
--seed=0 --deterministic
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -61,7 +61,7 @@ python tools/test.py configs/recognition/c3d_sports1m-pretrained_8xb30-16x1x1-45
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition/csn/README.md b/configs/recognition/csn/README.md
index 77c3aaf900..b09e365829 100644
--- a/configs/recognition/csn/README.md
+++ b/configs/recognition/csn/README.md
@@ -52,7 +52,7 @@ python tools/train.py configs/recognition/csn/ircsn_ig65m-pretrained-r152_8xb12-
--seed=0 --deterministic
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -69,7 +69,7 @@ python tools/test.py configs/recognition/csn/ircsn_ig65m-pretrained-r152_8xb12-3
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition/i3d/README.md b/configs/recognition/i3d/README.md
index e181eaf195..a6e0aebccd 100644
--- a/configs/recognition/i3d/README.md
+++ b/configs/recognition/i3d/README.md
@@ -51,7 +51,7 @@ python tools/train.py configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-3
--seed=0 --deterministic
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -68,7 +68,7 @@ python tools/test.py configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition/mvit/README.md b/configs/recognition/mvit/README.md
index 15f8723615..33527c8408 100644
--- a/configs/recognition/mvit/README.md
+++ b/configs/recognition/mvit/README.md
@@ -92,7 +92,7 @@ python tools/test.py configs/recognition/mvit/mvit-small-p244_16x4x1_kinetics400
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition/omnisource/README.md b/configs/recognition/omnisource/README.md
index 64acf52c35..f3397d3bb1 100644
--- a/configs/recognition/omnisource/README.md
+++ b/configs/recognition/omnisource/README.md
@@ -47,7 +47,7 @@ python tools/train.py configs/recognition/omnisource/slowonly_r50_8xb16-8x8x1-25
We found that the training of this Omnisource model could crash for unknown reasons. If this happens, you can resume training by adding the `--cfg-options resume=True` to the training script.
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -64,7 +64,7 @@ python tools/test.py configs/recognition/omnisource/slowonly_r50_8xb16-8x8x1-256
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition/r2plus1d/README.md b/configs/recognition/r2plus1d/README.md
index 29a619e696..d9e216f41a 100644
--- a/configs/recognition/r2plus1d/README.md
+++ b/configs/recognition/r2plus1d/README.md
@@ -45,7 +45,7 @@ python tools/train.py configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_
--seed=0 --deterministic
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -62,7 +62,7 @@ python tools/test.py configs/recognition/r2plus1d/r2plus1d_r34_8xb8-8x8x1-180e_k
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition/slowfast/README.md b/configs/recognition/slowfast/README.md
index 3bf1666152..0cd2ccd8d3 100644
--- a/configs/recognition/slowfast/README.md
+++ b/configs/recognition/slowfast/README.md
@@ -48,7 +48,7 @@ python tools/train.py configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e
--seed=0 --deterministic
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -65,7 +65,7 @@ python tools/test.py configs/recognition/slowfast/slowfast_r50_8xb8-4x16x1-256e_
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition/slowonly/README.md b/configs/recognition/slowonly/README.md
index bf5ce3781d..78a3e043e3 100644
--- a/configs/recognition/slowonly/README.md
+++ b/configs/recognition/slowonly/README.md
@@ -57,7 +57,7 @@ python tools/train.py configs/recognition/slowonly/slowonly_r50_8xb16-4x16x1-256
--seed=0 --deterministic
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -74,7 +74,7 @@ python tools/test.py configs/recognition/slowonly/slowonly_r50_8xb16-4x16x1-256e
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition/swin/README.md b/configs/recognition/swin/README.md
index 1e6074c4a9..1156c4a679 100644
--- a/configs/recognition/swin/README.md
+++ b/configs/recognition/swin/README.md
@@ -55,7 +55,7 @@ python tools/train.py configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8
--seed=0 --deterministic
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -72,7 +72,7 @@ python tools/test.py configs/recognition/swin/swin-tiny-p244-w877_in1k-pre_8xb8-
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition/tanet/README.md b/configs/recognition/tanet/README.md
index 1a67a40aa0..a72a7bde4f 100644
--- a/configs/recognition/tanet/README.md
+++ b/configs/recognition/tanet/README.md
@@ -55,7 +55,7 @@ python tools/train.py configs/recognition/tanet/tanet_imagenet-pretrained-r50_8x
--seed=0 --deterministic
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -72,7 +72,7 @@ python tools/test.py configs/recognition/tanet/tanet_imagenet-pretrained-r50_8xb
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition/timesformer/README.md b/configs/recognition/timesformer/README.md
index df197e0ba9..6d8e148bd8 100644
--- a/configs/recognition/timesformer/README.md
+++ b/configs/recognition/timesformer/README.md
@@ -47,7 +47,7 @@ python tools/train.py configs/recognition/timesformer/timesformer_divST_8xb8-8x3
--seed=0 --deterministic
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -64,7 +64,7 @@ python tools/test.py configs/recognition/timesformer/timesformer_divST_8xb8-8x32
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition/tin/README.md b/configs/recognition/tin/README.md
index 17a30d7b03..abadd02f4f 100644
--- a/configs/recognition/tin/README.md
+++ b/configs/recognition/tin/README.md
@@ -67,7 +67,7 @@ python tools/train.py configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1
--work-dir work_dirs/tin_imagenet-pretrained-r50_8xb6-1x1x8-40e_sthv1-rgb randomness.seed=0 randomness.deterministic=True
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -84,7 +84,7 @@ python tools/test.py configs/recognition/tin/tin_imagenet-pretrained-r50_8xb6-1x
checkpoints/SOME_CHECKPOINT.pth --dump result.json
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition/tpn/README.md b/configs/recognition/tpn/README.md
index 972dbcbc7b..cb1af4b6b2 100644
--- a/configs/recognition/tpn/README.md
+++ b/configs/recognition/tpn/README.md
@@ -29,7 +29,7 @@ Visual tempo characterizes the dynamics and the temporal scale of an action. Mod
| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | reference top1 acc | reference top5 acc | testing protocol | inference time(video/s) | gpu_mem(M) | config | ckpt | log |
| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :----------------: | :----------------: | :--------------: | :---------------------: | :--------: | :---------------: | :-------------: | :------------: |
-| 1x1x8 | height 100 | 8x6 | ResNet50 | TSM | 48.98 | 78.91 | x | x | 8 clips x 3 crop | x | 8828 | [config](/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb_20220913-d2f5c300.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.log) |
+| 1x1x8 | height 100 | 8x6 | ResNet50 | TSM | 51.87 | 79.67 | x | x | 8 clips x 3 crop | x | 8828 | [config](/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb_20230221-940a3615.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.log) |
:::{note}
@@ -58,7 +58,7 @@ python tools/train.py configs/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_k
--work-dir work_dirs/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb [--validate --seed 0 --deterministic]
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -75,7 +75,7 @@ python tools/test.py configs/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_ki
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition/tpn/metafile.yml b/configs/recognition/tpn/metafile.yml
index 702da581e0..ce953f2e89 100644
--- a/configs/recognition/tpn/metafile.yml
+++ b/configs/recognition/tpn/metafile.yml
@@ -66,8 +66,8 @@ Models:
Results:
- Dataset: SthV1
Metrics:
- Top 1 Accuracy: 48.98
- Top 5 Accuracy: 78.91
+ Top 1 Accuracy: 51.87
+ Top 5 Accuracy: 79.67
Task: Action Recognition
Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.log
- Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb_20220913-d2f5c300.pth
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb_20230221-940a3615.pth
diff --git a/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py b/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py
index d833687d6a..b614d725f7 100644
--- a/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py
+++ b/configs/recognition/tpn/tpn-tsm_imagenet-pretrained-r50_8xb8-1x1x8-150e_sthv1-rgb.py
@@ -8,12 +8,14 @@
ann_file_train = 'data/sthv1/sthv1_train_list_rawframes.txt'
ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt'
ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt'
+
+sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52}
train_pipeline = [
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='RawFrameDecode'),
dict(type='RandomResizedCrop'),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
- dict(type='Flip', flip_ratio=0.5),
+ dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv1_flip_label_map),
dict(type='ColorJitter'),
dict(type='FormatShape', input_format='NCHW'),
dict(type='PackActionInputs')
diff --git a/configs/recognition/trn/README.md b/configs/recognition/trn/README.md
index 875207dd43..323398acb4 100644
--- a/configs/recognition/trn/README.md
+++ b/configs/recognition/trn/README.md
@@ -52,7 +52,7 @@ python tools/train.py configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-
--seed=0 --deterministic
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -69,7 +69,7 @@ python tools/test.py configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition/tsm/README.md b/configs/recognition/tsm/README.md
index ca490117c3..97c1b33e34 100644
--- a/configs/recognition/tsm/README.md
+++ b/configs/recognition/tsm/README.md
@@ -32,11 +32,11 @@ The explosive growth in video streaming gives rise to challenges on performing v
### Something-something V2
-| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
-| :---------------------: | :--------: | :--: | :-------: | :------: | :------: | :------: | :----------------: | :----: | :----: | :---------------------------------: | :-------------------------------: | :------------------------------: |
-| 1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 60.20 | 86.13 | 8 clips x 10 crop | 32.88G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20221122-446d261a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log) |
-| 1x1x16 | 224x224 | 8 | ResNet50 | ImageNet | 62.46 | 87.75 | 16 clips x 10 crop | 65.75G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20221122-b1fb8264.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.log) |
-| 1x1x8 | 224x224 | 8 | ResNet101 | ImageNet | 60.49 | 85.99 | 8 clips x 10 crop | 62.66G | 42.86M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20221122-cb2cc64e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.log) |
+| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
+| :---------------------: | :--------: | :--: | :-------: | :------: | :------: | :------: | :---------------: | :----: | :----: | :---------------------------------: | :-------------------------------: | :-------------------------------: |
+| 1x1x8 | 224x224 | 8 | ResNet50 | ImageNet | 62.72 | 87.70 | 8 clips x 3 crop | 32.88G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20230317-be0fc26e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log) |
+| 1x1x16 | 224x224 | 8 | ResNet50 | ImageNet | 64.16 | 88.61 | 16 clips x 3 crop | 65.75G | 23.87M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20230317-ec6696ad.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.log) |
+| 1x1x8 | 224x224 | 8 | ResNet101 | ImageNet | 63.70 | 88.28 | 8 clips x 3 crop | 62.66G | 42.86M | [config](/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20230320-efcc0d1b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.log) |
1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`; this parameter auto-scales the learning rate according to the actual batch size and the original batch size.
2. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available.
@@ -58,7 +58,7 @@ python tools/train.py configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-
--seed=0 --deterministic
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -75,7 +75,7 @@ python tools/test.py configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition/tsm/metafile.yml b/configs/recognition/tsm/metafile.yml
index 5adafb069f..64d37461d4 100644
--- a/configs/recognition/tsm/metafile.yml
+++ b/configs/recognition/tsm/metafile.yml
@@ -178,17 +178,17 @@ Models:
Parameters: 23.87M
Pretrained: ImageNet
Resolution: 224x224
- Training Data: Kinetics-400
+ Training Data: SthV2
Training Resources: 8 GPUs
Modality: RGB
Results:
- - Dataset: Kinetics-400
+ - Dataset: SthV2
Task: Action Recognition
Metrics:
- Top 1 Accuracy: 60.20
- Top 5 Accuracy: 86.13
+ Top 1 Accuracy: 62.72
+ Top 5 Accuracy: 87.70
Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log
- Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20221122-446d261a.pth
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20230317-be0fc26e.pth
- Name: tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb
Config: configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py
@@ -196,22 +196,22 @@ Models:
Metadata:
Architecture: ResNet50
Batch Size: 16
- Epochs: 100
+ Epochs: 50
FLOPs: 65.75G
Parameters: 23.87M
Pretrained: ImageNet
Resolution: 224x224
- Training Data: Kinetics-400
+ Training Data: SthV2
Training Resources: 8 GPUs
Modality: RGB
Results:
- - Dataset: Kinetics-400
+ - Dataset: SthV2
Task: Action Recognition
Metrics:
- Top 1 Accuracy: 62.46
- Top 5 Accuracy: 87.75
+ Top 1 Accuracy: 64.16
+ Top 5 Accuracy: 88.61
Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.log
- Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20221122-b1fb8264.pth
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb_20230317-ec6696ad.pth
- Name: tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb
Config: configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py
@@ -219,19 +219,19 @@ Models:
Metadata:
Architecture: ResNet101
Batch Size: 16
- Epochs: 100
+ Epochs: 50
FLOPs: 62.66G
Parameters: 42.86M
Pretrained: ImageNet
Resolution: 224x224
- Training Data: Kinetics-400
+ Training Data: SthV2
Training Resources: 8 GPUs
Modality: RGB
Results:
- - Dataset: Kinetics-400
+ - Dataset: SthV2
Task: Action Recognition
Metrics:
- Top 1 Accuracy: 60.49
- Top 5 Accuracy: 85.99
+ Top 1 Accuracy: 63.70
+ Top 5 Accuracy: 88.28
Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.log
- Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20221122-cb2cc64e.pth
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb_20230320-efcc0d1b.pth
diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py
new file mode 100644
index 0000000000..32c276647f
--- /dev/null
+++ b/configs/recognition/tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py
@@ -0,0 +1,125 @@
+_base_ = [
+ '../../_base_/models/tsm_mobilenet_v2.py',
+ '../../_base_/default_runtime.py'
+]
+
+# dataset settings
+dataset_type = 'VideoDataset'
+data_root = 'data/kinetics400/videos_train'
+data_root_val = 'data/kinetics400/videos_val'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
+
+file_client_args = dict(io_backend='disk')
+
+train_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(
+ type='MultiScaleCrop',
+ input_size=224,
+ scales=(1, 0.875, 0.75, 0.66),
+ random_crop=False,
+ max_wh_scale_gap=1,
+ num_fixed_crops=13),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCHW'),
+ dict(type='PackActionInputs')
+]
+val_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='SampleFrames',
+ clip_len=1,
+ frame_interval=1,
+ num_clips=8,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='CenterCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCHW'),
+ dict(type='PackActionInputs')
+]
+test_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='SampleFrames',
+ clip_len=1,
+ frame_interval=1,
+ num_clips=8,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='TenCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCHW'),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train,
+ data_prefix=dict(video=data_root),
+ pipeline=train_pipeline))
+val_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True))
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=test_pipeline,
+ test_mode=True))
+
+val_evaluator = dict(type='AccMetric')
+test_evaluator = val_evaluator
+
+default_hooks = dict(checkpoint=dict(interval=3, max_keep_ckpts=3))
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=50, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+ dict(type='LinearLR', start_factor=0.1, by_epoch=True, begin=0, end=5),
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=50,
+ by_epoch=True,
+ milestones=[25, 45],
+ gamma=0.1)
+]
+
+optim_wrapper = dict(
+ constructor='TSMOptimWrapperConstructor',
+ paramwise_cfg=dict(fc_lr5=True),
+ optimizer=dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001),
+ clip_grad=dict(max_norm=20, norm_type=2))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
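The schedule in this new config follows the usual TSM recipe: 5 epochs of linear warmup starting at 10% of the base LR, then step decay by 10x at epochs 25 and 45. A rough sketch of the resulting per-epoch learning rate, assuming the two schedulers compose as described (this mimics, rather than reuses, the MMEngine `LinearLR`/`MultiStepLR` implementations):

```python
# Rough sketch of the schedule defined above: base lr = 0.02,
# LinearLR warmup over the first 5 epochs from factor 0.1,
# MultiStepLR decay with gamma=0.1 at epochs 25 and 45.
base_lr = 0.02

def lr_at_epoch(epoch):
    # Warmup: factor ramps linearly from 0.1 to 1.0 across epochs [0, 5)
    factor = 0.1 + 0.9 * epoch / 5 if epoch < 5 else 1.0
    # Step decay: multiply by 0.1 for each milestone already passed
    for milestone in (25, 45):
        if epoch >= milestone:
            factor *= 0.1
    return base_lr * factor

for e in (0, 4, 5, 24, 25, 45, 49):
    print(e, round(lr_at_epoch(e), 6))  # 0.002 ... 0.02 ... 0.002 ... 0.0002
```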
diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py
index 9429730700..7cb4b48ac7 100644
--- a/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py
+++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r101_8xb16-1x1x8-50e_sthv2-rgb.py
@@ -1,6 +1,6 @@
_base_ = ['tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py']
# model settings
-r101_checkpoint = 'https://download.pytorch.org/models/resnet101-cd907fc2.pth'
+r101_checkpoint = 'torchvision://resnet101'
model = dict(backbone=dict(pretrained=r101_checkpoint, depth=101))
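`torchvision://resnet101` is the MMEngine/MMCV-style prefix for loading torchvision's ImageNet weights by architecture name instead of hard-coding a download URL. For illustration, a hypothetical derived config using the same `_base_` + override pattern (the r152 variant below is made up, not part of this PR):

```python
# Hypothetical config sketch: inherit everything from the r50 SthV2 config
# and swap only the backbone, mirroring the r101 override shown above.
_base_ = ['tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py']

# 'torchvision://<arch>' resolves to torchvision's ImageNet-pretrained weights.
r152_checkpoint = 'torchvision://resnet152'  # illustrative only
model = dict(backbone=dict(pretrained=r152_checkpoint, depth=152))
```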
diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py
index 691e39c2b2..36b1eefcf0 100644
--- a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py
+++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x16-50e_sthv2-rgb.py
@@ -4,6 +4,7 @@
file_client_args = dict(io_backend='disk')
+sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166}
train_pipeline = [
dict(type='DecordInit', **file_client_args),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16),
@@ -17,7 +18,7 @@
max_wh_scale_gap=1,
num_fixed_crops=13),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
- dict(type='Flip', flip_ratio=0.5),
+ dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map),
dict(type='FormatShape', input_format='NCHW'),
dict(type='PackActionInputs')
]
@@ -46,7 +47,7 @@
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
- dict(type='TenCrop', crop_size=224),
+ dict(type='ThreeCrop', crop_size=256),
dict(type='FormatShape', input_format='NCHW'),
dict(type='PackActionInputs')
]
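The test pipeline here moves from `TenCrop(224)` (four corner crops plus a center crop, each also horizontally flipped, i.e. 10 views per clip) to `ThreeCrop(256)` (three square crops along the longer side, i.e. 3 views per clip), which is why the tables above now report "16 clips x 3 crop". A small sketch of the assumed ThreeCrop geometry:

```python
# Minimal sketch (assumed geometry) of ThreeCrop: three square crops spaced
# evenly along the longer image side, versus TenCrop's 10 views per clip.
def three_crop_offsets(img_w, img_h, crop_size):
    """Return (x, y) offsets of the three crops along the longer side."""
    if img_w >= img_h:
        step = (img_w - crop_size) // 2
        return [(0, 0), (step, 0), (2 * step, 0)]
    step = (img_h - crop_size) // 2
    return [(0, 0), (0, step), (0, 2 * step)]

# After Resize(scale=(-1, 256)), a 340x256 frame with crop_size=256 yields:
print(three_crop_offsets(340, 256, 256))  # [(0, 0), (42, 0), (84, 0)]
```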
diff --git a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py
index ba9c393593..8248bcb02b 100644
--- a/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py
+++ b/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py
@@ -11,6 +11,7 @@
file_client_args = dict(io_backend='disk')
+sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166}
train_pipeline = [
dict(type='DecordInit', **file_client_args),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
@@ -24,7 +25,7 @@
max_wh_scale_gap=1,
num_fixed_crops=13),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
- dict(type='Flip', flip_ratio=0.5),
+ dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map),
dict(type='FormatShape', input_format='NCHW'),
dict(type='PackActionInputs')
]
@@ -53,7 +54,7 @@
twice_sample=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
- dict(type='TenCrop', crop_size=224),
+ dict(type='ThreeCrop', crop_size=256),
dict(type='FormatShape', input_format='NCHW'),
dict(type='PackActionInputs')
]
diff --git a/configs/recognition/tsn/README.md b/configs/recognition/tsn/README.md
index d34d1ab433..61a65ace30 100644
--- a/configs/recognition/tsn/README.md
+++ b/configs/recognition/tsn/README.md
@@ -32,8 +32,8 @@ Deep convolutional networks have achieved great success for visual recognition i
| frame sampling strategy | scheduler | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
| :---------------------: | :-------: | :--------: | :--: | :------: | :------: | :------: | :------: | :----------------: | :----: | :----: | :------------------------------: | -----------------------------: | -----------------------------: |
-| 1x1x8 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 34.85 | 66.37 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb_20221122-ad2dbb37.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.log) |
-| 1x1x16 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 36.55 | 68.00 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb_20221122-ee13c8e2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.log) |
+| 1x1x8 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 35.51 | 67.09 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb_20230313-06ad7d03.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.log) |
+| 1x1x16 | MultiStep | 224x224 | 8 | ResNet50 | ImageNet | 36.91 | 68.77 | 25 clips x 10 crop | 102.7G | 24.33M | [config](/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb_20230221-85bcc1c3.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.log) |
### Using backbones from 3rd-party in TSN
@@ -49,7 +49,7 @@ It's possible and convenient to use a 3rd-party backbone for TSN under the frame
| 1x1x3 | MultiStep | 224x224 | 8 | DenseNet161 | ImageNet | 72.07 | 90.15 | 25 clips x 10 crop | 194.6G | 27.36M | [config](/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb_20220906-5f4c0daf.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-dense161_8xb32-1x1x3-100e_kinetics400-rgb.log) |
| 1x1x3 | MultiStep | 224x224 | 8 | Swin Transformer | ImageNet | 77.03 | 92.61 | 25 clips x 10 crop | 386.7G | 87.15M | [config](/configs/recognition/tsn/custom_backbones/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb_20220906-65ed814e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-swin-transformer_8xb32-1x1x3-100e_kinetics400-rgb.log) |
-1. Note that some backbones in TIMM are not supported due to multiple reasons. Please refer to to [PR #880](https://github.com/open-mmlab/mmaction2/pull/880) for details.
+1. Note that some backbones in TIMM are not supported due to multiple reasons. Please refer to [PR #880](https://github.com/open-mmlab/mmaction2/pull/880) for details.
2. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`; this parameter auto-scales the learning rate according to the actual batch size and the original batch size.
3. The validation set of Kinetics400 we used consists of 19796 videos. These videos are available at [Kinetics400-Validation](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EbXw2WX94J1Hunyt3MWNDJUBz-nHvQYhO9pvKqm6g39PMA?e=a9QldB). The corresponding [data list](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_val_list.txt) (each line is of the format 'video_id, num_frames, label_index') and the [label map](https://download.openmmlab.com/mmaction/dataset/k400_val/kinetics_class2ind.txt) are also available.
@@ -73,7 +73,7 @@ python tools/train.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-
--seed=0 --deterministic
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -90,7 +90,7 @@ python tools/test.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition/tsn/metafile.yml b/configs/recognition/tsn/metafile.yml
index e618ed71cc..37943e673b 100644
--- a/configs/recognition/tsn/metafile.yml
+++ b/configs/recognition/tsn/metafile.yml
@@ -210,10 +210,10 @@ Models:
- Dataset: Kinetics-400
Task: Action Recognition
Metrics:
- Top 1 Accuracy: 34.85
- Top 5 Accuracy: 66.37
+ Top 1 Accuracy: 35.51
+ Top 5 Accuracy: 67.09
Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.log
- Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb_20221122-ad2dbb37.pth
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb_20230313-06ad7d03.pth
- Name: tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb
Config: configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py
@@ -233,7 +233,7 @@ Models:
- Dataset: Kinetics-400
Task: Action Recognition
Metrics:
- Top 1 Accuracy: 36.55
- Top 5 Accuracy: 68.00
+ Top 1 Accuracy: 36.91
+ Top 5 Accuracy: 68.77
Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.log
- Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb_20221122-ee13c8e2.pth
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb_20230221-85bcc1c3.pth
diff --git a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py
index 5797a6f596..15fde3ba79 100644
--- a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py
+++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py
@@ -2,6 +2,7 @@
file_client_args = dict(io_backend='disk')
+sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166}
train_pipeline = [
dict(type='DecordInit', **file_client_args),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16),
@@ -15,7 +16,7 @@
max_wh_scale_gap=1,
num_fixed_crops=13),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
- dict(type='Flip', flip_ratio=0.5),
+ dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map),
dict(type='FormatShape', input_format='NCHW'),
dict(type='PackActionInputs')
]
diff --git a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py
index 3bea4b9ca7..d48b403c02 100644
--- a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py
+++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py
@@ -58,7 +58,7 @@
]
train_dataloader = dict(
- batch_size=32,
+ batch_size=4,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
@@ -99,4 +99,15 @@
# - `enable` means enable scaling LR automatically
# or not by default.
# - `base_batch_size` = (8 GPUs) x (32 samples per GPU).
-auto_scale_lr = dict(enable=False, base_batch_size=256)
+auto_scale_lr = dict(enable=True, base_batch_size=256)
+
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=3)
+param_scheduler = [
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=10,
+ by_epoch=True,
+ milestones=[4, 8],
+ gamma=0.1)
+]
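With `auto_scale_lr` enabled and the per-GPU batch size dropped to 4, the runner rescales the optimizer LR by the ratio of the actual global batch to `base_batch_size`, as the README notes on `--auto-scale-lr` describe. A worked example, assuming a linear scaling rule and a configured base LR of 0.01 (the usual TSN value; check the actual config):

```python
# Worked example of the linear LR scaling rule assumed for auto_scale_lr:
# effective_lr = configured_lr * actual_batch / base_batch_size.
configured_lr = 0.01      # assumed base lr of the TSN Kinetics-400 config
num_gpus = 8
samples_per_gpu = 4       # batch_size set in this diff
base_batch_size = 256     # 8 GPUs x 32 samples per GPU

actual_batch = num_gpus * samples_per_gpu                  # 32
effective_lr = configured_lr * actual_batch / base_batch_size
print(actual_batch, effective_lr)                          # 32 0.00125
```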
diff --git a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py
index 39113ba5b3..a94f7b3b22 100644
--- a/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py
+++ b/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-50e_sthv2-rgb.py
@@ -14,6 +14,7 @@
file_client_args = dict(io_backend='disk')
+sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166}
train_pipeline = [
dict(type='DecordInit', **file_client_args),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
@@ -26,7 +27,7 @@
random_crop=False,
max_wh_scale_gap=1),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
- dict(type='Flip', flip_ratio=0.5),
+ dict(type='Flip', flip_ratio=0.5, flip_label_map=sthv2_flip_label_map),
dict(type='FormatShape', input_format='NCHW'),
dict(type='PackActionInputs')
]
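For reference, `SampleFrames(clip_len=1, frame_interval=1, num_clips=8)` in these TSN/TSM configs performs TSN-style segment sampling: the video is split into 8 equal segments and one frame is drawn from each at train time. A minimal sketch under that assumption (not the actual MMAction2 implementation):

```python
import random

# Sketch of TSN-style segment sampling assumed by
# SampleFrames(clip_len=1, frame_interval=1, num_clips=8) at train time.
def sample_frames(total_frames, num_clips=8):
    seg_len = total_frames / num_clips
    inds = []
    for i in range(num_clips):
        start = int(i * seg_len)
        end = max(start + 1, int((i + 1) * seg_len))
        inds.append(random.randrange(start, end))  # one frame per segment
    return inds

print(sample_frames(120))  # e.g. [3, 22, 34, 51, 63, 79, 97, 113]
```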
diff --git a/configs/recognition/uniformer/README.md b/configs/recognition/uniformer/README.md
index 65c224ecc3..ff19fb4fb9 100644
--- a/configs/recognition/uniformer/README.md
+++ b/configs/recognition/uniformer/README.md
@@ -51,7 +51,7 @@ python tools/test.py configs/recognition/uniformer/uniformer-small_imagenet1k-pr
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition/uniformerv2/README.md b/configs/recognition/uniformerv2/README.md
index c69b69a662..d6e57c7bf9 100644
--- a/configs/recognition/uniformerv2/README.md
+++ b/configs/recognition/uniformerv2/README.md
@@ -20,51 +20,53 @@ Learning discriminative spatiotemporal representation is the key problem of vide
### Kinetics-400
-| uniform sampling | resolution | backbone | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt |
-| :--------------: | :------------: | :------------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :-----------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
-| 8 | short-side 320 | UniFormerV2-B/16 | 85.8 | 97.1 | 85.6 | 97.0 | 85.8 | 97.1 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-203d6aac.pth) |
-| 8 | short-side 320 | UniFormerV2-L/14 | 88.7 | 98.1 | 88.8 | 98.1 | 88.7 | 98.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-972ea063.pth) |
-| 16 | short-side 320 | UniFormerV2-L/14 | 89.0 | 98.2 | 89.1 | 98.2 | 89.0 | 98.2 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb_20221219-6dc86d05.pth) |
-| 32 | short-side 320 | UniFormerV2-L/14 | 89.3 | 98.2 | 89.3 | 98.2 | 89.4 | 98.2 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-56a46f64.pth) |
-| 32 | short-side 320 | UniFormerV2-L/14@336 | 89.5 | 98.4 | 89.7 | 98.3 | 89.5 | 98.4 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-1dd7650f.pth) |
+| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
+| :--------------: | :------------: | :--------------------: | :--------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| 8 | short-side 320 | UniFormerV2-B/16 | clip | 84.3 | 96.4 | 84.4 | 96.3 | - | - | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb_20230313-e29fc968.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.log) |
+| 8 | short-side 320 | UniFormerV2-B/16 | clip-kinetics710 | 85.8 | 97.1 | 85.6 | 97.0 | - | - | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb_20230313-75be0806.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.log) |
+| 8 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 88.7 | 98.1 | 88.8 | 98.1 | 88.7 | 98.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-972ea063.pth) | - |
+| 16 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 89.0 | 98.2 | 89.1 | 98.2 | 89.0 | 98.2 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics400-rgb_20221219-6dc86d05.pth) | - |
+| 32 | short-side 320 | UniFormerV2-L/14\* | clip-kinetics710 | 89.3 | 98.2 | 89.3 | 98.2 | 89.4 | 98.2 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-56a46f64.pth) | - |
+| 32 | short-side 320 | UniFormerV2-L/14@336\* | clip-kinetics710 | 89.5 | 98.4 | 89.7 | 98.3 | 89.5 | 98.4 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-1dd7650f.pth) | - |
### Kinetics-600
-| uniform sampling | resolution | backbone | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt |
-| :--------------: | :--------: | :------------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :-----------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
-| 8 | Raw | UniFormerV2-B/16 | 86.4 | 97.3 | 86.1 | 97.2 | 85.5 | 97.0 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-c62c4da4.pth) |
-| 8 | Raw | UniFormerV2-L/14 | 89.0 | 98.3 | 89.0 | 98.2 | 87.5 | 98.0 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-cf88e4c2.pth) |
-| 16 | Raw | UniFormerV2-L/14 | 89.4 | 98.3 | 89.4 | 98.3 | 87.8 | 98.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb_20221219-38ff0e3e.pth) |
-| 32 | Raw | UniFormerV2-L/14 | 89.2 | 98.3 | 89.5 | 98.3 | 87.7 | 98.1 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-d450d071.pth) |
-| 32 | Raw | UniFormerV2-L/14@336 | 89.8 | 98.5 | 89.9 | 98.5 | 88.8 | 98.3 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-f984f5d2.pth) |
+| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
+| :--------------: | :--------: | :--------------------: | :--------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710 | 86.4 | 97.3 | 86.1 | 97.2 | - | - | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb_20230313-544f06f0.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.log) |
+| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.0 | 98.3 | 89.0 | 98.2 | 87.5 | 98.0 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-cf88e4c2.pth) | - |
+| 16 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.4 | 98.3 | 89.4 | 98.3 | 87.8 | 98.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics600-rgb_20221219-38ff0e3e.pth) | - |
+| 32 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 89.2 | 98.3 | 89.5 | 98.3 | 87.7 | 98.1 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-d450d071.pth) | - |
+| 32 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710 | 89.8 | 98.5 | 89.9 | 98.5 | 88.8 | 98.3 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-f984f5d2.pth) | - |
### Kinetics-700
-| uniform sampling | resolution | backbone | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt |
-| :--------------: | :--------: | :------------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :-----------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
-| 8 | Raw | UniFormerV2-B/16 | 76.3 | 92.9 | 76.3 | 92.7 | 75.1 | 92.5 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-8a7c4ac4.pth) |
-| 8 | Raw | UniFormerV2-L/14 | 80.8 | 95.2 | 80.8 | 95.4 | 79.4 | 94.8 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-bfb9f401.pth) |
-| 16 | Raw | UniFormerV2-L/14 | 81.2 | 95.6 | 81.2 | 95.6 | 79.2 | 95.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb_20221219-745209d2.pth) |
-| 32 | Raw | UniFormerV2-L/14 | 81.4 | 95.7 | 81.5 | 95.7 | 79.8 | 95.3 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-eebe7056.pth) |
-| 32 | Raw | UniFormerV2-L/14@336 | 82.1 | 96.0 | 82.1 | 96.1 | 80.6 | 95.6 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-95cf9046.pth) |
+| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | mm-Kinetics top1 acc | mm-Kinetics top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
+| :--------------: | :--------: | :--------------------: | :--------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :------------------: | :------------------: | :--------------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| 8 | Raw | UniFormerV2-B/16 | clip | 75.9 | 92.9 | 75.8 | 92.8 | - | - | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb_20230313-f02e48ad.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.log) |
+| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710 | 76.3 | 92.9 | 76.3 | 92.7 | - | - | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb_20230313-69070837.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.log) |
+| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 80.8 | 95.2 | 80.8 | 95.4 | 79.4 | 94.8 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-bfb9f401.pth) | - |
+| 16 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 81.2 | 95.6 | 81.2 | 95.6 | 79.2 | 95.0 | 4 clips x 3 crop | 1.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u16_kinetics700-rgb_20221219-745209d2.pth) | - |
+| 32 | Raw | UniFormerV2-L/14\* | clip-kinetics710 | 81.4 | 95.7 | 81.5 | 95.7 | 79.8 | 95.3 | 2 clips x 3 crop | 2.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-eebe7056.pth) | - |
+| 32 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710 | 82.1 | 96.0 | 82.1 | 96.1 | 80.6 | 95.6 | 2 clips x 3 crop | 6.3T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics700-rgb_20221219-95cf9046.pth) | - |
### MiTv1
-| uniform sampling | resolution | backbone | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | testing protocol | FLOPs | params | config | ckpt |
-| :--------------: | :--------: | :------------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :--------------: | :---: | :----: | :------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
-| 8 | Raw | UniFormerV2-B/16 | 42.7 | 71.6 | 42.6 | 71.7 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-fddbc786.pth) |
-| 8 | Raw | UniFormerV2-L/14 | 47.0 | 76.1 | 47.0 | 76.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-882c0598.pth) |
-| 8 | Raw | UniFormerV2-L/14@336 | 47.7 | 76.8 | 47.8 | 76.0 | 4 clips x 3 crop | 1.6T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-9020986e.pth) |
+| uniform sampling | resolution | backbone | pretrain | top1 acc | top5 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top1 acc | [reference](<(https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md)>) top5 acc | testing protocol | FLOPs | params | config | ckpt | log |
+| :--------------: | :--------: | :--------------------: | :--------------------------: | :------: | :------: | :---------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------: | :--------------: | :---: | :----: | :------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| 8 | Raw | UniFormerV2-B/16 | clip-kinetics710-kinetics400 | 42.3 | 71.5 | 42.6 | 71.7 | 4 clips x 3 crop | 0.1T | 115M | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb_20230313-a6f4a567.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.log) |
+| 8 | Raw | UniFormerV2-L/14\* | clip-kinetics710-kinetics400 | 47.0 | 76.1 | 47.0 | 76.1 | 4 clips x 3 crop | 0.7T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-882c0598.pth) | - |
+| 8 | Raw | UniFormerV2-L/14@336\* | clip-kinetics710-kinetics400 | 47.7 | 76.8 | 47.8 | 76.0 | 4 clips x 3 crop | 1.6T | 354M | [config](/configs/recognition/uniformerv2/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-large-p16-res336_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-9020986e.pth) | - |
### Kinetics-710
-| uniform sampling | resolution | backbone | config | ckpt |
-| :--------------: | :--------: | :------------------: | :----------------------------------------------------------------------------: | :--------------------------------------------------------------------------: |
-| 8 | Raw | UniFormerV2-B/16 | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth) |
-| 8 | Raw | UniFormerV2-L/14 | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb_20221219-bfaae587.pth) |
-| 8 | Raw | UniFormerV2-L/14@336 | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb_20221219-55878cdc.pth) |
+| uniform sampling | resolution | backbone | pretrain | config | ckpt |
+| :--------------: | :--------: | :--------------------: | :------: | :-----------------------------------------------------------------------: | :---------------------------------------------------------------------: |
+| 8 | Raw | UniFormerV2-B/16\* | clip | [config](/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth) |
+| 8 | Raw | UniFormerV2-L/14\* | clip | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res224_clip-pre_u8_kinetics710-rgb_20221219-bfaae587.pth) |
+| 8 | Raw | UniFormerV2-L/14@336\* | clip | [config](/configs/recognition/uniformerv2/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb_20221219-55878cdc.pth) |
-The models are ported from the repo [UniFormerV2](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) and tested on our data. Currently, we only support the testing of UniFormerV2 models, training will be available soon.
+The models with * are ported from the repo [UniFormerV2](https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md) and tested on our data. Due to computational limitations, we only provide a reliable training config for the base model (i.e. UniFormerV2-B/16).
1. The values in the columns named "reference" are the results of the original repo.
2. The values in `top1/5 acc` are tested on the same data list as the original repo, and the label map is provided by [UniFormerV2](https://drive.google.com/drive/folders/17VB-XdF3Kfr9ORmnGyXCxTMs86n0L4QL).
@@ -93,7 +95,7 @@ python tools/test.py configs/recognition/uniformerv2/uniformerv2-base-p16-res224
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
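+
+The same test can also be launched on multiple GPUs with the distributed launcher. The snippet below is a sketch: it assumes the standard `tools/dist_test.sh` script in your checkout and 8 available GPUs, and `CONFIG_FILE` is a placeholder for the config used above.
+
+```shell
+bash tools/dist_test.sh CONFIG_FILE checkpoints/SOME_CHECKPOINT.pth 8 --dump result.pkl
+```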
## Citation
diff --git a/configs/recognition/uniformerv2/k710_channel_map/label_map_k710.txt b/configs/recognition/uniformerv2/k710_channel_map/label_map_k710.txt
new file mode 100644
index 0000000000..150f3447b4
--- /dev/null
+++ b/configs/recognition/uniformerv2/k710_channel_map/label_map_k710.txt
@@ -0,0 +1,710 @@
+riding a bike 0
+marching 1
+dodgeball 2
+playing cymbals 3
+checking tires 4
+roller skating 5
+tasting beer 6
+clapping 7
+drawing 8
+juggling fire 9
+bobsledding 10
+petting animal (not cat) 11
+spray painting 12
+training dog 13
+eating watermelon 14
+building cabinet 15
+applauding 16
+playing harp 17
+inflating balloons 18
+sled dog racing 19
+wrestling 20
+pole vault 21
+hurling (sport) 22
+riding scooter 23
+shearing sheep 24
+sweeping floor 25
+eating carrots 26
+skateboarding 27
+dunking basketball 28
+disc golfing 29
+eating spaghetti 30
+playing flute 31
+riding mechanical bull 32
+making sushi 33
+trapezing 34
+picking apples 35
+stretching leg 36
+playing ukulele 37
+tying necktie 38
+skydiving 39
+playing cello 40
+jumping into pool 41
+shooting goal (soccer) 42
+trimming trees 43
+bookbinding 44
+ski jumping 45
+walking the dog 46
+riding unicycle 47
+shaving head 48
+hopscotch 49
+playing piano 50
+parasailing 51
+bartending 52
+kicking field goal 53
+finger snapping 54
+dining 55
+yawning 56
+peeling potatoes 57
+canoeing or kayaking 58
+front raises 59
+laughing 60
+dancing macarena 61
+digging 62
+reading newspaper 63
+hitting baseball 64
+clay pottery making 65
+exercising with an exercise ball 66
+playing saxophone 67
+shooting basketball 68
+washing hair 69
+lunge 70
+brushing hair 71
+curling hair 72
+kitesurfing 73
+tapping guitar 74
+bending back 75
+skipping rope 76
+situp 77
+folding paper 78
+cracking neck 79
+assembling computer 80
+cleaning gutters 81
+blowing out candles 82
+shaking hands 83
+dancing gangnam style 84
+windsurfing 85
+tap dancing 86
+skiing mono 87
+bandaging 88
+push up 89
+doing nails 90
+punching person (boxing) 91
+bouncing on trampoline 92
+scrambling eggs 93
+singing 94
+cleaning floor 95
+krumping 96
+drumming fingers 97
+snowmobiling 98
+gymnastics tumbling 99
+headbanging 100
+catching or throwing frisbee 101
+riding elephant 102
+bee keeping 103
+feeding birds 104
+snatch weight lifting 105
+mowing lawn 106
+fixing hair 107
+playing trumpet 108
+flying kite 109
+crossing river 110
+swinging legs 111
+sanding floor 112
+belly dancing 113
+sneezing 114
+clean and jerk 115
+side kick 116
+filling eyebrows 117
+shuffling cards 118
+recording music 119
+cartwheeling 120
+feeding fish 121
+folding clothes 122
+water skiing 123
+tobogganing 124
+blowing leaves 125
+smoking 126
+unboxing 127
+tai chi 128
+waxing legs 129
+riding camel 130
+slapping 131
+tossing salad 132
+capoeira 133
+playing cards 134
+playing organ 135
+playing violin 136
+playing drums 137
+tapping pen 138
+vault 139
+shoveling snow 140
+playing tennis 141
+getting a tattoo 142
+making a sandwich 143
+making tea 144
+grinding meat 145
+squat 146
+eating doughnuts 147
+ice fishing 148
+snowkiting 149
+kicking soccer ball 150
+playing controller 151
+giving or receiving award 152
+welding 153
+throwing discus 154
+throwing axe 155
+ripping paper 156
+swimming butterfly stroke 157
+air drumming 158
+blowing nose 159
+hockey stop 160
+taking a shower 161
+bench pressing 162
+planting trees 163
+pumping fist 164
+climbing tree 165
+tickling 166
+high kick 167
+waiting in line 168
+slacklining 169
+tango dancing 170
+hurdling 171
+carrying baby 172
+celebrating 173
+sharpening knives 174
+passing American football (in game) 175
+headbutting 176
+playing recorder 177
+brush painting 178
+person collecting garbage 179
+robot dancing 180
+shredding paper 181
+pumping gas 182
+rock climbing 183
+hula hooping 184
+braiding hair 185
+opening present 186
+texting 187
+decorating the christmas tree 188
+answering questions 189
+playing keyboard 190
+writing 191
+bungee jumping 192
+smelling feet 193
+eating burger 194
+playing accordion 195
+making pizza 196
+playing volleyball 197
+tasting food 198
+pushing cart 199
+spinning poi 200
+cleaning windows 201
+arm wrestling 202
+changing oil 203
+swimming breast stroke 204
+tossing coin 205
+deadlifting 206
+hoverboarding 207
+cutting watermelon 208
+cheerleading 209
+snorkeling 210
+washing hands 211
+eating cake 212
+pull ups 213
+surfing water 214
+eating hotdog 215
+holding snake 216
+playing harmonica 217
+ironing 218
+cutting nails 219
+golf chipping 220
+shot put 221
+hugging (not baby) 222
+playing clarinet 223
+faceplanting 224
+trimming or shaving beard 225
+drinking shots 226
+riding mountain bike 227
+tying bow tie 228
+swinging on something 229
+skiing crosscountry 230
+unloading truck 231
+cleaning pool 232
+jogging 233
+ice climbing 234
+mopping floor 235
+making the bed 236
+diving cliff 237
+washing dishes 238
+grooming dog 239
+weaving basket 240
+frying vegetables 241
+stomping grapes 242
+moving furniture 243
+cooking sausages (not on barbeque) 244
+doing laundry 245
+dyeing hair 246
+knitting 247
+reading book 248
+baby waking up 249
+punching bag 250
+surfing crowd 251
+cooking chicken 252
+pushing car 253
+springboard diving 254
+swing dancing 255
+massaging legs 256
+beatboxing 257
+breading or breadcrumbing 258
+somersaulting 259
+brushing teeth 260
+stretching arm 261
+juggling balls 262
+massaging person's head 263
+eating ice cream 264
+extinguishing fire 265
+hammer throw 266
+whistling 267
+crawling baby 268
+using remote controller (not gaming) 269
+playing cricket 270
+opening bottle (not wine) 271
+playing xylophone 272
+motorcycling 273
+driving car 274
+exercising arm 275
+passing American football (not in game) 276
+playing kickball 277
+sticking tongue out 278
+flipping pancake 279
+catching fish 280
+eating chips 281
+shaking head 282
+sword fighting 283
+playing poker 284
+cooking on campfire 285
+doing aerobics 286
+paragliding 287
+using segway 288
+folding napkins 289
+playing bagpipes 290
+gargling 291
+skiing slalom 292
+strumming guitar 293
+javelin throw 294
+waxing back 295
+riding or walking with horse 296
+plastering 297
+long jump 298
+parkour 299
+wrapping present 300
+egg hunting 301
+archery 302
+cleaning toilet 303
+swimming backstroke 304
+snowboarding 305
+catching or throwing baseball 306
+massaging back 307
+blowing glass 308
+playing guitar 309
+playing chess 310
+golf driving 311
+presenting weather forecast 312
+rock scissors paper 313
+high jump 314
+baking cookies 315
+using computer 316
+washing feet 317
+arranging flowers 318
+playing bass guitar 319
+spraying 320
+cutting pineapple 321
+waxing chest 322
+auctioning 323
+jetskiing 324
+sipping cup 325
+busking 326
+playing monopoly 327
+salsa dancing 328
+waxing eyebrows 329
+watering plants 330
+zumba 331
+chopping wood 332
+pushing wheelchair 333
+carving pumpkin 334
+building shed 335
+making jewelry 336
+catching or throwing softball 337
+bending metal 338
+ice skating 339
+dancing charleston 340
+abseiling 341
+climbing a rope 342
+crying 343
+cleaning shoes 344
+dancing ballet 345
+driving tractor 346
+triple jump 347
+throwing ball 348
+getting a haircut 349
+running on treadmill 350
+climbing ladder 351
+blasting sand 352
+playing trombone 353
+drop kicking 354
+country line dancing 355
+changing wheel (not on bike) 356
+feeding goats 357
+tying knot (not on a tie) 358
+setting table 359
+shaving legs 360
+kissing 361
+riding mule 362
+counting money 363
+laying bricks 364
+barbequing 365
+news anchoring 366
+smoking hookah 367
+cooking egg 368
+peeling apples 369
+yoga 370
+sharpening pencil 371
+dribbling basketball 372
+petting cat 373
+playing ice hockey 374
+milking cow 375
+shining shoes 376
+juggling soccer ball 377
+scuba diving 378
+playing squash or racquetball 379
+drinking beer 380
+sign language interpreting 381
+playing basketball 382
+breakdancing 383
+testifying 384
+making snowman 385
+golf putting 386
+playing didgeridoo 387
+biking through snow 388
+sailing 389
+jumpstyle dancing 390
+water sliding 391
+grooming horse 392
+massaging feet 393
+playing paintball 394
+making a cake 395
+bowling 396
+contact juggling 397
+applying cream 398
+playing badminton 399
+poaching eggs 400
+playing nose flute 401
+entering church 402
+closing door 403
+helmet diving 404
+doing sudoku 405
+coughing 406
+seasoning food 407
+peeling banana 408
+eating nachos 409
+waxing armpits 410
+shouting 411
+silent disco 412
+polishing furniture 413
+taking photo 414
+dealing cards 415
+putting wallpaper on wall 416
+uncorking champagne 417
+curling eyelashes 418
+brushing floor 419
+pulling espresso shot 420
+playing american football 421
+grooming cat 422
+playing checkers 423
+moving child 424
+stacking cups 425
+squeezing orange 426
+opening coconuts 427
+rolling eyes 428
+picking blueberries 429
+playing road hockey 430
+carving wood with a knife 431
+slicing onion 432
+saluting 433
+letting go of balloon 434
+breaking glass 435
+carrying weight 436
+mixing colours 437
+moving baby 438
+blending fruit 439
+pouring milk 440
+surveying 441
+making slime 442
+sieving 443
+walking with crutches 444
+flipping bottle 445
+playing billiards 446
+arresting 447
+listening with headphones 448
+spinning plates 449
+carving marble 450
+cutting cake 451
+shoot dance 452
+being excited 453
+petting horse 454
+splashing water 455
+filling cake 456
+stacking dice 457
+checking watch 458
+treating wood 459
+laying decking 460
+shooting off fireworks 461
+pouring wine 462
+pretending to be a statue 463
+steering car 464
+playing rounders 465
+looking in mirror 466
+jumping sofa 467
+lighting candle 468
+walking on stilts 469
+crocheting 470
+playing piccolo 471
+vacuuming car 472
+high fiving 473
+playing shuffleboard 474
+chasing 475
+pulling rope (game) 476
+being in zero gravity 477
+sanding wood 478
+decoupage 479
+using megaphone 480
+making latte art 481
+ski ballet 482
+playing oboe 483
+bouncing ball (not juggling) 484
+playing mahjong 485
+herding cattle 486
+swimming with sharks 487
+milking goat 488
+swimming with dolphins 489
+metal detecting 490
+playing slot machine 491
+polishing metal 492
+throwing tantrum 493
+lawn mower racing 494
+laying stone 495
+cutting orange 496
+skipping stone 497
+pouring beer 498
+making bubbles 499
+jaywalking 500
+leatherworking 501
+card stacking 502
+putting on eyeliner 503
+card throwing 504
+chewing gum 505
+falling off bike 506
+repairing puncture 507
+dumpster diving 508
+tiptoeing 509
+sleeping 510
+using circular saw 511
+cracking knuckles 512
+pinching 513
+chiseling wood 514
+playing rubiks cube 515
+weaving fabric 516
+fencing (sport) 517
+sword swallowing 518
+lighting fire 519
+vacuuming floor 520
+combing hair 521
+building lego 522
+playing pinball 523
+fly tying 524
+playing lute 525
+opening door 526
+waving hand 527
+rolling pastry 528
+chiseling stone 529
+threading needle 530
+playing dominoes 531
+opening wine bottle 532
+playing with trains 533
+steer roping 534
+playing field hockey 535
+separating eggs 536
+sewing 537
+talking on cell phone 538
+needle felting 539
+pushing wheelbarrow 540
+using a paint roller 541
+playing netball 542
+lifting hat 543
+massaging neck 544
+blowing bubble gum 545
+walking through snow 546
+docking boat 547
+clam digging 548
+marriage proposal 549
+packing 550
+sausage making 551
+licking 552
+scrapbooking 553
+flint knapping 554
+lock picking 555
+putting on lipstick 556
+sawing wood 557
+playing hand clapping games 558
+geocaching 559
+looking at phone 560
+making cheese 561
+poking bellybutton 562
+contorting 563
+fixing bicycle 564
+using a microscope 565
+using a wrench 566
+doing jigsaw puzzle 567
+making horseshoes 568
+cooking scallops 569
+square dancing 570
+getting a piercing 571
+playing ocarina 572
+making paper aeroplanes 573
+playing scrabble 574
+visiting the zoo 575
+crossing eyes 576
+jumping bicycle 577
+throwing water balloon 578
+bodysurfing 579
+pirouetting 580
+luge 581
+spelunking 582
+watching tv 583
+attending conference 584
+curling (sport) 585
+directing traffic 586
+swimming front crawl 587
+ice swimming 588
+battle rope training 589
+putting on mascara 590
+bouncing on bouncy castle 591
+smoking pipe 592
+pillow fight 593
+putting on sari 594
+calligraphy 595
+roasting pig 596
+cracking back 597
+shopping 598
+burping 599
+using bagging machine 600
+staring 601
+shucking oysters 602
+blowdrying hair 603
+smashing 604
+playing laser tag 605
+wading through mud 606
+rope pushdown 607
+preparing salad 608
+making balloon shapes 609
+tagging graffiti 610
+adjusting glasses 611
+using a power drill 612
+trimming shrubs 613
+popping balloons 614
+playing pan pipes 615
+using puppets 616
+arguing 617
+backflip (human) 618
+riding snow blower 619
+hand washing clothes 620
+calculating 621
+gospel singing in church 622
+standing on hands 623
+tasting wine 624
+shaping bread dough 625
+wading through water 626
+falling off chair 627
+throwing snowballs 628
+building sandcastle 629
+land sailing 630
+tying shoe laces 631
+jumping jacks 632
+wood burning (art) 633
+putting on foundation 634
+putting on shoes 635
+cumbia 636
+archaeological excavation 637
+mountain climber (exercise) 638
+assembling bicycle 639
+head stand 640
+cutting apple 641
+shuffling feet 642
+bottling 643
+breathing fire 644
+using inhaler 645
+historical reenactment 646
+hugging baby 647
+mushroom foraging 648
+delivering mail 649
+laying tiles 650
+using atm 651
+chopping meat 652
+tightrope walking 653
+mosh pit dancing 654
+photobombing 655
+coloring in 656
+huddling 657
+playing gong 658
+laying concrete 659
+breaking boards 660
+acting in play 661
+base jumping 662
+tie dying 663
+using a sledge hammer 664
+playing ping pong 665
+photocopying 666
+winking 667
+waking up 668
+swinging baseball bat 669
+twiddling fingers 670
+playing polo 671
+longboarding 672
+ironing hair 673
+bathing dog 674
+moon walking 675
+playing marbles 676
+embroidering 677
+playing beer pong 678
+home roasting coffee 679
+gold panning 680
+karaoke 681
+changing gear in car 682
+raising eyebrows 683
+yarn spinning 684
+scrubbing face 685
+fidgeting 686
+planing wood 687
+cosplaying 688
+capsizing 689
+tackling 690
+shining flashlight 691
+dyeing eyebrows 692
+drooling 693
+alligator wrestling 694
+playing blackjack 695
+carving ice 696
+playing maracas 697
+opening refrigerator 698
+throwing knife 699
+putting in contact lenses 700
+passing soccer ball 701
+casting fishing line 702
+sucking lolly 703
+installing carpet 704
+bulldozing 705
+roasting marshmallows 706
+playing darts 707
+chopping vegetables 708
+bull fighting 709
diff --git a/configs/recognition/uniformerv2/k710_channel_map/map_k400.json b/configs/recognition/uniformerv2/k710_channel_map/map_k400.json
new file mode 100644
index 0000000000..f97fa4d49f
--- /dev/null
+++ b/configs/recognition/uniformerv2/k710_channel_map/map_k400.json
@@ -0,0 +1 @@
+[341, 158, 189, 16, 398, 302, 202, 318, 80, 323, 249, 315, 18, 88, 365, 52, 257, 103, 113, 162, 75, 338, 388, 352, 308, 125, 159, 82, 10, 44, 92, 396, 185, 258, 383, 178, 71, 260, 15, 335, 192, 326, 58, 133, 172, 120, 334, 280, 306, 101, 337, 173, 203, 356, 4, 209, 332, 7, 65, 115, 95, 81, 232, 344, 303, 201, 342, 351, 165, 397, 252, 368, 285, 244, 363, 355, 79, 268, 110, 343, 72, 219, 321, 208, 345, 340, 84, 61, 206, 188, 62, 55, 29, 237, 2, 286, 245, 90, 8, 372, 325, 380, 226, 274, 346, 354, 97, 28, 246, 194, 212, 26, 281, 147, 215, 264, 30, 14, 301, 275, 66, 265, 224, 104, 121, 357, 117, 54, 107, 279, 109, 122, 289, 78, 59, 241, 179, 291, 349, 142, 152, 220, 311, 386, 145, 239, 392, 99, 266, 100, 176, 314, 167, 64, 160, 216, 49, 207, 222, 184, 171, 22, 234, 148, 339, 218, 294, 324, 233, 262, 9, 377, 41, 390, 53, 150, 361, 73, 247, 96, 60, 364, 298, 70, 395, 143, 236, 336, 196, 385, 33, 144, 1, 307, 393, 256, 263, 375, 235, 273, 243, 106, 366, 271, 186, 287, 51, 299, 175, 276, 369, 57, 11, 373, 35, 163, 297, 195, 399, 290, 382, 319, 134, 40, 310, 223, 151, 270, 3, 387, 137, 31, 309, 217, 17, 374, 190, 277, 327, 135, 394, 50, 284, 177, 67, 379, 141, 353, 108, 37, 136, 197, 272, 21, 312, 213, 164, 182, 250, 91, 89, 253, 199, 333, 248, 63, 119, 0, 130, 102, 32, 227, 362, 296, 23, 47, 156, 180, 183, 313, 5, 350, 389, 328, 112, 93, 378, 359, 83, 282, 174, 371, 48, 360, 24, 376, 68, 42, 221, 140, 181, 118, 116, 381, 94, 77, 27, 45, 87, 230, 292, 76, 39, 169, 131, 19, 126, 367, 105, 114, 193, 210, 305, 149, 98, 259, 200, 12, 320, 254, 146, 278, 242, 261, 36, 293, 251, 214, 25, 304, 204, 157, 255, 111, 229, 283, 128, 161, 170, 86, 74, 138, 6, 198, 384, 187, 155, 348, 154, 166, 124, 205, 132, 13, 34, 225, 43, 347, 228, 358, 38, 127, 231, 316, 269, 288, 139, 168, 46, 238, 317, 69, 211, 123, 391, 330, 295, 322, 329, 129, 240, 153, 267, 85, 300, 20, 191, 56, 370, 331]
diff --git a/configs/recognition/uniformerv2/k710_channel_map/map_k600.json b/configs/recognition/uniformerv2/k710_channel_map/map_k600.json
new file mode 100644
index 0000000000..f0d3b1b0e9
--- /dev/null
+++ b/configs/recognition/uniformerv2/k710_channel_map/map_k600.json
@@ -0,0 +1 @@
+[341, 661, 611, 158, 694, 189, 16, 398, 637, 302, 617, 202, 318, 639, 80, 584, 323, 618, 315, 88, 365, 52, 662, 674, 589, 257, 103, 113, 162, 75, 338, 388, 352, 603, 545, 308, 125, 159, 82, 10, 579, 44, 643, 591, 92, 396, 185, 258, 383, 660, 644, 178, 71, 260, 15, 522, 629, 335, 709, 705, 192, 599, 326, 621, 595, 58, 133, 689, 502, 504, 172, 120, 696, 334, 702, 280, 306, 101, 337, 173, 682, 203, 356, 4, 209, 505, 529, 514, 652, 708, 332, 548, 7, 65, 115, 81, 232, 344, 303, 201, 342, 351, 165, 656, 521, 397, 563, 368, 285, 244, 569, 688, 363, 355, 597, 512, 79, 268, 576, 110, 343, 636, 585, 72, 641, 219, 496, 321, 208, 345, 340, 84, 61, 206, 188, 649, 55, 586, 29, 237, 547, 2, 286, 567, 245, 90, 8, 372, 226, 274, 346, 693, 354, 97, 508, 28, 692, 246, 194, 212, 26, 281, 147, 215, 264, 30, 14, 301, 677, 66, 265, 224, 506, 627, 104, 121, 357, 517, 686, 54, 564, 107, 554, 279, 524, 109, 122, 289, 78, 59, 241, 559, 349, 571, 142, 152, 680, 220, 311, 386, 622, 145, 239, 392, 99, 266, 620, 640, 100, 176, 314, 167, 646, 64, 160, 216, 679, 49, 207, 657, 222, 647, 184, 171, 22, 234, 148, 339, 588, 18, 704, 218, 673, 294, 500, 324, 233, 262, 9, 377, 577, 41, 632, 390, 681, 53, 150, 361, 73, 247, 96, 630, 60, 494, 364, 659, 495, 650, 501, 552, 543, 519, 555, 298, 672, 560, 581, 70, 395, 143, 609, 499, 561, 568, 336, 573, 196, 385, 33, 144, 236, 1, 549, 307, 393, 256, 544, 263, 375, 675, 235, 654, 273, 638, 243, 106, 648, 539, 366, 271, 526, 186, 698, 532, 550, 287, 51, 299, 175, 276, 701, 369, 57, 179, 11, 373, 655, 666, 35, 593, 513, 580, 687, 163, 297, 195, 399, 290, 382, 319, 678, 695, 40, 310, 223, 151, 270, 3, 707, 387, 531, 137, 535, 31, 658, 309, 558, 217, 17, 374, 190, 277, 605, 525, 697, 676, 327, 542, 572, 135, 394, 615, 50, 523, 665, 284, 671, 177, 515, 67, 574, 379, 141, 353, 108, 37, 136, 197, 533, 272, 562, 21, 492, 614, 498, 608, 312, 213, 164, 182, 250, 91, 89, 253, 199, 540, 333, 700, 503, 634, 556, 590, 594, 635, 683, 248, 63, 119, 507, 0, 130, 102, 32, 362, 296, 23, 619, 47, 156, 706, 596, 180, 183, 313, 5, 528, 607, 350, 389, 328, 112, 551, 557, 93, 553, 685, 378, 536, 359, 537, 83, 282, 625, 174, 371, 48, 360, 24, 691, 376, 68, 42, 598, 221, 140, 602, 118, 642, 116, 381, 94, 325, 77, 27, 45, 230, 87, 292, 76, 497, 39, 169, 131, 19, 510, 604, 193, 126, 367, 592, 105, 114, 210, 305, 149, 98, 259, 582, 200, 12, 254, 570, 146, 623, 601, 534, 278, 242, 261, 36, 703, 251, 214, 25, 304, 204, 157, 587, 255, 669, 229, 283, 518, 690, 610, 128, 538, 170, 86, 74, 138, 6, 198, 624, 384, 187, 530, 155, 348, 154, 699, 628, 493, 578, 166, 663, 653, 509, 124, 205, 13, 34, 225, 613, 43, 347, 670, 228, 358, 38, 631, 127, 231, 565, 541, 612, 664, 566, 651, 600, 511, 645, 616, 269, 288, 520, 575, 606, 626, 168, 668, 46, 546, 238, 317, 69, 211, 583, 123, 391, 330, 527, 295, 322, 329, 129, 240, 516, 153, 267, 85, 667, 633, 300, 20, 191, 684, 56, 370, 331]
diff --git a/configs/recognition/uniformerv2/k710_channel_map/map_k700.json b/configs/recognition/uniformerv2/k710_channel_map/map_k700.json
new file mode 100644
index 0000000000..784fa00f71
--- /dev/null
+++ b/configs/recognition/uniformerv2/k710_channel_map/map_k700.json
@@ -0,0 +1 @@
+[341, 661, 611, 158, 694, 189, 16, 398, 637, 302, 617, 202, 318, 447, 639, 80, 584, 323, 249, 618, 315, 88, 365, 52, 662, 674, 589, 257, 103, 453, 477, 113, 162, 75, 338, 388, 352, 439, 603, 545, 308, 125, 159, 82, 10, 579, 44, 643, 484, 591, 92, 396, 185, 258, 383, 660, 435, 644, 178, 419, 71, 260, 15, 522, 629, 335, 705, 192, 599, 326, 621, 595, 58, 133, 689, 502, 504, 172, 436, 120, 696, 450, 334, 431, 702, 280, 306, 101, 337, 173, 682, 203, 356, 475, 4, 458, 209, 505, 529, 514, 652, 332, 548, 7, 65, 115, 81, 232, 344, 303, 201, 342, 351, 165, 403, 656, 521, 397, 563, 252, 368, 285, 244, 569, 688, 406, 363, 355, 597, 512, 79, 268, 470, 576, 110, 343, 636, 585, 418, 72, 641, 451, 219, 496, 321, 208, 345, 340, 84, 61, 206, 415, 188, 479, 649, 62, 55, 586, 29, 237, 547, 2, 286, 567, 245, 90, 405, 8, 372, 226, 274, 346, 693, 354, 97, 508, 28, 692, 246, 194, 212, 26, 281, 147, 215, 264, 409, 30, 14, 301, 677, 402, 275, 66, 265, 224, 506, 627, 104, 121, 357, 517, 686, 456, 117, 54, 564, 107, 554, 445, 279, 524, 109, 122, 289, 78, 59, 241, 291, 559, 349, 571, 142, 152, 680, 220, 311, 386, 622, 145, 422, 239, 392, 99, 266, 620, 640, 100, 176, 404, 486, 473, 314, 167, 646, 64, 160, 216, 679, 49, 207, 657, 222, 647, 184, 171, 22, 234, 148, 339, 588, 18, 704, 218, 673, 294, 500, 324, 233, 262, 9, 377, 577, 41, 632, 467, 390, 681, 53, 150, 361, 73, 247, 96, 630, 60, 494, 364, 659, 460, 495, 650, 501, 434, 552, 543, 468, 519, 448, 555, 298, 672, 560, 466, 581, 70, 395, 143, 609, 499, 561, 568, 336, 481, 573, 196, 442, 385, 33, 144, 236, 1, 549, 307, 393, 256, 544, 263, 490, 375, 488, 437, 675, 235, 654, 273, 638, 438, 424, 243, 106, 648, 539, 366, 271, 427, 526, 186, 698, 532, 550, 287, 51, 299, 175, 276, 701, 369, 408, 57, 179, 11, 373, 454, 655, 666, 35, 429, 593, 513, 580, 687, 163, 297, 195, 421, 399, 290, 382, 319, 678, 446, 695, 134, 40, 423, 310, 223, 151, 270, 3, 707, 387, 531, 137, 535, 31, 658, 309, 558, 217, 17, 374, 190, 277, 605, 525, 485, 697, 676, 327, 542, 401, 483, 572, 135, 394, 615, 50, 471, 523, 665, 284, 671, 177, 430, 465, 515, 67, 574, 474, 491, 379, 141, 353, 108, 37, 136, 197, 533, 272, 400, 562, 21, 413, 492, 614, 498, 440, 462, 608, 312, 463, 213, 420, 476, 164, 182, 250, 91, 89, 253, 199, 540, 333, 700, 503, 634, 556, 590, 594, 635, 416, 683, 248, 63, 119, 507, 0, 130, 102, 32, 362, 296, 23, 619, 47, 156, 706, 596, 180, 183, 313, 5, 428, 528, 607, 350, 389, 328, 433, 112, 478, 551, 557, 93, 553, 685, 378, 407, 536, 359, 537, 83, 282, 625, 174, 371, 48, 360, 24, 691, 376, 452, 68, 42, 461, 598, 221, 411, 140, 181, 602, 118, 642, 116, 443, 381, 412, 94, 325, 77, 27, 482, 45, 230, 87, 292, 76, 497, 39, 169, 131, 19, 510, 432, 604, 193, 126, 367, 592, 105, 114, 210, 305, 149, 98, 259, 582, 449, 200, 455, 12, 320, 254, 570, 146, 426, 425, 457, 623, 601, 534, 464, 278, 242, 261, 36, 703, 251, 214, 441, 25, 304, 204, 157, 587, 489, 487, 255, 669, 229, 283, 518, 690, 610, 128, 414, 538, 170, 86, 74, 138, 6, 198, 624, 384, 187, 530, 155, 348, 154, 699, 628, 493, 578, 166, 663, 653, 509, 124, 205, 132, 13, 34, 459, 225, 613, 43, 347, 670, 228, 358, 38, 631, 127, 417, 231, 565, 541, 612, 664, 566, 651, 600, 511, 645, 480, 616, 269, 288, 472, 520, 575, 606, 626, 168, 668, 469, 46, 546, 444, 238, 317, 69, 211, 583, 123, 391, 330, 527, 410, 295, 322, 329, 129, 240, 516, 153, 267, 85, 667, 633, 300, 20, 191, 684, 56, 370, 331]
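The three `map_k*.json` files above are flat lists of integers: entry `i` gives, for class `i` of Kinetics-400/600/700, the index of that class in the merged Kinetics-710 label space defined by `label_map_k710.txt`. The sketch below only illustrates how such an index map could be applied to 710-way scores; it is not the repo's `UniFormerHead` implementation, which consumes the map through its `channel_map` argument (see the config changes further down).

```python
import json

import torch

# Load the K400 -> K710 index map added above: 400 ints, each in [0, 709].
with open('configs/recognition/uniformerv2/k710_channel_map/map_k400.json') as f:
    k400_to_k710 = json.load(f)

# Given 710-way scores from a Kinetics-710 head, pick out (and reorder) the
# channels that correspond to the 400 Kinetics-400 classes.
scores_k710 = torch.randn(2, 710)           # dummy batch of K710 logits
scores_k400 = scores_k710[:, k400_to_k710]  # shape: (2, 400)
print(scores_k400.shape)                    # torch.Size([2, 400])
```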
diff --git a/configs/recognition/uniformerv2/metafile.yml b/configs/recognition/uniformerv2/metafile.yml
index acd35d3443..bf99abe094 100644
--- a/configs/recognition/uniformerv2/metafile.yml
+++ b/configs/recognition/uniformerv2/metafile.yml
@@ -6,26 +6,49 @@ Collections:
Title: "UniFormerV2: Spatiotemporal Learning by Arming Image ViTs with Video UniFormer"
Models:
- - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb
- Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py
+ - Name: uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb
+ Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.py
In Collection: UniFormer
Metadata:
Architecture: UniFormerV2-B/16
+ Batch Size: 32
+ Pretrained: CLIP-400M
+ Frame: 8
+ Sampling method: Uniform
+ Resolution: 224x224
+ Training Data: Kinetics-400
+ Training Resources: 8 GPUs
+ Modality: RGB
+ Results:
+ - Dataset: Kinetics-400
+ Task: Action Recognition
+ Metrics:
+ Top 1 Accuracy: 84.3
+ Top 5 Accuracy: 96.4
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics400-rgb_20230313-e29fc968.pth
+
+ - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb
+ Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py
+ In Collection: UniFormer
+ Metadata:
+ Architecture: UniFormerV2-B/16
+ Batch Size: 32
Pretrained: Kinetics-710
- Resolution: short-side 320
Frame: 8
Sampling method: Uniform
+ Resolution: 224x224
+ Training Data: Kinetics-400
+ Training Resources: 8 GPUs
Modality: RGB
- Converted From:
- Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md
- Code: https://github.com/OpenGVLab/UniFormerV2
Results:
- Dataset: Kinetics-400
Task: Action Recognition
Metrics:
Top 1 Accuracy: 85.8
Top 5 Accuracy: 97.1
- Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-203d6aac.pth
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb_20230313-75be0806.pth
- Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb
Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py
@@ -33,7 +56,7 @@ Models:
Metadata:
Architecture: UniFormerV2-L/14
Pretrained: Kinetics-710
- Resolution: short-side 320
+ Resolution: 224x224
Frame: 8
Sampling method: Uniform
Modality: RGB
@@ -54,7 +77,7 @@ Models:
Metadata:
Architecture: UniFormerV2-L/14
Pretrained: Kinetics-710
- Resolution: short-side 320
+ Resolution: 224x224
Frame: 16
Sampling method: Uniform
Modality: RGB
@@ -75,7 +98,7 @@ Models:
Metadata:
Architecture: UniFormerV2-L/14
Pretrained: Kinetics-710
- Resolution: short-side 320
+ Resolution: 224x224
Frame: 32
Sampling method: Uniform
Modality: RGB
@@ -96,7 +119,7 @@ Models:
Metadata:
Architecture: UniFormerV2-L/14@336
Pretrained: Kinetics-710
- Resolution: short-side 320
+ Resolution: 224x224
Frame: 32
Sampling method: Uniform
Modality: RGB
@@ -111,14 +134,15 @@ Models:
Top 5 Accuracy: 98.4
Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-1dd7650f.pth
- - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb
- Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py
+ - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb
+ Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.py
In Collection: UniFormer
Metadata:
Architecture: UniFormerV2-B/16
Pretrained: Kinetics-710
Frame: 8
Sampling method: Uniform
+ Training Resources: 8 GPUs
Modality: RGB
Converted From:
Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md
@@ -129,7 +153,8 @@ Models:
Metrics:
Top 1 Accuracy: 86.4
Top 5 Accuracy: 97.3
- Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb_20221219-c62c4da4.pth
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics600-rgb_20230313-544f06f0.pth
- Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb
Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py
@@ -211,14 +236,15 @@ Models:
Top 5 Accuracy: 98.5
Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics600/uniformerv2-large-p14-res336_clip-kinetics710-pre_u32_kinetics600-rgb_20221219-f984f5d2.pth
- - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb
- Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py
+ - Name: uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb
+ Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.py
In Collection: UniFormer
Metadata:
Architecture: UniFormerV2-B/16
- Pretrained: Kinetics-710
+ Pretrained: CLIP-400M
Frame: 8
Sampling method: Uniform
+ Training Resources: 8 GPUs
Modality: RGB
Converted From:
Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md
@@ -229,7 +255,30 @@ Models:
Metrics:
Top 1 Accuracy: 76.3
Top 5 Accuracy: 92.9
- Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics700/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb_20221219-8a7c4ac4.pth
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip_8xb32-u8_kinetics700-rgb_20230313-f02e48ad.pth
+
+ - Name: uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb
+ Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.py
+ In Collection: UniFormer
+ Metadata:
+ Architecture: UniFormerV2-B/16
+ Pretrained: Kinetics-710
+ Frame: 8
+ Sampling method: Uniform
+ Training Resources: 8 GPUs
+ Modality: RGB
+ Converted From:
+ Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md
+ Code: https://github.com/OpenGVLab/UniFormerV2
+ Results:
+ - Dataset: Kinetics-700
+ Task: Action Recognition
+ Metrics:
+ Top 1 Accuracy: 75.9
+ Top 5 Accuracy: 92.9
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics700-rgb_20230313-69070837.pth
- Name: uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb
Config: configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py
@@ -353,14 +402,15 @@ Models:
Code: https://github.com/OpenGVLab/UniFormerV2
Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-large-p14-res336_clip-pre_u8_kinetics710-rgb_20221219-55878cdc.pth
- - Name: uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb
- Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py
+ - Name: uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb
+ Config: configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.py
In Collection: UniFormer
Metadata:
Architecture: UniFormerV2-B/16
Pretrained: Kinetics-710 + Kinetics-400
Frame: 8
Sampling method: Uniform
+ Training Resources: 16 GPUs
Modality: RGB
Converted From:
Weights: https://github.com/OpenGVLab/UniFormerV2/blob/main/MODEL_ZOO.md
@@ -369,9 +419,10 @@ Models:
- Dataset: Moments in Time V1
Task: Action Recognition
Metrics:
- Top 1 Accuracy: 42.7
- Top 5 Accuracy: 71.6
- Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/mitv1/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb_20221219-fddbc786.pth
+ Top 1 Accuracy: 42.3
+ Top 5 Accuracy: 71.5
+ Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb.log
+ Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_16xb32-u8_mitv1-rgb_20230313-a6f4a567.pth
- Name: uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb
Config: configs/recognition/uniformerv2/uniformerv2-large-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py
diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py
index a4cae65831..a6e37c330a 100644
--- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py
+++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-kinetics-k400-pre_u8_mitv1-rgb.py
@@ -23,7 +23,13 @@
n_head=12,
mlp_factor=4.,
drop_path_rate=0.,
- mlp_dropout=[0.5, 0.5, 0.5, 0.5]),
+ mlp_dropout=[0.5, 0.5, 0.5, 0.5],
+ clip_pretrained=False,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint= # noqa: E251
+ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb_20221219-203d6aac.pth', # noqa: E501
+ prefix='backbone.')),
cls_head=dict(
type='TimeSformerHead',
dropout_ratio=0.5,
@@ -38,11 +44,44 @@
# dataset settings
dataset_type = 'VideoDataset'
-data_root_val = 'data/mit_v1'
-ann_file_test = 'data/mit_v1/val.csv'
+data_root = 'data/mit/videos/training'
+data_root_val = 'data/mit/videos/validation'
+ann_file_train = 'data/mit/mit_train_list_videos.txt'
+ann_file_val = 'data/mit/mit_val_list_videos.txt'
+ann_file_test = 'data/mit/mit_val_list_videos.txt'
+
+file_client_args = dict(io_backend='disk')
+train_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(type='UniformSample', clip_len=num_frames, num_clips=1),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(
+ type='PytorchVideoWrapper',
+ op='RandAugment',
+ magnitude=7,
+ num_layers=4),
+ dict(type='RandomResizedCrop'),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+val_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='UniformSample', clip_len=num_frames, num_clips=1,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 224)),
+ dict(type='CenterCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
test_pipeline = [
- dict(type='DecordInit'),
+ dict(type='DecordInit', **file_client_args),
dict(
type='UniformSample', clip_len=num_frames, num_clips=4,
test_mode=True),
@@ -53,8 +92,29 @@
dict(type='PackActionInputs')
]
+train_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train,
+ data_prefix=dict(video=data_root),
+ pipeline=train_pipeline))
+val_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True))
test_dataloader = dict(
- batch_size=32,
+ batch_size=8,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=False),
@@ -63,8 +123,44 @@
ann_file=ann_file_test,
data_prefix=dict(video=data_root_val),
pipeline=test_pipeline,
- test_mode=True,
- delimiter=' '))
+ test_mode=True))
+val_evaluator = dict(type='AccMetric')
test_evaluator = dict(type='AccMetric')
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
+
+base_lr = 2e-5
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05),
+ paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0),
+ clip_grad=dict(max_norm=20, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=1 / 20,
+ by_epoch=True,
+ begin=0,
+ end=5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ eta_min_ratio=1 / 20,
+ by_epoch=True,
+ begin=5,
+ end=24,
+ convert_to_iter_based=True)
+]
+
+default_hooks = dict(
+ checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (16 GPUs) x (32 samples per GPU).
+auto_scale_lr = dict(enable=True, base_batch_size=512)
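+# Worked example (a sketch, assuming the linear scaling rule that MMEngine
+# applies when `auto_scale_lr` is enabled): running this config on 8 GPUs
+# with 8 samples per GPU gives an actual batch size of 64, so the optimizer
+# LR is rescaled to base_lr * 64 / 512 = 2.5e-6 before training starts.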
diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py
index a3eddb0d04..4e47cabb84 100644
--- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py
+++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics400-rgb.py
@@ -23,13 +23,26 @@
n_head=12,
mlp_factor=4.,
drop_path_rate=0.,
- mlp_dropout=[0.5, 0.5, 0.5, 0.5]),
+ mlp_dropout=[0.5, 0.5, 0.5, 0.5],
+ clip_pretrained=False,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint= # noqa: E251
+ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501
+ prefix='backbone.')),
cls_head=dict(
- type='TimeSformerHead',
+ type='UniFormerHead',
dropout_ratio=0.5,
num_classes=400,
in_channels=768,
- average_clips='prob'),
+ average_clips='prob',
+ channel_map= # noqa: E251
+ 'configs/recognition/uniformerv2/k710_channel_map/map_k400.json',
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint= # noqa: E251
+ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501
+ prefix='cls_head.')),
data_preprocessor=dict(
type='ActionDataPreprocessor',
mean=[114.75, 114.75, 114.75],
@@ -38,11 +51,44 @@
# dataset settings
dataset_type = 'VideoDataset'
-data_root_val = 'data/k400'
-ann_file_test = 'data/k400/val.csv'
+data_root = 'data/kinetics400/videos_train'
+data_root_val = 'data/kinetics400/videos_val'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
+
+file_client_args = dict(io_backend='disk')
+train_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(type='UniformSample', clip_len=num_frames, num_clips=1),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(
+ type='PytorchVideoWrapper',
+ op='RandAugment',
+ magnitude=7,
+ num_layers=4),
+ dict(type='RandomResizedCrop'),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+val_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='UniformSample', clip_len=num_frames, num_clips=1,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 224)),
+ dict(type='CenterCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
test_pipeline = [
- dict(type='DecordInit'),
+ dict(type='DecordInit', **file_client_args),
dict(
type='UniformSample', clip_len=num_frames, num_clips=4,
test_mode=True),
@@ -53,8 +99,29 @@
dict(type='PackActionInputs')
]
+train_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train,
+ data_prefix=dict(video=data_root),
+ pipeline=train_pipeline))
+val_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True))
test_dataloader = dict(
- batch_size=32,
+ batch_size=8,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=False),
@@ -63,8 +130,45 @@
ann_file=ann_file_test,
data_prefix=dict(video=data_root_val),
pipeline=test_pipeline,
- test_mode=True,
- delimiter=','))
+ test_mode=True))
+val_evaluator = dict(type='AccMetric')
test_evaluator = dict(type='AccMetric')
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
+
+base_lr = 2e-6
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05),
+ paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0),
+ clip_grad=dict(max_norm=20, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=0.5,
+ by_epoch=True,
+ begin=0,
+ end=1,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ T_max=4,
+ eta_min_ratio=0.5,
+ by_epoch=True,
+ begin=1,
+ end=5,
+ convert_to_iter_based=True)
+]
+
+default_hooks = dict(
+ checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (32 samples per GPU).
+auto_scale_lr = dict(enable=True, base_batch_size=256)
diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py
index 4c91589dbb..a9f6f61413 100644
--- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py
+++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics600-rgb.py
@@ -23,13 +23,26 @@
n_head=12,
mlp_factor=4.,
drop_path_rate=0.,
- mlp_dropout=[0.5, 0.5, 0.5, 0.5]),
+ mlp_dropout=[0.5, 0.5, 0.5, 0.5],
+ clip_pretrained=False,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint= # noqa: E251
+ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501
+ prefix='backbone.')),
cls_head=dict(
- type='TimeSformerHead',
+ type='UniFormerHead',
dropout_ratio=0.5,
num_classes=600,
in_channels=768,
- average_clips='prob'),
+ average_clips='prob',
+ channel_map= # noqa: E251
+ 'configs/recognition/uniformerv2/k710_channel_map/map_k600.json',
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint= # noqa: E251
+ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501
+ prefix='cls_head.')),
data_preprocessor=dict(
type='ActionDataPreprocessor',
mean=[114.75, 114.75, 114.75],
@@ -38,11 +51,44 @@
# dataset settings
dataset_type = 'VideoDataset'
-data_root_val = 'data/k600'
-ann_file_test = 'data/k600/val.csv'
+data_root = 'data/kinetics600/videos_train'
+data_root_val = 'data/kinetics600/videos_val'
+ann_file_train = 'data/kinetics600/kinetics600_train_list_videos.txt'
+ann_file_val = 'data/kinetics600/kinetics600_val_list_videos.txt'
+ann_file_test = 'data/kinetics600/kinetics600_val_list_videos.txt'
+
+file_client_args = dict(io_backend='disk')
+train_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(type='UniformSample', clip_len=num_frames, num_clips=1),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(
+ type='PytorchVideoWrapper',
+ op='RandAugment',
+ magnitude=7,
+ num_layers=4),
+ dict(type='RandomResizedCrop'),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+val_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='UniformSample', clip_len=num_frames, num_clips=1,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 224)),
+ dict(type='CenterCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
test_pipeline = [
- dict(type='DecordInit'),
+ dict(type='DecordInit', **file_client_args),
dict(
type='UniformSample', clip_len=num_frames, num_clips=4,
test_mode=True),
@@ -53,8 +99,29 @@
dict(type='PackActionInputs')
]
+train_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train,
+ data_prefix=dict(video=data_root),
+ pipeline=train_pipeline))
+val_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True))
test_dataloader = dict(
- batch_size=32,
+ batch_size=8,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=False),
@@ -63,8 +130,45 @@
ann_file=ann_file_test,
data_prefix=dict(video=data_root_val),
pipeline=test_pipeline,
- test_mode=True,
- delimiter=','))
+ test_mode=True))
+val_evaluator = dict(type='AccMetric')
test_evaluator = dict(type='AccMetric')
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
+
+base_lr = 2e-6
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05),
+ paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0),
+ clip_grad=dict(max_norm=20, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=0.5,
+ by_epoch=True,
+ begin=0,
+ end=1,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ T_max=4,
+ eta_min_ratio=0.5,
+ by_epoch=True,
+ begin=1,
+ end=5,
+ convert_to_iter_based=True)
+]
+
+default_hooks = dict(
+ checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (32 samples per GPU).
+auto_scale_lr = dict(enable=True, base_batch_size=256)
diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py
index 92494df5d7..5c59ad46f4 100644
--- a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py
+++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_u8_kinetics700-rgb.py
@@ -23,13 +23,26 @@
n_head=12,
mlp_factor=4.,
drop_path_rate=0.,
- mlp_dropout=[0.5, 0.5, 0.5, 0.5]),
+ mlp_dropout=[0.5, 0.5, 0.5, 0.5],
+ clip_pretrained=False,
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint= # noqa: E251
+ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501
+ prefix='backbone.')),
cls_head=dict(
- type='TimeSformerHead',
+ type='UniFormerHead',
dropout_ratio=0.5,
num_classes=700,
in_channels=768,
- average_clips='prob'),
+ average_clips='prob',
+ channel_map= # noqa: E251
+ 'configs/recognition/uniformerv2/k710_channel_map/map_k700.json',
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint= # noqa: E251
+ 'https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics710/uniformerv2-base-p16-res224_clip-pre_u8_kinetics710-rgb_20221219-77d34f81.pth', # noqa: E501
+ prefix='cls_head.')),
data_preprocessor=dict(
type='ActionDataPreprocessor',
mean=[114.75, 114.75, 114.75],
@@ -38,11 +51,44 @@
# dataset settings
dataset_type = 'VideoDataset'
-data_root_val = 'data/k700'
-ann_file_test = 'data/k700/val.csv'
+data_root = 'data/kinetics700/videos_train'
+data_root_val = 'data/kinetics700/videos_val'
+ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt'
+ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt'
+ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt'
+
+file_client_args = dict(io_backend='disk')
+train_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(type='UniformSample', clip_len=num_frames, num_clips=1),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(
+ type='PytorchVideoWrapper',
+ op='RandAugment',
+ magnitude=7,
+ num_layers=4),
+ dict(type='RandomResizedCrop'),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+val_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='UniformSample', clip_len=num_frames, num_clips=1,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 224)),
+ dict(type='CenterCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
test_pipeline = [
- dict(type='DecordInit'),
+ dict(type='DecordInit', **file_client_args),
dict(
type='UniformSample', clip_len=num_frames, num_clips=4,
test_mode=True),
@@ -53,8 +99,29 @@
dict(type='PackActionInputs')
]
+train_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train,
+ data_prefix=dict(video=data_root),
+ pipeline=train_pipeline))
+val_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True))
test_dataloader = dict(
- batch_size=32,
+ batch_size=8,
num_workers=8,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=False),
@@ -63,8 +130,45 @@
ann_file=ann_file_test,
data_prefix=dict(video=data_root_val),
pipeline=test_pipeline,
- test_mode=True,
- delimiter=','))
+ test_mode=True))
+val_evaluator = dict(type='AccMetric')
test_evaluator = dict(type='AccMetric')
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=5, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
+
+base_lr = 2e-6
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05),
+ paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0),
+ clip_grad=dict(max_norm=20, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=0.5,
+ by_epoch=True,
+ begin=0,
+ end=1,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ T_max=4,
+ eta_min_ratio=0.5,
+ by_epoch=True,
+ begin=1,
+ end=5,
+ convert_to_iter_based=True)
+]
+
+default_hooks = dict(
+ checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (32 samples per GPU).
+auto_scale_lr = dict(enable=True, base_batch_size=256)
diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics400-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics400-rgb.py
new file mode 100644
index 0000000000..6e9c4f3908
--- /dev/null
+++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics400-rgb.py
@@ -0,0 +1,163 @@
+_base_ = ['../../_base_/default_runtime.py']
+
+# model settings
+num_frames = 8
+model = dict(
+ type='Recognizer3D',
+ backbone=dict(
+ type='UniFormerV2',
+ input_resolution=224,
+ patch_size=16,
+ width=768,
+ layers=12,
+ heads=12,
+ t_size=num_frames,
+ dw_reduction=1.5,
+ backbone_drop_path_rate=0.,
+ temporal_downsample=False,
+ no_lmhra=True,
+ double_lmhra=True,
+ return_list=[8, 9, 10, 11],
+ n_layers=4,
+ n_dim=768,
+ n_head=12,
+ mlp_factor=4.,
+ drop_path_rate=0.,
+ mlp_dropout=[0.5, 0.5, 0.5, 0.5],
+ clip_pretrained=True,
+ pretrained='ViT-B/16'),
+ cls_head=dict(
+ type='UniFormerHead',
+ dropout_ratio=0.5,
+ num_classes=400,
+ in_channels=768,
+ average_clips='prob'),
+ data_preprocessor=dict(
+ type='ActionDataPreprocessor',
+ mean=[114.75, 114.75, 114.75],
+ std=[57.375, 57.375, 57.375],
+ format_shape='NCTHW'))
+
+# dataset settings
+dataset_type = 'VideoDataset'
+data_root = 'data/kinetics400/videos_train'
+data_root_val = 'data/kinetics400/videos_val'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
+
+file_client_args = dict(io_backend='disk')
+train_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(type='UniformSample', clip_len=num_frames, num_clips=1),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(
+ type='PytorchVideoWrapper',
+ op='RandAugment',
+ magnitude=7,
+ num_layers=4),
+ dict(type='RandomResizedCrop'),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+val_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='UniformSample', clip_len=num_frames, num_clips=1,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 224)),
+ dict(type='CenterCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+test_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='UniformSample', clip_len=num_frames, num_clips=4,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 224)),
+ dict(type='ThreeCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train,
+ data_prefix=dict(video=data_root),
+ pipeline=train_pipeline))
+val_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True))
+test_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_test,
+ data_prefix=dict(video=data_root_val),
+ pipeline=test_pipeline,
+ test_mode=True))
+
+val_evaluator = dict(type='AccMetric')
+test_evaluator = dict(type='AccMetric')
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=55, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+base_lr = 1e-5
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05),
+ paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0),
+ clip_grad=dict(max_norm=20, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=0.1,
+ by_epoch=True,
+ begin=0,
+ end=5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ T_max=50,
+ eta_min_ratio=0.1,
+ by_epoch=True,
+ begin=5,
+ end=55,
+ convert_to_iter_based=True)
+]
+
+default_hooks = dict(
+ checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (8 samples per GPU).
+auto_scale_lr = dict(enable=True, base_batch_size=256)
diff --git a/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics700-rgb.py b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics700-rgb.py
new file mode 100644
index 0000000000..4a5b41d8c7
--- /dev/null
+++ b/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip_u8_kinetics700-rgb.py
@@ -0,0 +1,163 @@
+_base_ = ['../../_base_/default_runtime.py']
+
+# model settings
+num_frames = 8
+model = dict(
+ type='Recognizer3D',
+ backbone=dict(
+ type='UniFormerV2',
+ input_resolution=224,
+ patch_size=16,
+ width=768,
+ layers=12,
+ heads=12,
+ t_size=num_frames,
+ dw_reduction=1.5,
+ backbone_drop_path_rate=0.,
+ temporal_downsample=False,
+ no_lmhra=True,
+ double_lmhra=True,
+ return_list=[8, 9, 10, 11],
+ n_layers=4,
+ n_dim=768,
+ n_head=12,
+ mlp_factor=4.,
+ drop_path_rate=0.,
+ mlp_dropout=[0.5, 0.5, 0.5, 0.5],
+ clip_pretrained=True,
+ pretrained='ViT-B/16'),
+ cls_head=dict(
+ type='UniFormerHead',
+ dropout_ratio=0.5,
+ num_classes=700,
+ in_channels=768,
+ average_clips='prob'),
+ data_preprocessor=dict(
+ type='ActionDataPreprocessor',
+ mean=[114.75, 114.75, 114.75],
+ std=[57.375, 57.375, 57.375],
+ format_shape='NCTHW'))
+
+# dataset settings
+dataset_type = 'VideoDataset'
+data_root = 'data/kinetics700/videos_train'
+data_root_val = 'data/kinetics700/videos_val'
+ann_file_train = 'data/kinetics700/kinetics700_train_list_videos.txt'
+ann_file_val = 'data/kinetics700/kinetics700_val_list_videos.txt'
+ann_file_test = 'data/kinetics700/kinetics700_val_list_videos.txt'
+
+file_client_args = dict(io_backend='disk')
+train_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(type='UniformSample', clip_len=num_frames, num_clips=1),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(
+ type='PytorchVideoWrapper',
+ op='RandAugment',
+ magnitude=7,
+ num_layers=4),
+ dict(type='RandomResizedCrop'),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+val_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='UniformSample', clip_len=num_frames, num_clips=1,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 224)),
+ dict(type='CenterCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+test_pipeline = [
+ dict(type='DecordInit', **file_client_args),
+ dict(
+ type='UniformSample', clip_len=num_frames, num_clips=4,
+ test_mode=True),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 224)),
+ dict(type='ThreeCrop', crop_size=224),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_train,
+ data_prefix=dict(video=data_root),
+ pipeline=train_pipeline))
+val_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ data_prefix=dict(video=data_root_val),
+ pipeline=val_pipeline,
+ test_mode=True))
+test_dataloader = dict(
+ batch_size=8,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file_test,
+ data_prefix=dict(video=data_root_val),
+ pipeline=test_pipeline,
+ test_mode=True))
+
+val_evaluator = dict(type='AccMetric')
+test_evaluator = dict(type='AccMetric')
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=55, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+base_lr = 1e-5
+optim_wrapper = dict(
+ optimizer=dict(
+ type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05),
+ paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0),
+ clip_grad=dict(max_norm=20, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type='LinearLR',
+ start_factor=0.1,
+ by_epoch=True,
+ begin=0,
+ end=5,
+ convert_to_iter_based=True),
+ dict(
+ type='CosineAnnealingLR',
+ T_max=50,
+ eta_min_ratio=0.1,
+ by_epoch=True,
+ begin=5,
+ end=55,
+ convert_to_iter_based=True)
+]
+
+default_hooks = dict(
+ checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (8 samples per GPU).
+auto_scale_lr = dict(enable=True, base_batch_size=256)
diff --git a/configs/recognition/videomae/README.md b/configs/recognition/videomae/README.md
index 65b353aff1..16cffc4840 100644
--- a/configs/recognition/videomae/README.md
+++ b/configs/recognition/videomae/README.md
@@ -47,7 +47,7 @@ python tools/test.py configs/recognition/videomae/vit-base-p16_videomae-k400-pre
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition/x3d/README.md b/configs/recognition/x3d/README.md
index a0b9a6f3f4..88d4be33e5 100644
--- a/configs/recognition/x3d/README.md
+++ b/configs/recognition/x3d/README.md
@@ -47,7 +47,7 @@ python tools/test.py configs/recognition/x3d/x3d_s_13x6x1_facebook-kinetics400-r
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/recognition_audio/resnet/README.md b/configs/recognition_audio/resnet/README.md
index be036d149e..f74f5c6ccc 100644
--- a/configs/recognition_audio/resnet/README.md
+++ b/configs/recognition_audio/resnet/README.md
@@ -46,7 +46,7 @@ python tools/train.py configs/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100
--cfg-options randomness.seed=0 randomness.deterministic=True
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -63,7 +63,7 @@ python tools/test.py configs/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/skeleton/2s-agcn/README.md b/configs/skeleton/2s-agcn/README.md
index c61b6fe4e3..69ac0d5526 100644
--- a/configs/skeleton/2s-agcn/README.md
+++ b/configs/skeleton/2s-agcn/README.md
@@ -41,7 +41,7 @@ In skeleton-based action recognition, graph convolutional networks (GCNs), which
| | four-stream | | | 90.89 | | | | | | |
1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size, and the original batch size.
-2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/user_guides/useful_tools.md#multi-stream-fusion).
+2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/advanced_guides/useful_tools.md#multi-stream-fusion).
## Train
@@ -58,7 +58,7 @@ python tools/train.py configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu6
--seed 0 --deterministic
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -75,7 +75,7 @@ python tools/test.py configs/skeleton/2s-agcn/2s-agcn_8xb16-joint-u100-80e_ntu60
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/skeleton/posec3d/README.md b/configs/skeleton/posec3d/README.md
index 2fe5f579f0..93b526e5ac 100644
--- a/configs/skeleton/posec3d/README.md
+++ b/configs/skeleton/posec3d/README.md
@@ -54,29 +54,30 @@ Human skeleton, as a compact representation of human action, has received increa
### FineGYM
-| frame sampling strategy | pseudo heatmap | gpus | backbone | Mean Top-1 | testing protocol | config | ckpt | log |
-| :---------------------: | :------------: | :--: | :----------: | :--------: | :---------------: | :---------------------------------------: | :--------------------------------------: | :-------------------------------------: |
-| uniform 48 | keypoint | 8 | SlowOnly-R50 | 93.4 | 10 clips x 1 crop | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint_20220815-da338c58.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint.log) |
-| uniform 48 | limb | 8 | SlowOnly-R50 | 93.7 | 10 clips x 1 crop | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb_20220815-2e6e3c5c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb.log) |
+| frame sampling strategy | pseudo heatmap | gpus | backbone | Mean Top-1 | testing protocol | FLOPs | params | config | ckpt | log |
+| :---------------------: | :------------: | :--: | :----------: | :--------: | :--------------: | :---: | :----: | :------------------------------------: | :----------------------------------: | :----------------------------------: |
+| uniform 48 | keypoint | 8 | SlowOnly-R50 | 93.5 | 10 clips | 20.6G | 2.0M | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint_20220815-da338c58.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint.log) |
+| uniform 48 | limb | 8 | SlowOnly-R50 | 93.6 | 10 clips | 20.6G | 2.0M | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb_20220815-2e6e3c5c.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb.log) |
### NTU60_XSub
-| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | config | ckpt | log |
-| :---------------------: | :------------: | :--: | :----------: | :------: | :---------------: | :----------------------------------------: | :--------------------------------------: | :--------------------------------------: |
-| uniform 48 | keypoint | 8 | SlowOnly-R50 | 93.6 | 10 clips x 1 crop | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint_20220815-38db104b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.log) |
-| uniform 48 | limb | 8 | SlowOnly-R50 | 93.5 | 10 clips x 1 crop | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb_20220815-af2f119a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.log) |
+| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log |
+| :---------------------: | :------------: | :--: | :----------: | :------: | :--------------: | :---: | :----: | :-------------------------------------: | :-----------------------------------: | :----------------------------------: |
+| uniform 48 | keypoint | 8 | SlowOnly-R50 | 93.6 | 10 clips | 20.6G | 2.0M | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint_20220815-38db104b.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.log) |
+| uniform 48 | limb | 8 | SlowOnly-R50 | 93.5 | 10 clips | 20.6G | 2.0M | [config](/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb_20220815-af2f119a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.log) |
+| | Fusion | | | 94.0 | | | | | | |
### UCF101
-| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | config | ckpt | log |
-| :---------------------: | :------------: | :--: | :----------: | :------: | :---------------: | :----------------------------------------: | :--------------------------------------: | :--------------------------------------: |
-| uniform 48 | keypoint | 8 | SlowOnly-R50 | 86.9 | 10 clips x 1 crop | [config](/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint_20220815-9972260d.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.log) |
+| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log |
+| :---------------------: | :------------: | :--: | :----------: | :------: | :--------------: | :---: | :----: | :-------------------------------------: | :-----------------------------------: | :----------------------------------: |
+| uniform 48 | keypoint | 8 | SlowOnly-R50 | 86.8 | 10 clips | 14.6G | 3.1M | [config](/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint_20220815-9972260d.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.log) |
### HMDB51
-| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | config | ckpt | log |
-| :---------------------: | :------------: | :--: | :----------: | :------: | :---------------: | :----------------------------------------: | :--------------------------------------: | :--------------------------------------: |
-| uniform 48 | keypoint | 8 | SlowOnly-R50 | 69.2 | 10 clips x 1 crop | [config](/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint_20220815-17eaa484.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.log) |
+| frame sampling strategy | pseudo heatmap | gpus | backbone | top1 acc | testing protocol | FLOPs | params | config | ckpt | log |
+| :---------------------: | :------------: | :--: | :----------: | :------: | :--------------: | :---: | :----: | :-------------------------------------: | :-----------------------------------: | :----------------------------------: |
+| uniform 48 | keypoint | 8 | SlowOnly-R50 | 69.6 | 10 clips | 14.6G | 3.0M | [config](/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint_20220815-17eaa484.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.log) |
1. The **gpus** indicates the number of gpu we used to get the checkpoint. It is noteworthy that the configs we provide are used for 8 gpus as default.
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU,
@@ -100,7 +101,7 @@ python tools/train.py configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-k
For training with your custom dataset, you can refer to [Custom Dataset Training](/configs/skeleton/posec3d/custom_dataset_training.md).
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -117,7 +118,7 @@ python tools/test.py configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-ke
checkpoints/SOME_CHECKPOINT.pth
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/skeleton/posec3d/custom_dataset_training.md b/configs/skeleton/posec3d/custom_dataset_training.md
index cb5b2f647f..81fc1cb3e1 100644
--- a/configs/skeleton/posec3d/custom_dataset_training.md
+++ b/configs/skeleton/posec3d/custom_dataset_training.md
@@ -2,7 +2,7 @@
We provide a step-by-step tutorial on how to train your custom dataset with PoseC3D.
-1. First, you should know that action recognition with PoseC3D requires skeleton information only and for that you need to prepare your custom annotation files (for training and validation). To start with, you need to replace the placeholder `mmdet_root` and `mmpose_root` in `ntu_pose_extraction.py` with your installation path. Then you need to take advantage of [ntu_pose_extraction.py](https://github.com/open-mmlab/mmaction2/blob/90fc8440961987b7fe3ee99109e2c633c4e30158/tools/data/skeleton/ntu_pose_extraction.py) as shown in [Prepare Annotations](https://github.com/open-mmlab/mmaction2/blob/master/tools/data/skeleton/README.md#prepare-annotations) to extract 2D keypoints for each video in your custom dataset. The command looks like (assuming the name of your video is `some_video_from_my_dataset.mp4`):
+1. First, note that action recognition with PoseC3D requires only skeleton information, so you need to prepare custom annotation files (for training and validation). To start with, install MMDetection and MMPose. Then use [ntu_pose_extraction.py](https://github.com/open-mmlab/mmaction2/blob/90fc8440961987b7fe3ee99109e2c633c4e30158/tools/data/skeleton/ntu_pose_extraction.py), as shown in [Prepare Annotations](https://github.com/open-mmlab/mmaction2/blob/master/tools/data/skeleton/README.md#prepare-annotations), to extract 2D keypoints for each video in your custom dataset. The command looks like the following (assuming your video is named `some_video_from_my_dataset.mp4`):
```shell
# You can use the above command to generate pickle files for all of your training and validation videos.
diff --git a/configs/skeleton/posec3d/metafile.yml b/configs/skeleton/posec3d/metafile.yml
index 7a3d3b9b20..b949a23d47 100644
--- a/configs/skeleton/posec3d/metafile.yml
+++ b/configs/skeleton/posec3d/metafile.yml
@@ -13,7 +13,8 @@ Models:
Architecture: SlowOnly-R50
Batch Size: 16
Epochs: 240
- Parameters: 2044867
+ FLOPs: 20.6G
+ Parameters: 2.0M
Training Data: FineGYM
Training Resources: 8 GPUs
pseudo heatmap: keypoint
@@ -21,7 +22,7 @@ Models:
- Dataset: FineGYM
Task: Skeleton-based Action Recognition
Metrics:
- mean Top 1 Accuracy: 93.4
+ mean Top 1 Accuracy: 93.5
Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint.log
Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint/slowonly_r50_8xb16-u48-240e_gym-keypoint_20220815-da338c58.pth
@@ -32,7 +33,8 @@ Models:
Architecture: SlowOnly-R50
Batch Size: 16
Epochs: 240
- Parameters: 2044867
+ FLOPs: 20.6G
+ Parameters: 2.0M
Training Data: FineGYM
Training Resources: 8 GPUs
pseudo heatmap: limb
@@ -40,7 +42,7 @@ Models:
- Dataset: FineGYM
Task: Skeleton-based Action Recognition
Metrics:
- mean Top 1 Accuracy: 93.7
+ mean Top 1 Accuracy: 93.6
Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb.log
Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb/slowonly_r50_8xb16-u48-240e_gym-limb_20220815-2e6e3c5c.pth
@@ -51,7 +53,8 @@ Models:
Architecture: SlowOnly-R50
Batch Size: 16
Epochs: 240
- Parameters: 2024860
+ FLOPs: 20.6G
+ Parameters: 2.0M
Training Data: NTU60-XSub
Training Resources: 8 GPUs
pseudo heatmap: keypoint
@@ -70,7 +73,8 @@ Models:
Architecture: SlowOnly-R50
Batch Size: 16
Epochs: 240
- Parameters: 2024860
+ FLOPs: 20.6G
+ Parameters: 2.0M
Training Data: NTU60-XSub
Training Resources: 8 GPUs
pseudo heatmap: limb
@@ -89,7 +93,8 @@ Models:
Architecture: SlowOnly-R50
Batch Size: 16
Epochs: 120
- Parameters: 3029984
+ FLOPs: 14.6G
+ Parameters: 3.0M
Training Data: HMDB51
Training Resources: 8 GPUs
pseudo heatmap: keypoint
@@ -97,7 +102,7 @@ Models:
- Dataset: HMDB51
Task: Skeleton-based Action Recognition
Metrics:
- Top 1 Accuracy: 69.2
+ Top 1 Accuracy: 69.6
Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.log
Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint_20220815-17eaa484.pth
@@ -108,7 +113,8 @@ Models:
Architecture: SlowOnly-R50
Batch Size: 16
Epochs: 120
- Parameters: 3055584
+ FLOPs: 14.6G
+ Parameters: 3.1M
Training Data: UCF101
Training Resources: 8 GPUs
pseudo heatmap: keypoint
@@ -116,6 +122,6 @@ Models:
- Dataset: UCF101
Task: Skeleton-based Action Recognition
Metrics:
- Top 1 Accuracy: 86.9
+ Top 1 Accuracy: 86.8
Training Log: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.log
Weights: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint_20220815-9972260d.pth
diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/README.md b/configs/skeleton/posec3d/rgbpose_conv3d/README.md
new file mode 100644
index 0000000000..37b4cd489d
--- /dev/null
+++ b/configs/skeleton/posec3d/rgbpose_conv3d/README.md
@@ -0,0 +1,107 @@
+# RGBPoseConv3D
+
+## Introduction
+
+RGBPoseConv3D is a framework that jointly uses 2D human skeletons and RGB appearance for human action recognition. It is a two-stream 3D CNN whose architecture is borrowed from SlowFast. In RGBPoseConv3D:
+
+- The RGB stream corresponds to the `slow` stream in SlowFast; the skeleton stream corresponds to the `fast` stream in SlowFast.
+- The input resolution of the RGB frames is `4x` that of the pseudo heatmaps.
+- Bilateral connections are used for early feature fusion between the two modalities.
+
+## Citation
+
+```BibTeX
+@inproceedings{duan2022revisiting,
+ title={Revisiting skeleton-based action recognition},
+ author={Duan, Haodong and Zhao, Yue and Chen, Kai and Lin, Dahua and Dai, Bo},
+ booktitle={CVPR},
+ pages={2969--2978},
+ year={2022}
+}
+```
+
+## How to train RGBPoseConv3D (on NTURGB+D, for example)?
+
+#### Step 0. Data Preparation
+
+Besides the skeleton annotations, you also need RGB videos to train RGBPoseConv3D. Download them from the official website of [NTURGB+D](https://rose1.ntu.edu.sg/dataset/actionRecognition/) and put them in `$MMACTION2/data/nturgbd_raw`. After that, use the provided script to compress the raw videos (from `1920x1080` to `960x540`) and change the suffix to `.mp4`:
+
+```bash
+# This step is mandatory, unless you know how to modify the code & config to make them work with raw videos!
+python tools/data/skeleton/compress_nturgbd.py
+```
+
+After that, you will find processed videos in `$MMACTION2/data/nturgbd_videos`, named like `S001C001P001R001A001.mp4`.
+
+#### Step 1. Pretraining
+
+You first need to train the RGB-only and Pose-only models on the target dataset; the pretrained checkpoints will be used to initialize the RGBPoseConv3D model.
+
+You can either train the two models from scratch with the provided config files:
+
+```bash
+# We train each model for 180 epochs. By default, we use 8 GPUs.
+# Train the RGB-only model
+bash tools/dist_train.sh configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py 8
+# Train the Pose-only model
+bash tools/dist_train.sh configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py 8
+```
+
+or directly download and use the provided pretrained models:
+
+| Dataset | Config | Checkpoint | Top-1 (1 clip testing) | Top-1 (10 clip testing) |
+| :-----------: | :------------------------------------------------------------------: | :------------------------------------------------------------------------: | :--------------------: | :---------------------: |
+| NTURGB+D XSub | [rgb_config](/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py) | [rgb_ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230228-576b9f86.pth) | 94.9 | 95.4 |
+| NTURGB+D XSub | [pose_config](/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py) | [pose_ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230228-fa40054e.pth) | 93.1 | 93.5 |
+
+#### Step 2. Generate the initialization weights for RGBPoseConv3D
+
+You can use the provided [IPython notebook](/configs/skeleton/posec3d/rgbpose_conv3d/merge_pretrain.ipynb) to merge the two pretrained models into a single `rgbpose_conv3d_init.pth`.
+
+You can run the merge on your own or directly download and use the provided [rgbpose_conv3d_init.pth](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d_init_20230228-09b7684b.pth); a condensed sketch of what the merge does is shown below.
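+
+The merge itself is short: the notebook renames the pretrained keys into the `rgb_path` / `pose_path` namespaces and zero-pads the few conv layers whose input channels grow because of the lateral connections. The following is a condensed sketch of those steps; refer to the notebook for the complete list of padded layers and their target shapes:
+
+```python
+import torch
+from mmengine.runner.checkpoint import _load_checkpoint
+
+rgb_filepath = 'https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230228-576b9f86.pth'
+pose_filepath = 'https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230228-fa40054e.pth'
+
+rgb_ckpt = _load_checkpoint(rgb_filepath, map_location='cpu')['state_dict']
+pose_ckpt = _load_checkpoint(pose_filepath, map_location='cpu')['state_dict']
+
+# Move each pretrained model into its own pathway of RGBPoseConv3D.
+rgb_ckpt = {
+    k.replace('backbone', 'backbone.rgb_path').replace('fc_cls', 'fc_rgb'): v
+    for k, v in rgb_ckpt.items()
+}
+pose_ckpt = {
+    k.replace('backbone', 'backbone.pose_path').replace('fc_cls', 'fc_pose'): v
+    for k, v in pose_ckpt.items()
+}
+ckpt = {**rgb_ckpt, **pose_ckpt}
+
+
+def padding(weight, new_shape):
+    """Zero-pad a conv weight along dim-1 (the input channels)."""
+    new_weight = weight.new_zeros(new_shape)
+    new_weight[:, :weight.shape[1]] = weight
+    return new_weight
+
+
+# One example; the notebook pads 8 such layers in total.
+name = 'backbone.rgb_path.layer3.0.conv1.conv.weight'
+ckpt[name] = padding(ckpt[name], (256, 640, 3, 1, 1))
+
+torch.save({'state_dict': ckpt}, 'rgbpose_conv3d_init.pth')
+```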
+
+#### Step 3. Finetune RGBPoseConv3D
+
+You can use the provided config file to finetune RGBPoseConv3D jointly on the two modalities (RGB & Pose):
+
+```bash
+# We finetune RGBPoseConv3D for 20 epochs on NTURGB+D XSub (8 GPUs)
+bash tools/dist_train.sh configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py 8
+# After finetuning, you can test the model with the following command (8 GPUs)
+bash tools/dist_test.sh configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py $CKPT 8 --dump result.pkl
+```
+
+**Notes**
+
+1. We use a linearly scaled learning rate (`Initial LR` ∝ `Batch Size`). If you change the training batch size, remember to scale the initial LR proportionally: for example, the finetuning config uses `lr=0.0075` for a total batch size of 48 (8 GPUs x 6 samples per GPU), so a total batch size of 24 would use `lr=0.00375`.
+
+2. Though optimized, multi-clip testing may consume a large amount of time. For faster inference, you may modify the `test_pipeline` to disable multi-clip testing, which may lead to a small drop in recognition performance. Below is the guide:
+
+ ```python
+ test_pipeline = [
+ dict(type='MMUniformSampleFrames', clip_len=dict(RGB=8, Pose=32), num_clips=10, test_mode=True), # change `num_clips=10` to `num_clips=1`
+ dict(type='MMDecode'),
+ dict(type='MMCompact', hw_ratio=1., allow_imgpad=True),
+ dict(type='Resize', scale=(256, 256), keep_ratio=False),
+ dict(type='GeneratePoseTarget', sigma=0.7, use_score=True, with_kp=True, with_limb=False, scaling=0.25),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs', collect_keys=('imgs', 'heatmap_imgs'))
+ ]
+ ```
+
+## Results
+
+For action recognition with multiple modalities (RGB & Pose), RGBPoseConv3D achieves better recognition performance than the late-fusion baseline.
+
+| Dataset | Fusion | Config | Checkpoint | RGB Stream Top-1<br>(1-clip / 10-clip) | Pose Stream Top-1<br>(1-clip / 10-clip) | 2 Stream Top-1 (1:1)<br>(1-clip / 10-clip) |
+| :-----------: | :-------------------: | :-------------------: | :------------------------: | :------------------------------------: | :-------------------------------------: | :----------------------------------------: |
+| NTURGB+D XSub | Late Fusion | [rgb_config](/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py)<br>[pose_config](/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py) | [rgb_ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230228-576b9f86.pth)<br>[pose_ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230228-fa40054e.pth) | 94.9 / 95.4 | 93.1 / 93.5 | 96.0 / 96.2 |
+| NTURGB+D XSub | Early Fusion + Late Fusion | [config](/configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d_20230301-ac7b0e77.pth) | 96.2 / 96.4 | 96.0 / 96.2 | 96.6 / 96.8 |
+
+**Notes**
+
+For both `Late Fusion` and `Early Fusion + Late Fusion`, we combine the action scores of the two modalities with a 1:1 ratio to get the final prediction, as sketched below.
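+
+As a minimal sketch of that 1:1 fusion (assuming you have already gathered the per-class scores of the two streams, e.g. from the `result.pkl` dumps, into arrays `rgb_scores` / `pose_scores` of shape `(num_samples, num_classes)` together with the ground-truth `labels`):
+
+```python
+import numpy as np
+
+
+def late_fuse(rgb_scores: np.ndarray, pose_scores: np.ndarray,
+              labels: np.ndarray) -> float:
+    """Average the two streams' scores with a 1:1 ratio and report Top-1."""
+    fused = (rgb_scores + pose_scores) / 2
+    pred = fused.argmax(axis=1)
+    return float((pred == labels).mean())
+```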
diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/merge_pretrain.ipynb b/configs/skeleton/posec3d/rgbpose_conv3d/merge_pretrain.ipynb
new file mode 100644
index 0000000000..194ca28c31
--- /dev/null
+++ b/configs/skeleton/posec3d/rgbpose_conv3d/merge_pretrain.ipynb
@@ -0,0 +1,267 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "import copy as cp\n",
+ "from collections import OrderedDict\n",
+ "\n",
+ "import torch\n",
+ "from mmengine.runner.checkpoint import _load_checkpoint\n",
+ "\n",
+ "from mmaction.utils import register_all_modules\n",
+ "from mmaction.registry import MODELS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "outputs": [],
+ "source": [
+ "backbone_cfg = dict(\n",
+ " type='RGBPoseConv3D',\n",
+ " speed_ratio=4,\n",
+ " channel_ratio=4,\n",
+ " rgb_pathway=dict(\n",
+ " num_stages=4,\n",
+ " lateral=True,\n",
+ " lateral_infl=1,\n",
+ " lateral_activate=[0, 0, 1, 1],\n",
+ " fusion_kernel=7,\n",
+ " base_channels=64,\n",
+ " conv1_kernel=(1, 7, 7),\n",
+ " inflate=(0, 0, 1, 1),\n",
+ " with_pool2=False),\n",
+ " pose_pathway=dict(\n",
+ " num_stages=3,\n",
+ " stage_blocks=(4, 6, 3),\n",
+ " lateral=True,\n",
+ " lateral_inv=True,\n",
+ " lateral_infl=16,\n",
+ " lateral_activate=(0, 1, 1),\n",
+ " fusion_kernel=7,\n",
+ " in_channels=17,\n",
+ " base_channels=32,\n",
+ " out_indices=(2, ),\n",
+ " conv1_kernel=(1, 7, 7),\n",
+ " conv1_stride_s=1,\n",
+ " conv1_stride_t=1,\n",
+ " pool1_stride_s=1,\n",
+ " pool1_stride_t=1,\n",
+ " inflate=(0, 1, 1),\n",
+ " spatial_strides=(2, 2, 2),\n",
+ " temporal_strides=(1, 1, 1),\n",
+ " dilations=(1, 1, 1),\n",
+ " with_pool2=False))\n",
+ "head_cfg = dict(\n",
+ " type='RGBPoseHead',\n",
+ " num_classes=60,\n",
+ " in_channels=[2048, 512],\n",
+ " average_clips='prob')\n",
+ "model_cfg = dict(\n",
+ " type='Recognizer3D',\n",
+ " backbone=backbone_cfg,\n",
+ " cls_head=head_cfg)\n",
+ "\n",
+ "register_all_modules()\n",
+ "model = MODELS.build(model_cfg)"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "outputs": [],
+ "source": [
+ "# set your paths of the pretrained weights here\n",
+ "rgb_filepath = 'https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230228-576b9f86.pth'\n",
+ "pose_filepath = 'https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230228-fa40054e.pth'"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230226-8bd9d8df.pth\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgb_only_20230226-8bd9d8df.pth\" to C:\\Users\\wxDai/.cache\\torch\\hub\\checkpoints\\rgb_only_20230226-8bd9d8df.pth\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loads checkpoint by http backend from path: https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230226-fa40054e.pth\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Downloading: \"https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/pose_only_20230226-fa40054e.pth\" to C:\\Users\\wxDai/.cache\\torch\\hub\\checkpoints\\pose_only_20230226-fa40054e.pth\n"
+ ]
+ }
+ ],
+ "source": [
+ "rgb_ckpt = _load_checkpoint(rgb_filepath, map_location='cpu')['state_dict']\n",
+ "pose_ckpt = _load_checkpoint(pose_filepath, map_location='cpu')['state_dict']"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "outputs": [],
+ "source": [
+ "rgb_ckpt = {k.replace('backbone', 'backbone.rgb_path').replace('fc_cls', 'fc_rgb'): v for k, v in rgb_ckpt.items()}\n",
+ "pose_ckpt = {k.replace('backbone', 'backbone.pose_path').replace('fc_cls', 'fc_pose'): v for k, v in pose_ckpt.items()}"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "outputs": [],
+ "source": [
+ "old_ckpt = {}\n",
+ "old_ckpt.update(rgb_ckpt)\n",
+ "old_ckpt.update(pose_ckpt)"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "outputs": [],
+ "source": [
+ "# The difference is in dim-1\n",
+ "def padding(weight, new_shape):\n",
+ " new_weight = weight.new_zeros(new_shape)\n",
+ " new_weight[:, :weight.shape[1]] = weight\n",
+ " return new_weight"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "outputs": [],
+ "source": [
+ "ckpt = cp.deepcopy(old_ckpt)\n",
+ "name = 'backbone.rgb_path.layer3.0.conv1.conv.weight'\n",
+ "ckpt[name] = padding(ckpt[name], (256, 640, 3, 1, 1))\n",
+ "name = 'backbone.rgb_path.layer3.0.downsample.conv.weight'\n",
+ "ckpt[name] = padding(ckpt[name], (1024, 640, 1, 1, 1))\n",
+ "name = 'backbone.rgb_path.layer4.0.conv1.conv.weight'\n",
+ "ckpt[name] = padding(ckpt[name], (512, 1280, 3, 1, 1))\n",
+ "name = 'backbone.rgb_path.layer4.0.downsample.conv.weight'\n",
+ "ckpt[name] = padding(ckpt[name], (2048, 1280, 1, 1, 1))\n",
+ "name = 'backbone.pose_path.layer2.0.conv1.conv.weight'\n",
+ "ckpt[name] = padding(ckpt[name], (64, 160, 3, 1, 1))\n",
+ "name = 'backbone.pose_path.layer2.0.downsample.conv.weight'\n",
+ "ckpt[name] = padding(ckpt[name], (256, 160, 1, 1, 1))\n",
+ "name = 'backbone.pose_path.layer3.0.conv1.conv.weight'\n",
+ "ckpt[name] = padding(ckpt[name], (128, 320, 3, 1, 1))\n",
+ "name = 'backbone.pose_path.layer3.0.downsample.conv.weight'\n",
+ "ckpt[name] = padding(ckpt[name], (512, 320, 1, 1, 1))\n",
+ "ckpt = OrderedDict(ckpt)\n",
+ "torch.save({'state_dict': ckpt}, 'rgbpose_conv3d_init.pth')"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "outputs": [
+ {
+ "data": {
+ "text/plain": "_IncompatibleKeys(missing_keys=['backbone.rgb_path.layer2_lateral.conv.weight', 'backbone.rgb_path.layer3_lateral.conv.weight', 'backbone.pose_path.layer1_lateral.conv.weight', 'backbone.pose_path.layer1_lateral.bn.weight', 'backbone.pose_path.layer1_lateral.bn.bias', 'backbone.pose_path.layer1_lateral.bn.running_mean', 'backbone.pose_path.layer1_lateral.bn.running_var', 'backbone.pose_path.layer2_lateral.conv.weight', 'backbone.pose_path.layer2_lateral.bn.weight', 'backbone.pose_path.layer2_lateral.bn.bias', 'backbone.pose_path.layer2_lateral.bn.running_mean', 'backbone.pose_path.layer2_lateral.bn.running_var'], unexpected_keys=[])"
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model.load_state_dict(ckpt, strict=False)"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ }
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py b/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py
new file mode 100644
index 0000000000..ad413da6a6
--- /dev/null
+++ b/configs/skeleton/posec3d/rgbpose_conv3d/pose_only.py
@@ -0,0 +1,127 @@
+_base_ = '../../../_base_/default_runtime.py'
+
+model = dict(
+ type='Recognizer3D',
+ backbone=dict(
+ type='ResNet3dSlowOnly',
+ in_channels=17,
+ base_channels=32,
+ num_stages=3,
+ out_indices=(2, ),
+ stage_blocks=(4, 6, 3),
+ conv1_stride_s=1,
+ pool1_stride_s=1,
+ inflate=(0, 1, 1),
+ spatial_strides=(2, 2, 2),
+ temporal_strides=(1, 1, 1),
+ dilations=(1, 1, 1)),
+ cls_head=dict(
+ type='I3DHead',
+ in_channels=512,
+ num_classes=60,
+ dropout_ratio=0.5,
+ average_clips='prob'))
+
+dataset_type = 'PoseDataset'
+ann_file = 'data/skeleton/ntu60_2d.pkl'
+left_kp = [1, 3, 5, 7, 9, 11, 13, 15]
+right_kp = [2, 4, 6, 8, 10, 12, 14, 16]
+train_pipeline = [
+ dict(type='UniformSampleFrames', clip_len=32),
+ dict(type='PoseDecode'),
+ dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True),
+ dict(type='Resize', scale=(64, 64), keep_ratio=False),
+ dict(type='RandomResizedCrop', area_range=(0.56, 1.0)),
+ dict(type='Resize', scale=(56, 56), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp),
+ dict(type='GeneratePoseTarget', with_kp=True, with_limb=False),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
+ dict(type='PackActionInputs')
+]
+val_pipeline = [
+ dict(type='UniformSampleFrames', clip_len=32, num_clips=1, test_mode=True),
+ dict(type='PoseDecode'),
+ dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True),
+ dict(type='Resize', scale=(64, 64), keep_ratio=False),
+ dict(type='GeneratePoseTarget', with_kp=True, with_limb=False),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
+ dict(type='PackActionInputs')
+]
+test_pipeline = [
+ dict(
+ type='UniformSampleFrames', clip_len=32, num_clips=10, test_mode=True),
+ dict(type='PoseDecode'),
+ dict(type='PoseCompact', hw_ratio=1., allow_imgpad=True),
+ dict(type='Resize', scale=(64, 64), keep_ratio=False),
+ dict(
+ type='GeneratePoseTarget',
+ with_kp=True,
+ with_limb=False,
+ left_kp=left_kp,
+ right_kp=right_kp),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='RepeatDataset',
+ times=10,
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ split='xsub_train',
+ pipeline=train_pipeline)))
+val_dataloader = dict(
+ batch_size=16,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ split='xsub_val',
+ pipeline=val_pipeline,
+ test_mode=True))
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ split='xsub_val',
+ pipeline=test_pipeline,
+ test_mode=True))
+
+val_evaluator = [dict(type='AccMetric')]
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=18, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0,
+ T_max=18,
+ by_epoch=True,
+ convert_to_iter_based=True)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003),
+ clip_grad=dict(max_norm=40, norm_type=2))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py b/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py
new file mode 100644
index 0000000000..331badaf8d
--- /dev/null
+++ b/configs/skeleton/posec3d/rgbpose_conv3d/rgb_only.py
@@ -0,0 +1,126 @@
+_base_ = '../../../_base_/default_runtime.py'
+
+model = dict(
+ type='Recognizer3D',
+ backbone=dict(
+ type='ResNet3dSlowOnly',
+ depth=50,
+ conv1_kernel=(1, 7, 7),
+ inflate=(0, 0, 1, 1)),
+ cls_head=dict(
+ type='I3DHead',
+ in_channels=2048,
+ num_classes=60,
+ dropout_ratio=0.5,
+ average_clips='prob'),
+ data_preprocessor=dict(
+ type='ActionDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'))
+
+dataset_type = 'PoseDataset'
+data_root = 'data/nturgbd_videos/'
+ann_file = 'data/skeleton/ntu60_2d.pkl'
+
+train_pipeline = [
+ dict(type='MMUniformSampleFrames', clip_len=dict(RGB=8), num_clips=1),
+ dict(type='MMDecode'),
+ dict(type='MMCompact', hw_ratio=1., allow_imgpad=True),
+ dict(type='Resize', scale=(256, 256), keep_ratio=False),
+ dict(type='RandomResizedCrop', area_range=(0.56, 1.0)),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+val_pipeline = [
+ dict(
+ type='MMUniformSampleFrames',
+ clip_len=dict(RGB=8),
+ num_clips=1,
+ test_mode=True),
+ dict(type='MMDecode'),
+ dict(type='MMCompact', hw_ratio=1., allow_imgpad=True),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+test_pipeline = [
+ dict(
+ type='MMUniformSampleFrames',
+ clip_len=dict(RGB=8),
+ num_clips=10,
+ test_mode=True),
+ dict(type='MMDecode'),
+ dict(type='MMCompact', hw_ratio=1., allow_imgpad=True),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=12,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='RepeatDataset',
+ times=10,
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ data_prefix=dict(video=data_root),
+ split='xsub_train',
+ pipeline=train_pipeline)))
+val_dataloader = dict(
+ batch_size=12,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ data_prefix=dict(video=data_root),
+ split='xsub_val',
+ pipeline=val_pipeline,
+ test_mode=True))
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ data_prefix=dict(video=data_root),
+ split='xsub_val',
+ pipeline=test_pipeline,
+ test_mode=True))
+
+val_evaluator = [dict(type='AccMetric')]
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=18, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0,
+ T_max=18,
+ by_epoch=True,
+ convert_to_iter_based=True)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.15, momentum=0.9, weight_decay=0.0001),
+ clip_grad=dict(max_norm=40, norm_type=2))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (12 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=96)
diff --git a/configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py b/configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py
new file mode 100644
index 0000000000..d303699f90
--- /dev/null
+++ b/configs/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d.py
@@ -0,0 +1,190 @@
+_base_ = '../../../_base_/default_runtime.py'
+
+# model_cfg
+backbone_cfg = dict(
+ type='RGBPoseConv3D',
+ speed_ratio=4,
+ channel_ratio=4,
+ rgb_pathway=dict(
+ num_stages=4,
+ lateral=True,
+ lateral_infl=1,
+ lateral_activate=[0, 0, 1, 1],
+ fusion_kernel=7,
+ base_channels=64,
+ conv1_kernel=(1, 7, 7),
+ inflate=(0, 0, 1, 1),
+ with_pool2=False),
+ pose_pathway=dict(
+ num_stages=3,
+ stage_blocks=(4, 6, 3),
+ lateral=True,
+ lateral_inv=True,
+ lateral_infl=16,
+ lateral_activate=(0, 1, 1),
+ fusion_kernel=7,
+ in_channels=17,
+ base_channels=32,
+ out_indices=(2, ),
+ conv1_kernel=(1, 7, 7),
+ conv1_stride_s=1,
+ conv1_stride_t=1,
+ pool1_stride_s=1,
+ pool1_stride_t=1,
+ inflate=(0, 1, 1),
+ spatial_strides=(2, 2, 2),
+ temporal_strides=(1, 1, 1),
+ dilations=(1, 1, 1),
+ with_pool2=False))
+head_cfg = dict(
+ type='RGBPoseHead',
+ num_classes=60,
+ in_channels=[2048, 512],
+ loss_components=['rgb', 'pose'],
+ loss_weights=[1., 1.],
+ average_clips='prob')
+data_preprocessor = dict(
+ type='MultiModalDataPreprocessor',
+ preprocessors=dict(
+ imgs=dict(
+ type='ActionDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ heatmap_imgs=dict(type='ActionDataPreprocessor')))
+model = dict(
+ type='MMRecognizer3D',
+ backbone=backbone_cfg,
+ cls_head=head_cfg,
+ data_preprocessor=data_preprocessor)
+
+dataset_type = 'PoseDataset'
+data_root = 'data/nturgbd_videos/'
+ann_file = 'data/skeleton/ntu60_2d.pkl'
+left_kp = [1, 3, 5, 7, 9, 11, 13, 15]
+right_kp = [2, 4, 6, 8, 10, 12, 14, 16]
+train_pipeline = [
+ dict(
+ type='MMUniformSampleFrames',
+ clip_len=dict(RGB=8, Pose=32),
+ num_clips=1),
+ dict(type='MMDecode'),
+ dict(type='MMCompact', hw_ratio=1., allow_imgpad=True),
+ dict(type='Resize', scale=(256, 256), keep_ratio=False),
+ dict(type='RandomResizedCrop', area_range=(0.56, 1.0)),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp),
+ dict(
+ type='GeneratePoseTarget',
+ sigma=0.7,
+ use_score=True,
+ with_kp=True,
+ with_limb=False,
+ scaling=0.25),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs', collect_keys=('imgs', 'heatmap_imgs'))
+]
+val_pipeline = [
+ dict(
+ type='MMUniformSampleFrames',
+ clip_len=dict(RGB=8, Pose=32),
+ num_clips=1,
+ test_mode=True),
+ dict(type='MMDecode'),
+ dict(type='MMCompact', hw_ratio=1., allow_imgpad=True),
+ dict(type='Resize', scale=(256, 256), keep_ratio=False),
+ dict(
+ type='GeneratePoseTarget',
+ sigma=0.7,
+ use_score=True,
+ with_kp=True,
+ with_limb=False,
+ scaling=0.25),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs', collect_keys=('imgs', 'heatmap_imgs'))
+]
+test_pipeline = [
+ dict(
+ type='MMUniformSampleFrames',
+ clip_len=dict(RGB=8, Pose=32),
+ num_clips=10,
+ test_mode=True),
+ dict(type='MMDecode'),
+ dict(type='MMCompact', hw_ratio=1., allow_imgpad=True),
+ dict(type='Resize', scale=(256, 256), keep_ratio=False),
+ dict(
+ type='GeneratePoseTarget',
+ sigma=0.7,
+ use_score=True,
+ with_kp=True,
+ with_limb=False,
+ scaling=0.25),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs', collect_keys=('imgs', 'heatmap_imgs'))
+]
+
+train_dataloader = dict(
+ batch_size=6,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ data_prefix=dict(video=data_root),
+ split='xsub_train',
+ pipeline=train_pipeline))
+val_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ split='xsub_val',
+ data_prefix=dict(video=data_root),
+ pipeline=val_pipeline,
+ test_mode=True))
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=8,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ split='xsub_val',
+ data_prefix=dict(video=data_root),
+ pipeline=test_pipeline,
+ test_mode=True))
+
+val_evaluator = [dict(type='AccMetric')]
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.0075, momentum=0.9, weight_decay=0.0001),
+ clip_grad=dict(max_norm=40, norm_type=2))
+
+param_scheduler = [
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=20,
+ by_epoch=True,
+ milestones=[12, 16],
+ gamma=0.1)
+]
+
+load_from = 'https://download.openmmlab.com/mmaction/v1.0/skeleton/posec3d/rgbpose_conv3d/rgbpose_conv3d_init_20230228-09b7684b.pth' # noqa: E501
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (6 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=48)
diff --git a/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py b/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py
index 123db1ee1f..e213e3319c 100644
--- a/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py
+++ b/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_hmdb51-split1-keypoint.py
@@ -28,7 +28,7 @@
test_cfg=None)
dataset_type = 'PoseDataset'
-ann_file = 'data/posec3d/hmdb51.pkl'
+ann_file = 'data/skeleton/hmdb51_2d.pkl'
left_kp = [1, 3, 5, 7, 9, 11, 13, 15]
right_kp = [2, 4, 6, 8, 10, 12, 14, 16]
train_pipeline = [
@@ -45,7 +45,7 @@
use_score=True,
with_kp=True,
with_limb=False),
- dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
dict(type='PackActionInputs')
]
val_pipeline = [
@@ -60,7 +60,7 @@
use_score=True,
with_kp=True,
with_limb=False),
- dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
dict(type='PackActionInputs')
]
test_pipeline = [
@@ -79,7 +79,7 @@
double=True,
left_kp=left_kp,
right_kp=right_kp),
- dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
dict(type='PackActionInputs')
]
diff --git a/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py b/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py
index 547f57c052..c100754fa5 100644
--- a/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py
+++ b/configs/skeleton/posec3d/slowonly_kinetics400-pretrained-r50_8xb16-u48-120e_ucf101-split1-keypoint.py
@@ -28,7 +28,7 @@
test_cfg=None)
dataset_type = 'PoseDataset'
-ann_file = 'data/posec3d/ucf101.pkl'
+ann_file = 'data/skeleton/ucf101_2d.pkl'
left_kp = [1, 3, 5, 7, 9, 11, 13, 15]
right_kp = [2, 4, 6, 8, 10, 12, 14, 16]
train_pipeline = [
@@ -45,7 +45,7 @@
use_score=True,
with_kp=True,
with_limb=False),
- dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
dict(type='PackActionInputs')
]
val_pipeline = [
@@ -60,7 +60,7 @@
use_score=True,
with_kp=True,
with_limb=False),
- dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
dict(type='PackActionInputs')
]
test_pipeline = [
@@ -79,7 +79,7 @@
double=True,
left_kp=left_kp,
right_kp=right_kp),
- dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
dict(type='PackActionInputs')
]
diff --git a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py
index c893f69df3..8517870d1c 100644
--- a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py
+++ b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-keypoint.py
@@ -23,13 +23,10 @@
num_classes=99,
spatial_type='avg',
dropout_ratio=0.5,
- average_clips='prob'),
- train_cfg=None,
- test_cfg=None)
+ average_clips='prob'))
dataset_type = 'PoseDataset'
-ann_file_train = 'data/posec3d/gym_train.pkl'
-ann_file_val = 'data/posec3d/gym_val.pkl'
+ann_file = 'data/skeleton/gym_2d.pkl'
left_kp = [1, 3, 5, 7, 9, 11, 13, 15]
right_kp = [2, 4, 6, 8, 10, 12, 14, 16]
train_pipeline = [
@@ -46,7 +43,7 @@
use_score=True,
with_kp=True,
with_limb=False),
- dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
dict(type='PackActionInputs')
]
val_pipeline = [
@@ -61,7 +58,7 @@
use_score=True,
with_kp=True,
with_limb=False),
- dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
dict(type='PackActionInputs')
]
test_pipeline = [
@@ -80,7 +77,7 @@
double=True,
left_kp=left_kp,
right_kp=right_kp),
- dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
dict(type='PackActionInputs')
]
@@ -90,7 +87,13 @@
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
- type=dataset_type, ann_file=ann_file_train, pipeline=train_pipeline))
+ type='RepeatDataset',
+ times=10,
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ split='train',
+ pipeline=train_pipeline)))
val_dataloader = dict(
batch_size=16,
num_workers=8,
@@ -98,7 +101,8 @@
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
- ann_file=ann_file_val,
+ ann_file=ann_file,
+ split='val',
pipeline=val_pipeline,
test_mode=True))
test_dataloader = dict(
@@ -108,7 +112,8 @@
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
- ann_file=ann_file_val,
+ ann_file=ann_file,
+ split='val',
pipeline=test_pipeline,
test_mode=True))
@@ -116,7 +121,7 @@
test_evaluator = val_evaluator
train_cfg = dict(
- type='EpochBasedTrainLoop', max_epochs=240, val_begin=1, val_interval=10)
+ type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
@@ -124,7 +129,7 @@
dict(
type='CosineAnnealingLR',
eta_min=0,
- T_max=240,
+ T_max=24,
by_epoch=True,
convert_to_iter_based=True)
]
@@ -132,5 +137,3 @@
optim_wrapper = dict(
optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003),
clip_grad=dict(max_norm=40, norm_type=2))
-
-default_hooks = dict(checkpoint=dict(interval=10, max_keep_ckpts=3))
diff --git a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py
index 34764a726e..0ab9263951 100644
--- a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py
+++ b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_gym-limb.py
@@ -23,18 +23,17 @@
num_classes=99,
spatial_type='avg',
dropout_ratio=0.5,
- average_clips='prob'),
- train_cfg=None,
- test_cfg=None)
+ average_clips='prob'))
dataset_type = 'PoseDataset'
-ann_file_train = 'data/posec3d/gym_train.pkl'
-ann_file_val = 'data/posec3d/gym_val.pkl'
+ann_file = 'data/skeleton/gym_2d.pkl'
left_kp = [1, 3, 5, 7, 9, 11, 13, 15]
right_kp = [2, 4, 6, 8, 10, 12, 14, 16]
skeletons = [[0, 5], [0, 6], [5, 7], [7, 9], [6, 8], [8, 10], [5, 11],
[11, 13], [13, 15], [6, 12], [12, 14], [14, 16], [0, 1], [0, 2],
[1, 3], [2, 4], [11, 12]]
+left_limb = [0, 2, 3, 6, 7, 8, 12, 14]
+right_limb = [1, 4, 5, 9, 10, 11, 13, 15]
train_pipeline = [
dict(type='UniformSampleFrames', clip_len=48),
dict(type='PoseDecode'),
@@ -50,7 +49,7 @@
with_kp=False,
with_limb=True,
skeletons=skeletons),
- dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
dict(type='PackActionInputs')
]
val_pipeline = [
@@ -66,7 +65,7 @@
with_kp=False,
with_limb=True,
skeletons=skeletons),
- dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
dict(type='PackActionInputs')
]
test_pipeline = [
@@ -85,8 +84,10 @@
skeletons=skeletons,
double=True,
left_kp=left_kp,
- right_kp=right_kp),
- dict(type='FormatShape', input_format='NCTHW'),
+ right_kp=right_kp,
+ left_limb=left_limb,
+ right_limb=right_limb),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
dict(type='PackActionInputs')
]
@@ -96,7 +97,13 @@
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
- type=dataset_type, ann_file=ann_file_train, pipeline=train_pipeline))
+ type='RepeatDataset',
+ times=10,
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ split='train',
+            pipeline=train_pipeline)))
val_dataloader = dict(
batch_size=16,
num_workers=8,
@@ -104,7 +111,8 @@
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
- ann_file=ann_file_val,
+ ann_file=ann_file,
+ split='val',
pipeline=val_pipeline,
test_mode=True))
test_dataloader = dict(
@@ -114,7 +122,8 @@
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
- ann_file=ann_file_val,
+ ann_file=ann_file,
+ split='val',
pipeline=test_pipeline,
test_mode=True))
@@ -122,7 +131,7 @@
test_evaluator = val_evaluator
train_cfg = dict(
- type='EpochBasedTrainLoop', max_epochs=240, val_begin=1, val_interval=10)
+ type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
@@ -130,7 +139,7 @@
dict(
type='CosineAnnealingLR',
eta_min=0,
- T_max=240,
+ T_max=24,
by_epoch=True,
convert_to_iter_based=True)
]
@@ -138,5 +147,3 @@
optim_wrapper = dict(
optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003),
clip_grad=dict(max_norm=40, norm_type=2))
-
-default_hooks = dict(checkpoint=dict(interval=10, max_keep_ckpts=3))
diff --git a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py
index 2194139f5e..c4915d4d2e 100644
--- a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py
+++ b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint.py
@@ -21,15 +21,11 @@
type='I3DHead',
in_channels=512,
num_classes=60,
- spatial_type='avg',
dropout_ratio=0.5,
- average_clips='prob'),
- train_cfg=None,
- test_cfg=None)
+ average_clips='prob'))
dataset_type = 'PoseDataset'
-ann_file_train = 'data/posec3d/ntu60_xsub_train.pkl'
-ann_file_val = 'data/posec3d/ntu60_xsub_val.pkl'
+ann_file = 'data/skeleton/ntu60_2d.pkl'
left_kp = [1, 3, 5, 7, 9, 11, 13, 15]
right_kp = [2, 4, 6, 8, 10, 12, 14, 16]
train_pipeline = [
@@ -46,7 +42,7 @@
use_score=True,
with_kp=True,
with_limb=False),
- dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
dict(type='PackActionInputs')
]
val_pipeline = [
@@ -61,7 +57,7 @@
use_score=True,
with_kp=True,
with_limb=False),
- dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
dict(type='PackActionInputs')
]
test_pipeline = [
@@ -80,7 +76,7 @@
double=True,
left_kp=left_kp,
right_kp=right_kp),
- dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
dict(type='PackActionInputs')
]
@@ -90,7 +86,13 @@
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
- type=dataset_type, ann_file=ann_file_train, pipeline=train_pipeline))
+ type='RepeatDataset',
+ times=10,
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ split='xsub_train',
+ pipeline=train_pipeline)))
val_dataloader = dict(
batch_size=16,
num_workers=8,
@@ -98,7 +100,8 @@
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
- ann_file=ann_file_val,
+ ann_file=ann_file,
+ split='xsub_val',
pipeline=val_pipeline,
test_mode=True))
test_dataloader = dict(
@@ -108,15 +111,16 @@
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
- ann_file=ann_file_val,
+ ann_file=ann_file,
+ split='xsub_val',
pipeline=test_pipeline,
test_mode=True))
-val_evaluator = dict(type='AccMetric')
+val_evaluator = [dict(type='AccMetric')]
test_evaluator = val_evaluator
train_cfg = dict(
- type='EpochBasedTrainLoop', max_epochs=240, val_begin=1, val_interval=10)
+ type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
@@ -124,7 +128,7 @@
dict(
type='CosineAnnealingLR',
eta_min=0,
- T_max=240,
+ T_max=24,
by_epoch=True,
convert_to_iter_based=True)
]
@@ -132,5 +136,3 @@
optim_wrapper = dict(
optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003),
clip_grad=dict(max_norm=40, norm_type=2))
-
-default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
diff --git a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.py b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.py
index 7eca1463ee..0f4f11f3a0 100644
--- a/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.py
+++ b/configs/skeleton/posec3d/slowonly_r50_8xb16-u48-240e_ntu60-xsub-limb.py
@@ -21,20 +21,18 @@
type='I3DHead',
in_channels=512,
num_classes=60,
- spatial_type='avg',
dropout_ratio=0.5,
- average_clips='prob'),
- train_cfg=None,
- test_cfg=None)
+ average_clips='prob'))
dataset_type = 'PoseDataset'
-ann_file_train = 'data/posec3d/ntu60_xsub_train.pkl'
-ann_file_val = 'data/posec3d/ntu60_xsub_val.pkl'
+ann_file = 'data/skeleton/ntu60_2d.pkl'
left_kp = [1, 3, 5, 7, 9, 11, 13, 15]
right_kp = [2, 4, 6, 8, 10, 12, 14, 16]
skeletons = [[0, 5], [0, 6], [5, 7], [7, 9], [6, 8], [8, 10], [5, 11],
[11, 13], [13, 15], [6, 12], [12, 14], [14, 16], [0, 1], [0, 2],
[1, 3], [2, 4], [11, 12]]
+left_limb = [0, 2, 3, 6, 7, 8, 12, 14]
+right_limb = [1, 4, 5, 9, 10, 11, 13, 15]
train_pipeline = [
dict(type='UniformSampleFrames', clip_len=48),
dict(type='PoseDecode'),
@@ -50,7 +48,7 @@
with_kp=False,
with_limb=True,
skeletons=skeletons),
- dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
dict(type='PackActionInputs')
]
val_pipeline = [
@@ -66,7 +64,7 @@
with_kp=False,
with_limb=True,
skeletons=skeletons),
- dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
dict(type='PackActionInputs')
]
test_pipeline = [
@@ -84,9 +82,9 @@
with_limb=True,
skeletons=skeletons,
double=True,
- left_kp=left_kp,
- right_kp=right_kp),
- dict(type='FormatShape', input_format='NCTHW'),
+ left_limb=left_limb,
+ right_limb=right_limb),
+ dict(type='FormatShape', input_format='NCTHW_Heatmap'),
dict(type='PackActionInputs')
]
@@ -96,7 +94,13 @@
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
- type=dataset_type, ann_file=ann_file_train, pipeline=train_pipeline))
+ type='RepeatDataset',
+ times=10,
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ split='xsub_train',
+ pipeline=train_pipeline)))
val_dataloader = dict(
batch_size=16,
num_workers=8,
@@ -104,7 +108,8 @@
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
- ann_file=ann_file_val,
+ ann_file=ann_file,
+ split='xsub_val',
pipeline=val_pipeline,
test_mode=True))
test_dataloader = dict(
@@ -114,15 +119,16 @@
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
- ann_file=ann_file_val,
+ ann_file=ann_file,
+ split='xsub_val',
pipeline=test_pipeline,
test_mode=True))
-val_evaluator = dict(type='AccMetric')
+val_evaluator = [dict(type='AccMetric')]
test_evaluator = val_evaluator
train_cfg = dict(
- type='EpochBasedTrainLoop', max_epochs=240, val_begin=1, val_interval=10)
+ type='EpochBasedTrainLoop', max_epochs=24, val_begin=1, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
@@ -130,7 +136,7 @@
dict(
type='CosineAnnealingLR',
eta_min=0,
- T_max=240,
+ T_max=24,
by_epoch=True,
convert_to_iter_based=True)
]
@@ -138,5 +144,3 @@
optim_wrapper = dict(
optimizer=dict(type='SGD', lr=0.2, momentum=0.9, weight_decay=0.0003),
clip_grad=dict(max_norm=40, norm_type=2))
-
-default_hooks = dict(checkpoint=dict(max_keep_ckpts=3))
diff --git a/configs/skeleton/stgcn/README.md b/configs/skeleton/stgcn/README.md
index dee9f46dfb..c8d23a1a05 100644
--- a/configs/skeleton/stgcn/README.md
+++ b/configs/skeleton/stgcn/README.md
@@ -63,7 +63,7 @@ Dynamics of human body skeletons convey significant information for human action
| | four-stream | | | 86.19 | | | | | | |
1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size, and the original batch size.
-2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/user_guides/useful_tools.md#multi-stream-fusion).
+2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/advanced_guides/useful_tools.md#multi-stream-fusion).
## Train
@@ -80,7 +80,7 @@ python tools/train.py configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xs
--seed 0 --deterministic
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -97,7 +97,7 @@ python tools/test.py configs/skeleton/stgcn/stgcn_8xb16-joint-u100-80e_ntu60-xsu
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/configs/skeleton/stgcnpp/README.md b/configs/skeleton/stgcnpp/README.md
index 655b067a60..3eec28036c 100644
--- a/configs/skeleton/stgcnpp/README.md
+++ b/configs/skeleton/stgcnpp/README.md
@@ -35,7 +35,7 @@ We present PYSKL: an open-source toolbox for skeleton-based action recognition b
| | four-stream | | | 91.87 | | | | | | |
1. The **gpus** indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`, this parameter will auto-scale the learning rate according to the actual batch size, and the original batch size.
-2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/user_guides/useful_tools.md#multi-stream-fusion).
+2. For two-stream fusion, we use **joint : bone = 1 : 1**. For four-stream fusion, we use **joint : joint-motion : bone : bone-motion = 2 : 1 : 2 : 1**. For more details about multi-stream fusion, please refer to this [tutorial](/docs/en/advanced_guides/useful_tools.md#multi-stream-fusion).
## Train
@@ -52,7 +52,7 @@ python tools/train.py configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu6
--seed 0 --deterministic
```
-For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Test
@@ -69,7 +69,7 @@ python tools/test.py configs/skeleton/stgcnpp/stgcnpp_8xb16-joint-u100-80e_ntu60
checkpoints/SOME_CHECKPOINT.pth --dump result.pkl
```
-For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).
+For more details, you can refer to the **Test** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
## Citation
diff --git a/demo/README.md b/demo/README.md
index f3f4ba1db9..447789d37d 100644
--- a/demo/README.md
+++ b/demo/README.md
@@ -8,6 +8,7 @@
- [Webcam demo](#webcam-demo): A demo script to implement real-time action recognition from a web camera.
- [Skeleton-based Action Recognition Demo](#skeleton-based-action-recognition-demo): A demo script to predict the skeleton-based action recognition result using a single video.
- [SpatioTemporal Action Detection Video Demo](#spatiotemporal-action-detection-video-demo): A demo script to predict the spatiotemporal action detection result using a single video.
+- [SpatioTemporal Action Detection ONNX Video Demo](#spatiotemporal-action-detection-onnx-video-demo): A demo script to predict the spatiotemporal action detection result using an ONNX file instead of building the PyTorch model.
- [Inferencer Demo](#inferencer): A demo script to implement fast predict for video analysis tasks based on unified inferencer interface.
## Modify configs through script arguments
@@ -309,6 +310,75 @@ python demo/demo_spatiotemporal_det.py demo/demo.mp4 demo/demo_spatiotemporal_de
--output-fps 6
```
+## SpatioTemporal Action Detection ONNX Video Demo
+
+MMAction2 provides a demo script to predict the spatiotemporal action detection result using an ONNX file instead of building the PyTorch model.
+
+```shell
+python demo/demo_spatiotemporal_det_onnx.py --video ${VIDEO_FILE} \
+ [--out-filename ${OUTPUT_FILENAME}] \
+ [--config ${SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE}] \
+ [--onnx-file ${SPATIOTEMPORAL_ACTION_DETECTION_ONNX_FILE}] \
+ [--det-config ${HUMAN_DETECTION_CONFIG_FILE}] \
+ [--det-checkpoint ${HUMAN_DETECTION_CHECKPOINT}] \
+ [--det-score-thr ${HUMAN_DETECTION_SCORE_THRESHOLD}] \
+ [--det-cat-id ${HUMAN_DETECTION_CATEGORY_ID}] \
+ [--action-score-thr ${ACTION_DETECTION_SCORE_THRESHOLD}] \
+ [--label-map ${LABEL_MAP}] \
+ [--device ${DEVICE}] \
+    [--short-side ${SHORT_SIDE}] \
+ [--predict-stepsize ${PREDICT_STEPSIZE}] \
+ [--output-stepsize ${OUTPUT_STEPSIZE}] \
+ [--output-fps ${OUTPUT_FPS}]
+```
+
+Optional arguments:
+
+- `OUTPUT_FILENAME`: Path to the output file which is a video format. Defaults to `demo/stdet_demo.mp4`.
+- `SPATIOTEMPORAL_ACTION_DETECTION_CONFIG_FILE`: The spatiotemporal action detection config file path.
+- `SPATIOTEMPORAL_ACTION_DETECTION_ONNX_FILE`: The spatiotemporal action detection onnx file.
+- `HUMAN_DETECTION_CONFIG_FILE`: The human detection config file path.
+- `HUMAN_DETECTION_CHECKPOINT`: The human detection checkpoint URL.
+- `HUMAN_DETECTION_SCORE_THRESHOLD`: The score threshold for human detection. Defaults to 0.9.
+- `HUMAN_DETECTION_CATEGORY_ID`: The category id for human detection. Defaults to 0.
+- `ACTION_DETECTION_SCORE_THRESHOLD`: The score threshold for action detection. Defaults to 0.5.
+- `LABEL_MAP`: The label map used. Defaults to `tools/data/ava/label_map.txt`.
+- `DEVICE`: Type of device to run the demo. Allowed values are cuda device like `cuda:0` or `cpu`. Defaults to `cuda:0`.
+- `SHORT_SIDE`: The short side used for frame extraction. Defaults to 256.
+- `PREDICT_STEPSIZE`: Make a prediction per N frames. Defaults to 8.
+- `OUTPUT_STEPSIZE`: Output 1 frame per N frames in the input video. Note that `PREDICT_STEPSIZE % OUTPUT_STEPSIZE == 0`. Defaults to 4.
+- `OUTPUT_FPS`: The FPS of demo video output. Defaults to 6.
+
+Examples:
+
+Assume that you are located at `$MMACTION2`.
+
+1. Export an onnx file given the config file and checkpoint.
+
+```shell
+python3 tools/deployment/export_onnx_stdet.py \
+ configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \
+ https://download.openmmlab.com/mmaction/detection/ava/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb/slowonly_omnisource_pretrained_r101_8x8x1_20e_ava_rgb_20201217-16378594.pth \
+ --output_file slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.onnx \
+ --num_frames 8
+```
+
+2. Use the Faster RCNN as the human detector and the generated `slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.onnx` file as the action detector. Make a prediction every 8 frames and output 1 frame per 4 frames to the output video. The FPS of the output video is 6.
+
+```shell
+python demo/demo_spatiotemporal_det_onnx.py demo/demo.mp4 demo/demo_spatiotemporal_det.mp4 \
+ --config configs/detection/ava/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py \
+ --onnx-file slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.onnx \
+ --det-config demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py \
+ --det-checkpoint http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_2x_coco/faster_rcnn_r50_fpn_2x_coco_bbox_mAP-0.384_20200504_210434-a5d8aa15.pth \
+ --det-score-thr 0.9 \
+ --action-score-thr 0.5 \
+ --label-map tools/data/ava/label_map.txt \
+ --predict-stepsize 8 \
+ --output-stepsize 4 \
+ --output-fps 6
+```
+
## Inferencer
MMAction2 provides a demo script to implement fast prediction for video analysis tasks based on unified inferencer interface, currently only supports action recognition task.
diff --git a/demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py b/demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py
new file mode 100644
index 0000000000..934a3a5bc4
--- /dev/null
+++ b/demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py
@@ -0,0 +1,140 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+model = dict(
+ type='FasterRCNN',
+ _scope_='mmdet',
+ data_preprocessor=dict(
+ type='DetDataPreprocessor',
+ mean=[103.53, 116.28, 123.675],
+ std=[1.0, 1.0, 1.0],
+ bgr_to_rgb=False,
+ pad_size_divisor=32),
+ backbone=dict(
+ type='ResNet',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ frozen_stages=1,
+ norm_cfg=dict(type='BN', requires_grad=False),
+ norm_eval=True,
+ style='caffe',
+ init_cfg=dict(
+ type='Pretrained',
+ checkpoint='open-mmlab://detectron2/resnet50_caffe')),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=5),
+ rpn_head=dict(
+ type='RPNHead',
+ in_channels=256,
+ feat_channels=256,
+ anchor_generator=dict(
+ type='AnchorGenerator',
+ scales=[8],
+ ratios=[0.5, 1.0, 2.0],
+ strides=[4, 8, 16, 32, 64]),
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0.0, 0.0, 0.0, 0.0],
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+ roi_head=dict(
+ type='StandardRoIHead',
+ bbox_roi_extractor=dict(
+ type='SingleRoIExtractor',
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+ out_channels=256,
+ featmap_strides=[4, 8, 16, 32]),
+ bbox_head=dict(
+ type='Shared2FCBBoxHead',
+ in_channels=256,
+ fc_out_channels=1024,
+ roi_feat_size=7,
+ num_classes=1,
+ bbox_coder=dict(
+ type='DeltaXYWHBBoxCoder',
+ target_means=[0.0, 0.0, 0.0, 0.0],
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
+ reg_class_agnostic=False,
+ loss_cls=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+ train_cfg=dict(
+ rpn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.7,
+ neg_iou_thr=0.3,
+ min_pos_iou=0.3,
+ match_low_quality=True,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=256,
+ pos_fraction=0.5,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=False),
+ allowed_border=-1,
+ pos_weight=-1,
+ debug=False),
+ rpn_proposal=dict(
+ nms_pre=2000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(
+ assigner=dict(
+ type='MaxIoUAssigner',
+ pos_iou_thr=0.5,
+ neg_iou_thr=0.5,
+ min_pos_iou=0.5,
+ match_low_quality=False,
+ ignore_iof_thr=-1),
+ sampler=dict(
+ type='RandomSampler',
+ num=512,
+ pos_fraction=0.25,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True),
+ pos_weight=-1,
+ debug=False)),
+ test_cfg=dict(
+ rpn=dict(
+ nms_pre=1000,
+ max_per_img=1000,
+ nms=dict(type='nms', iou_threshold=0.7),
+ min_bbox_size=0),
+ rcnn=dict(
+ score_thr=0.05,
+ nms=dict(type='nms', iou_threshold=0.5),
+ max_per_img=100)))
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+file_client_args = dict(backend='disk')
+
+test_pipeline = [
+ dict(type='mmdet.LoadImageFromFile', file_client_args=file_client_args),
+ dict(type='mmdet.Resize', scale=(1333, 800), keep_ratio=True),
+ dict(
+ type='mmdet.PackDetInputs',
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+ 'scale_factor'))
+]
+
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ drop_last=False,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type='CocoDataset',
+ data_root='data/coco/',
+ ann_file='annotations/instances_val2017.json',
+ data_prefix=dict(img='val2017/'),
+ test_mode=True,
+ pipeline=test_pipeline,
+ metainfo=dict(classes=('person', ), palette=[(220, 20, 60)])))
diff --git a/demo/demo_skeleton.py b/demo/demo_skeleton.py
index 57c84c90a3..3dc1fb215a 100644
--- a/demo/demo_skeleton.py
+++ b/demo/demo_skeleton.py
@@ -1,7 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
-import os.path as osp
-import shutil
+import tempfile
import cv2
import mmcv
@@ -128,7 +127,10 @@ def visualize(args, frames, data_samples, action_label):
def main():
args = parse_args()
- frame_paths, frames = frame_extract(args.video, args.short_side)
+
+ tmp_dir = tempfile.TemporaryDirectory()
+ frame_paths, frames = frame_extract(args.video, args.short_side,
+ tmp_dir.name)
num_frame = len(frame_paths)
h, w, _ = frames[0].shape
@@ -180,8 +182,7 @@ def main():
visualize(args, frames, pose_data_samples, action_label)
- tmp_frame_dir = osp.dirname(frame_paths[0])
- shutil.rmtree(tmp_frame_dir)
+ tmp_dir.cleanup()
if __name__ == '__main__':
diff --git a/demo/demo_spatiotemporal_det.py b/demo/demo_spatiotemporal_det.py
index 009a9475a6..0c5091dab2 100644
--- a/demo/demo_spatiotemporal_det.py
+++ b/demo/demo_spatiotemporal_det.py
@@ -1,9 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import copy as cp
-import os
-import os.path as osp
-import shutil
+import tempfile
import cv2
import mmcv
@@ -17,6 +15,7 @@
from mmaction.apis import detection_inference
from mmaction.registry import MODELS
from mmaction.structures import ActionDataSample
+from mmaction.utils import frame_extract
try:
import moviepy.editor as mpy
@@ -101,32 +100,6 @@ def visualize(frames, annotations, plate=plate_blue, max_num=5):
return frames_out
-def frame_extraction(video_path):
- """Extract frames given video_path.
-
- Args:
- video_path (str): The video_path.
- """
- # Load the video, extract frames into ./tmp/video_name
- target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0]))
- os.makedirs(target_dir, exist_ok=True)
- # Should be able to handle videos up to several hours
- frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg')
- vid = cv2.VideoCapture(video_path)
- frames = []
- frame_paths = []
- flag, frame = vid.read()
- cnt = 0
- while flag:
- frames.append(frame)
- frame_path = frame_tmpl.format(cnt + 1)
- frame_paths.append(frame_path)
- cv2.imwrite(frame_path, frame)
- cnt += 1
- flag, frame = vid.read()
- return frame_paths, frames
-
-
def load_label_map(file_path):
"""Load Label Map.
@@ -259,7 +232,9 @@ def parse_args():
def main():
args = parse_args()
- frame_paths, original_frames = frame_extraction(args.video)
+ tmp_dir = tempfile.TemporaryDirectory()
+ frame_paths, original_frames = frame_extract(
+ args.video, out_dir=tmp_dir.name)
num_frame = len(frame_paths)
h, w, _ = original_frames[0].shape
@@ -378,7 +353,7 @@ def dense_timestamps(timestamps, n):
start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
new_frame_inds = np.arange(
len(timestamps) * n) * old_frame_interval / n + start
- return new_frame_inds.astype(np.int)
+ return new_frame_inds.astype(np.int64)
dense_n = int(args.predict_stepsize / args.output_stepsize)
frames = [
@@ -391,8 +366,7 @@ def dense_timestamps(timestamps, n):
fps=args.output_fps)
vid.write_videofile(args.out_filename)
- tmp_frame_dir = osp.dirname(frame_paths[0])
- shutil.rmtree(tmp_frame_dir)
+ tmp_dir.cleanup()
if __name__ == '__main__':
diff --git a/demo/demo_spatiotemporal_det_onnx.py b/demo/demo_spatiotemporal_det_onnx.py
new file mode 100644
index 0000000000..7c40e9c64e
--- /dev/null
+++ b/demo/demo_spatiotemporal_det_onnx.py
@@ -0,0 +1,356 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import copy as cp
+import tempfile
+
+import cv2
+import mmcv
+import mmengine
+import numpy as np
+import onnxruntime
+import torch
+from mmdet.structures.bbox import bbox2roi
+from mmengine import DictAction
+
+from mmaction.apis import detection_inference
+from mmaction.utils import frame_extract
+
+try:
+ import moviepy.editor as mpy
+except ImportError:
+ raise ImportError('Please install moviepy to enable output file')
+
+FONTFACE = cv2.FONT_HERSHEY_DUPLEX
+FONTSCALE = 0.5
+FONTCOLOR = (255, 255, 255) # BGR, white
+MSGCOLOR = (128, 128, 128) # BGR, gray
+THICKNESS = 1
+LINETYPE = 1
+
+
+def hex2color(h):
+ """Convert the 6-digit hex string to tuple of 3 int value (RGB)"""
+ return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16))
+
+
+plate_blue = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'
+plate_blue = plate_blue.split('-')
+plate_blue = [hex2color(h) for h in plate_blue]
+plate_green = '004b23-006400-007200-008000-38b000-70e000'
+plate_green = plate_green.split('-')
+plate_green = [hex2color(h) for h in plate_green]
+
+
+def visualize(frames, annotations, plate=plate_blue, max_num=5):
+ """Visualize frames with predicted annotations.
+
+ Args:
+ frames (list[np.ndarray]): Frames for visualization, note that
+ len(frames) % len(annotations) should be 0.
+ annotations (list[list[tuple]]): The predicted results.
+ plate (str): The plate used for visualization. Default: plate_blue.
+ max_num (int): Max number of labels to visualize for a person box.
+ Default: 5.
+ Returns:
+ list[np.ndarray]: Visualized frames.
+ """
+
+ assert max_num + 1 <= len(plate)
+ plate = [x[::-1] for x in plate]
+ frames_out = cp.deepcopy(frames)
+ nf, na = len(frames), len(annotations)
+ assert nf % na == 0
+ nfpa = len(frames) // len(annotations)
+ anno = None
+ h, w, _ = frames[0].shape
+ scale_ratio = np.array([w, h, w, h])
+ for i in range(na):
+ anno = annotations[i]
+ if anno is None:
+ continue
+ for j in range(nfpa):
+ ind = i * nfpa + j
+ frame = frames_out[ind]
+ for ann in anno:
+ box = ann[0]
+ label = ann[1]
+ if not len(label):
+ continue
+ score = ann[2]
+ box = (box * scale_ratio).astype(np.int64)
+ st, ed = tuple(box[:2]), tuple(box[2:])
+ cv2.rectangle(frame, st, ed, plate[0], 2)
+ for k, lb in enumerate(label):
+ if k >= max_num:
+ break
+ text = abbrev(lb)
+ text = ': '.join([text, str(score[k])])
+ location = (0 + st[0], 18 + k * 18 + st[1])
+ textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE,
+ THICKNESS)[0]
+ textwidth = textsize[0]
+ diag0 = (location[0] + textwidth, location[1] - 14)
+ diag1 = (location[0], location[1] + 2)
+ cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1)
+ cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
+ FONTCOLOR, THICKNESS, LINETYPE)
+
+ return frames_out
+
+
+def load_label_map(file_path):
+ """Load Label Map.
+
+ Args:
+ file_path (str): The file path of label map.
+ Returns:
+ dict: The label map (int -> label name).
+ """
+ lines = open(file_path).readlines()
+ lines = [x.strip().split(': ') for x in lines]
+ return {int(x[0]): x[1] for x in lines}
+
+
+def abbrev(name):
+ """Get the abbreviation of label name:
+
+ 'take (an object) from (a person)' -> 'take ... from ...'
+ """
+ while name.find('(') != -1:
+ st, ed = name.find('('), name.find(')')
+ name = name[:st] + '...' + name[ed + 1:]
+ return name
+
+
+def pack_result(human_detection, result, img_h, img_w):
+    """Pack the human detection results and action predictions.
+
+ Args:
+ human_detection (np.ndarray): Human detection result.
+        result (list): The predicted labels of each human proposal.
+ img_h (int): The image height.
+ img_w (int): The image width.
+ Returns:
+ tuple: Tuple of human proposal, label name and label score.
+ """
+ human_detection[:, 0::2] /= img_w
+ human_detection[:, 1::2] /= img_h
+ results = []
+ if result is None:
+ return None
+ for prop, res in zip(human_detection, result):
+ res.sort(key=lambda x: -x[1])
+ results.append(
+ (prop.data.cpu().numpy(), [x[0] for x in res], [x[1]
+ for x in res]))
+ return results
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='MMAction2 demo')
+ parser.add_argument('video', help='video file/url')
+ parser.add_argument('out_filename', help='output filename')
+ parser.add_argument(
+ '--config',
+ default=('configs/detection/ava_kinetics/slowonly_k700-pre-'
+ 'r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py'),
+        help='spatiotemporal detection model config file path')
+ parser.add_argument(
+        '--onnx-file', help='spatiotemporal detection onnx file path')
+
+ parser.add_argument(
+ '--det-config',
+ default='demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py',
+ help='human detection config file path (from mmdet)')
+ parser.add_argument(
+ '--det-checkpoint',
+ default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/'
+ 'faster_rcnn_r50_fpn_2x_coco/'
+ 'faster_rcnn_r50_fpn_2x_coco_'
+ 'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'),
+ help='human detection checkpoint file/url')
+ parser.add_argument(
+ '--det-score-thr',
+ type=float,
+ default=0.9,
+ help='the threshold of human detection score')
+ parser.add_argument(
+ '--det-cat-id',
+ type=int,
+ default=0,
+ help='the category id for human detection')
+ parser.add_argument(
+ '--action-score-thr',
+ type=float,
+ default=0.5,
+ help='the threshold of human action score')
+ parser.add_argument(
+ '--label-map',
+ default='tools/data/ava/label_map.txt',
+ help='label map file')
+ parser.add_argument(
+ '--device', type=str, default='cuda:0', help='CPU/CUDA device option')
+ parser.add_argument(
+ '--short-side',
+ type=int,
+ default=256,
+ help='specify the short-side length of the image')
+ parser.add_argument(
+ '--predict-stepsize',
+ default=8,
+ type=int,
+ help='give out a prediction per n frames')
+ parser.add_argument(
+ '--output-stepsize',
+ default=4,
+ type=int,
+ help=('show one frame per n frames in the demo, we should have: '
+ 'predict_stepsize % output_stepsize == 0'))
+ parser.add_argument(
+ '--output-fps',
+ default=6,
+ type=int,
+ help='the fps of demo video output')
+ parser.add_argument(
+ '--cfg-options',
+ nargs='+',
+ action=DictAction,
+ default={},
+ help='override some settings in the used config, the key-value pair '
+ 'in xxx=yyy format will be merged into config file. For example, '
+ "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
+ args = parser.parse_args()
+ return args
+
+
+def main():
+ args = parse_args()
+
+ tmp_dir = tempfile.TemporaryDirectory()
+ frame_paths, original_frames = frame_extract(
+ args.video, out_dir=tmp_dir.name)
+ num_frame = len(frame_paths)
+ h, w, _ = original_frames[0].shape
+
+ # resize frames to shortside
+ new_w, new_h = mmcv.rescale_size((w, h), (args.short_side, np.Inf))
+ frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
+ w_ratio, h_ratio = new_w / w, new_h / h
+
+ # Get clip_len, frame_interval and calculate center index of each clip
+ config = mmengine.Config.fromfile(args.config)
+ config.merge_from_dict(args.cfg_options)
+ val_pipeline = config.val_pipeline
+
+ sampler = [x for x in val_pipeline if x['type'] == 'SampleAVAFrames'][0]
+ clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
+ window_size = clip_len * frame_interval
+ assert clip_len % 2 == 0, 'We would like to have an even clip_len'
+ # Note that it's 1 based here
+ timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
+ args.predict_stepsize)
+
+ # Load label_map
+ label_map = load_label_map(args.label_map)
+ try:
+ if config['data']['train']['custom_classes'] is not None:
+ label_map = {
+ id + 1: label_map[cls]
+ for id, cls in enumerate(config['data']['train']
+ ['custom_classes'])
+ }
+ except KeyError:
+ pass
+
+ # Get Human detection results
+ center_frames = [frame_paths[ind - 1] for ind in timestamps]
+
+ human_detections, _ = detection_inference(args.det_config,
+ args.det_checkpoint,
+ center_frames,
+ args.det_score_thr,
+ args.det_cat_id, args.device)
+ torch.cuda.empty_cache()
+ for i in range(len(human_detections)):
+ det = human_detections[i]
+ det[:, 0:4:2] *= w_ratio
+ det[:, 1:4:2] *= h_ratio
+ human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)
+
+    # Load the exported spatiotemporal detection model with ONNX Runtime
+ session = onnxruntime.InferenceSession(args.onnx_file)
+
+ predictions = []
+
+ img_norm_cfg = dict(
+ mean=np.array(config.model.data_preprocessor.mean),
+ std=np.array(config.model.data_preprocessor.std),
+ to_rgb=False)
+
+ print('Performing SpatioTemporal Action Detection for each clip')
+ assert len(timestamps) == len(human_detections)
+ prog_bar = mmengine.ProgressBar(len(timestamps))
+ for timestamp, proposal in zip(timestamps, human_detections):
+ if proposal.shape[0] == 0:
+ predictions.append(None)
+ continue
+
+ start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
+ frame_inds = start_frame + np.arange(0, window_size, frame_interval)
+ frame_inds = list(frame_inds - 1)
+ imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
+ _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs]
+ # THWC -> CTHW -> 1CTHW
+ input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
+ rois = bbox2roi([proposal])
+
+ input_feed = {
+ 'input_tensor': input_array,
+ 'rois': rois.cpu().data.numpy()
+ }
+ outputs = session.run(['cls_score'], input_feed=input_feed)
+ logits = outputs[0]
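+        # map the ONNX head logits to per-class probabilities with a sigmoid (multi-label)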
+ scores = 1 / (1 + np.exp(-logits))
+
+ prediction = []
+ # N proposals
+ for i in range(proposal.shape[0]):
+ prediction.append([])
+ # Perform action score thr
+ for i in range(scores.shape[1]):
+ if i not in label_map:
+ continue
+ for j in range(proposal.shape[0]):
+ if scores[j, i] > args.action_score_thr:
+ prediction[j].append((label_map[i], scores[j, i].item()))
+ predictions.append(prediction)
+ prog_bar.update()
+
+ results = []
+ for human_detection, prediction in zip(human_detections, predictions):
+ results.append(pack_result(human_detection, prediction, new_h, new_w))
+
+ def dense_timestamps(timestamps, n):
+ """Make it nx frames."""
+ old_frame_interval = (timestamps[1] - timestamps[0])
+ start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
+ new_frame_inds = np.arange(
+ len(timestamps) * n) * old_frame_interval / n + start
+ return new_frame_inds.astype(np.int64)
+
+ dense_n = int(args.predict_stepsize / args.output_stepsize)
+ frames = [
+ cv2.imread(frame_paths[i - 1])
+ for i in dense_timestamps(timestamps, dense_n)
+ ]
+ print('Performing visualization')
+ vis_frames = visualize(frames, results)
+ vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
+ fps=args.output_fps)
+ vid.write_videofile(args.out_filename)
+
+ tmp_dir.cleanup()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 45c82cfcb7..6622f147ea 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -17,7 +17,7 @@ RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 lib
# Install MMCV
RUN pip install openmim
-RUN mim install mmengine "mmcv>=2.0rc1"
+RUN mim install mmengine "mmcv>=2.0.0"
# Install MMAction2
RUN conda clean --all
diff --git a/docs/en/advanced_guides/customize_dataset.md b/docs/en/advanced_guides/customize_dataset.md
new file mode 100644
index 0000000000..31a6e16b2b
--- /dev/null
+++ b/docs/en/advanced_guides/customize_dataset.md
@@ -0,0 +1,122 @@
+# Customize Datasets
+
+In this tutorial, we will introduce how to customize your own dataset by online conversion.
+
+- [Customize Datasets](#customize-datasets)
+ - [General understanding of the Dataset in MMAction2](#general-understanding-of-the-dataset-in-mmaction2)
+ - [Customize new datasets](#customize-new-datasets)
+ - [Customize keypoint format for PoseDataset](#customize-keypoint-format-for-posedataset)
+
+## General understanding of the Dataset in MMAction2
+
+MMAction2 provides task-specific `Dataset` classes, e.g. `VideoDataset`/`RawframeDataset` for action recognition, `AVADataset` for spatio-temporal action detection, and `PoseDataset` for skeleton-based action recognition. These task-specific datasets only need to implement `load_data_list(self)` to build a data list from the annotation file, while other functions are handled automatically by the superclass. The following table shows the inheritance relationship and the main method of each module.
+
+| Class Name | Functions |
+| ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| MMAction2::VideoDataset      | `load_data_list(self)` <br> Build data list from the annotation file. |
+| MMAction2::BaseActionDataset | `get_data_info(self, idx)` <br> Given the `idx`, return the corresponding data sample from the data list. |
+| MMEngine::BaseDataset        | `__getitem__(self, idx)` <br> Given the `idx`, call `get_data_info` to get the data sample, then call the `pipeline` to perform transforms and augmentation in `train_pipeline` or `val_pipeline`. |
+
+## Customize new datasets
+
+For most scenarios, you don't need to customize a new dataset class: offline conversion is the recommended way to use your data. But customizing a new dataset class is also easy in MMAction2. As mentioned above, a dataset for a specific task usually only needs to implement `load_data_list(self)` to generate the data list from the annotation file. It is worth noting that elements in the `data_list` are `dict`s with the fields required by the following pipeline.
+
+Take `VideoDataset` as an example: `train_pipeline`/`val_pipeline` require `'filename'` in `DecordInit` and `'label'` in `PackActionInputs`, so data samples in the data list have two fields: `'filename'` and `'label'`.
+You can refer to [customize pipeline](customize_pipeline.md) for more details about the pipeline.
+
+```python
+data_list.append(dict(filename=filename, label=label))
+```
+
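+The snippet below is a minimal sketch of such a dataset; the class name `MyVideoDataset` and the assumed annotation format (one `<video path> <label>` pair per line) are illustrative and not an existing MMAction2 dataset.
+
+```python
+from mmaction.datasets import BaseActionDataset
+from mmaction.registry import DATASETS
+
+
+@DATASETS.register_module()
+class MyVideoDataset(BaseActionDataset):
+    """Assumed annotation format: one `<video path> <label>` pair per line."""
+
+    def load_data_list(self):
+        data_list = []
+        with open(self.ann_file) as f:
+            for line in f:
+                filename, label = line.strip().rsplit(' ', 1)
+                # each element holds the fields required by the pipeline
+                data_list.append(dict(filename=filename, label=int(label)))
+        return data_list
+```
+
+In a config, such a dataset could then be referenced as `dict(type='MyVideoDataset', ann_file='path/to/ann.txt', pipeline=train_pipeline)`, where the annotation path is only a placeholder.
+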
+`AVADataset` is more complex: elements in its data list consist of several fields about the video data, and it further overrides `get_data_info(self, idx)` to convert keys that are required by the spatio-temporal action detection pipeline.
+
+```python
+
+class AVADataset(BaseActionDataset):
+ ...
+
+ def load_data_list(self) -> List[dict]:
+ ...
+ video_info = dict(
+ frame_dir=frame_dir,
+ video_id=video_id,
+ timestamp=int(timestamp),
+ img_key=img_key,
+ shot_info=shot_info,
+ fps=self._FPS,
+ ann=ann)
+        data_list.append(video_info)
+ return data_list
+
+ def get_data_info(self, idx: int) -> dict:
+ ...
+ ann = data_info.pop('ann')
+ data_info['gt_bboxes'] = ann['gt_bboxes']
+ data_info['gt_labels'] = ann['gt_labels']
+ data_info['entity_ids'] = ann['entity_ids']
+ return data_info
+```
+
+## Customize keypoint format for PoseDataset
+
+MMAction2 currently supports three kinds of keypoint formats: `coco`, `nturgb+d` and `openpose`. If you use one of them, just specify the corresponding format in the following modules:
+
+For Graph Convolutional Networks, such as AAGCN, STGCN...
+
+- transform: argument `dataset` in `JointToBone`.
+- backbone: argument `graph_cfg` in Graph Convolutional Networks.
+
+And for PoseC3D:
+
+- transform: In `Flip`, specify `left_kp` and `right_kp` according to the keypoint symmetry relationship, or remove the transform for an asymmetric keypoint structure.
+- transform: In `GeneratePoseTarget`, specify `skeletons`, `left_limb`, `right_limb` if `with_limb` is `true`, and `left_kp`, `right_kp` if `with_kp` is `true`, as shown in the sketch after this list.
+
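+The snippet below is a minimal sketch for the `coco` format: the keypoint index lists follow the PoseC3D configs in this repo, while the other values (e.g. `flip_ratio=0.5`, `sigma=0.6`) are placeholders rather than recommended settings.
+
+```python
+# keypoint indices of the `coco` layout (17 keypoints)
+left_kp = [1, 3, 5, 7, 9, 11, 13, 15]
+right_kp = [2, 4, 6, 8, 10, 12, 14, 16]
+
+train_pipeline = [
+    # ... sampling and decoding transforms ...
+    dict(type='Flip', flip_ratio=0.5, left_kp=left_kp, right_kp=right_kp),
+    dict(
+        type='GeneratePoseTarget',
+        sigma=0.6,
+        use_score=True,
+        with_kp=True,
+        with_limb=False,
+        left_kp=left_kp,
+        right_kp=right_kp),
+    # ... formatting and packing transforms ...
+]
+```
+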
+For a custom format, you need to add a new graph layout into models and transforms, which defines the keypoints and their connection relationship.
+
+Take the coco dataset as an example: we define a layout named `coco` in `Graph` and set its `inward` connections as follows. Each connection is a pair of nodes, listed from the farther node to the nearer one, and the order of the connections does not matter. The remaining coco settings set the number of nodes to 17 and node 0 as the center node.
+
+```python
+
+self.num_node = 17
+self.inward = [(15, 13), (13, 11), (16, 14), (14, 12), (11, 5),
+ (12, 6), (9, 7), (7, 5), (10, 8), (8, 6), (5, 0),
+ (6, 0), (1, 0), (3, 1), (2, 0), (4, 2)]
+self.center = 0
+```
+
+Similarly, we define the `pairs` in `JointToBone`, adding a bone `(0, 0)` to align the number of bones with the number of nodes. The `pairs` for the coco dataset are as follows; as mentioned above, the order of the pairs does not matter.
+
+```python
+
+self.pairs = ((0, 0), (1, 0), (2, 0), (3, 1), (4, 2), (5, 0),
+ (6, 0), (7, 5), (8, 6), (9, 7), (10, 8), (11, 0),
+ (12, 0), (13, 11), (14, 12), (15, 13), (16, 14))
+```
+
+For your custom format, just define the above settings according to your graph structure and specify them in your config file as follows. We take `STGCN` as an example, assuming you have already defined a `custom_dataset` layout in `Graph` and `JointToBone`, and that `num_classes` is `n`.
+
+```python
+
+model = dict(
+ type='RecognizerGCN',
+ backbone=dict(
+ type='STGCN', graph_cfg=dict(layout='custom_dataset', mode='stgcn_spatial')),
+ cls_head=dict(type='GCNHead', num_classes=n, in_channels=256))
+
+train_pipeline = [
+ ...
+ dict(type='GenSkeFeat', dataset='custom_dataset'),
+ ...]
+
+val_pipeline = [
+ ...
+ dict(type='GenSkeFeat', dataset='custom_dataset'),
+ ...]
+
+test_pipeline = [
+ ...
+ dict(type='GenSkeFeat', dataset='custom_dataset'),
+ ...]
+
+```
diff --git a/docs/en/advanced_guides/customize_logging.md b/docs/en/advanced_guides/customize_logging.md
new file mode 100644
index 0000000000..aabaad949f
--- /dev/null
+++ b/docs/en/advanced_guides/customize_logging.md
@@ -0,0 +1,163 @@
+# Customize Logging
+
+MMAction2 produces a lot of logs during training, such as loss, iteration time and learning rate. In this section, we will introduce how to output custom logs. For more details about the logging system, please refer to [MMEngine](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/logging.html).
+
+- [Customize Logging](#customize-logging)
+ - [Flexible Logging System](#flexible-logging-system)
+ - [Customize log](#customize-log)
+ - [Export the debug log](#export-the-debug-log)
+
+## Flexible Logging System
+
+MMAction2 configures the logging system with `LogProcessor` in [default_runtime](/configs/_base_/default_runtime.py) by default, which is equivalent to:
+
+```python
+log_processor = dict(type='LogProcessor', window_size=20, by_epoch=True)
+```
+
+By default, `LogProcessor` catches all fields that start with `loss` returned by `model.forward`. For example, in the following model, `loss1` and `loss2` will be logged automatically without any additional configuration.
+
+```python
+import torch.nn as nn
+from mmengine.model import BaseModel
+
+class ToyModel(BaseModel):
+ def __init__(self) -> None:
+ super().__init__()
+ self.linear = nn.Linear(1, 1)
+
+ def forward(self, img, label, mode):
+ feat = self.linear(img)
+ loss1 = (feat - label).pow(2)
+ loss2 = (feat - label).abs()
+ return dict(loss1=loss1, loss2=loss2)
+```
+
+The format of the output log is as follows:
+
+```
+08/21 02:58:41 - mmengine - INFO - Epoch(train) [1][10/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0019 data_time: 0.0004 loss1: 0.8381 loss2: 0.9007 loss: 1.7388
+08/21 02:58:41 - mmengine - INFO - Epoch(train) [1][20/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0029 data_time: 0.0010 loss1: 0.1978 loss2: 0.4312 loss: 0.6290
+```
+
+LogProcessor will output the log in the following format:
+
+- The prefix of the log:
+  - epoch mode (`by_epoch=True`): `Epoch(train) [{current_epoch}][{current_iteration}/{dataloader_length}]`
+  - iteration mode (`by_epoch=False`): `Iter(train) [{current_iteration}/{max_iteration}]`
+- Learning rate (`lr`): The learning rate of the last iteration.
+- Time:
+ - `time`: The averaged time for inference of the last `window_size` iterations.
+ - `data_time`: The averaged time for loading data of the last `window_size` iterations.
+ - `eta`: The estimated time of arrival to finish the training.
+- Loss: The averaged loss output by model of the last `window_size` iterations.
+
+```{warning}
+The log_processor outputs epoch-based logs by default (`by_epoch=True`). To get logs consistent with `train_cfg`, set the same `by_epoch` value in both `train_cfg` and `log_processor`.
+```
+
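+For instance, with an iteration-based schedule, a matching pair of settings could look like the following sketch (the iteration counts are placeholders):
+
+```python
+train_cfg = dict(type='IterBasedTrainLoop', max_iters=10000, val_interval=1000)
+log_processor = dict(type='LogProcessor', window_size=50, by_epoch=False)
+```
+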
+Based on the rules above, the `ToyModel` snippet will report the average value of `loss1` and `loss2` over every 20 iterations. For more types of statistical methods, please refer to [MMEngine.LogProcessor](mmengine.runner.LogProcessor).
+
+## Customize log
+
+The logging system can not only log the loss, lr, etc., but also collect and output custom logs. For example, if we want to log an intermediate loss:
+
+The `ToyModel` below calculates `loss_tmp` in `forward`, but does not save it into the returned dict.
+
+```python
+import torch.nn as nn
+from mmengine.logging import MessageHub
+from mmengine.model import BaseModel
+
+class ToyModel(BaseModel):
+
+ def __init__(self) -> None:
+ super().__init__()
+ self.linear = nn.Linear(1, 1)
+
+ def forward(self, img, label, mode):
+ feat = self.linear(img)
+ loss_tmp = (feat - label).abs()
+ loss = loss_tmp.pow(2)
+
+ message_hub = MessageHub.get_current_instance()
+ # update the intermediate `loss_tmp` in the message hub
+ message_hub.update_scalar('train/loss_tmp', loss_tmp.sum())
+ return dict(loss=loss)
+```
+
+Add the `loss_tmp` into the config:
+
+```python
+log_processor = dict(
+ type='LogProcessor',
+ window_size=20,
+ by_epoch=True,
+ custom_cfg=[
+ # statistic the loss_tmp with the averaged value
+ dict(
+ data_src='loss_tmp',
+ window_size=20,
+ method_name='mean')
+ ])
+```
+
+The `loss_tmp` will be added to the output log:
+
+```
+08/21 03:40:31 - mmengine - INFO - Epoch(train) [1][10/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0026 data_time: 0.0008 loss_tmp: 0.0097 loss: 0.0000
+08/21 03:40:31 - mmengine - INFO - Epoch(train) [1][20/25] lr: 1.0000e-02 eta: 0:00:00 time: 0.0028 data_time: 0.0013 loss_tmp: 0.0065 loss: 0.0000
+```
+
+## Export the debug log
+
+To export the debug log to the `work_dir`, you can set `log_level` in the config file as follows:
+
+```
+log_level='DEBUG'
+```
+
+```
+08/21 18:16:22 - mmengine - DEBUG - Get class `LocalVisBackend` from "vis_backend" registry in "mmengine"
+08/21 18:16:22 - mmengine - DEBUG - An `LocalVisBackend` instance is built from registry, its implementation can be found in mmengine.visualization.vis_backend
+08/21 18:16:22 - mmengine - DEBUG - Get class `RuntimeInfoHook` from "hook" registry in "mmengine"
+08/21 18:16:22 - mmengine - DEBUG - An `RuntimeInfoHook` instance is built from registry, its implementation can be found in mmengine.hooks.runtime_info_hook
+08/21 18:16:22 - mmengine - DEBUG - Get class `IterTimerHook` from "hook" registry in "mmengine"
+...
+```
+
+Besides, if you are training your model with shared storage, the logs of different ranks will be saved in `debug` mode. The log hierarchy is as follows:
+
+```text
+./tmp
+├── tmp.log
+├── tmp_rank1.log
+├── tmp_rank2.log
+├── tmp_rank3.log
+├── tmp_rank4.log
+├── tmp_rank5.log
+├── tmp_rank6.log
+└── tmp_rank7.log
+...
+└── tmp_rank63.log
+```
+
+The logs of multiple machines with independent storage:
+
+```text
+# device: 0:
+work_dir/
+└── exp_name_logs
+ ├── exp_name.log
+ ├── exp_name_rank1.log
+ ├── exp_name_rank2.log
+ ├── exp_name_rank3.log
+ ...
+ └── exp_name_rank7.log
+
+# device: 7:
+work_dir/
+└── exp_name_logs
+ ├── exp_name_rank56.log
+ ├── exp_name_rank57.log
+ ├── exp_name_rank58.log
+ ...
+ └── exp_name_rank63.log
+```
diff --git a/docs/en/advanced_guides/customize_models.md b/docs/en/advanced_guides/customize_models.md
new file mode 100644
index 0000000000..3d8c0e1d4e
--- /dev/null
+++ b/docs/en/advanced_guides/customize_models.md
@@ -0,0 +1 @@
+# Customize Models
diff --git a/docs/en/advanced_guides/customize_optimizer.md b/docs/en/advanced_guides/customize_optimizer.md
new file mode 100644
index 0000000000..d862b9632c
--- /dev/null
+++ b/docs/en/advanced_guides/customize_optimizer.md
@@ -0,0 +1,340 @@
+# Customize Optimizer
+
+In this tutorial, we will introduce how to build the optimizer and learning rate scheduler for your tasks.
+
+- [Customize Optimizer](#customize-optimizer)
+ - [Build optimizers using optim_wrapper](#build-optimizers-using-optim_wrapper)
+ - [Use optimizers supported by PyTorch](#use-optimizers-supported-by-pytorch)
+ - [Parameter-wise finely configuration](#parameter-wise-finely-configuration)
+ - [Gradient clipping](#gradient-clipping)
+ - [Gradient accumulation](#gradient-accumulation)
+ - [Customize parameter schedules](#customize-parameter-schedules)
+ - [Customize learning rate schedules](#customize-learning-rate-schedules)
+ - [Customize momentum schedules](#customize-momentum-schedules)
+ - [Add new optimizers or constructors](#add-new-optimizers-or-constructors)
+ - [Add new optimizers](#add-new-optimizers)
+ - [1. Implement a new optimizer](#1-implement-a-new-optimizer)
+ - [2. Import the optimizer](#2-import-the-optimizer)
+ - [3. Specify the optimizer in the config file](#3-specify-the-optimizer-in-the-config-file)
+ - [Add new optimizer constructors](#add-new-optimizer-constructors)
+
+## Build optimizers using optim_wrapper
+
+We use the `optim_wrapper` field to configure the optimization strategy, which includes the choice of optimizer, parameter-wise configurations, and gradient clipping and accumulation. A simple example can be:
+
+```python
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer=dict(type='SGD', lr=0.0003, weight_decay=0.0001)
+)
+```
+
+In the above example, an SGD optimizer with learning rate 0.0003 and weight decay 0.0001 is built.
+
+### Use optimizers supported by PyTorch
+
+We support all the optimizers implemented by PyTorch. To use a different optimizer, just change the `optimizer` field of the config file. For example, if you want to use `torch.optim.Adam`, the modification in the config file could be as follows.
+
+```python
+optim_wrapper = dict(
+ type='OptimWrapper',
+ optimizer = dict(
+ type='Adam',
+ lr=0.001,
+ betas=(0.9, 0.999),
+ eps=1e-08,
+ weight_decay=0,
+ amsgrad=False),
+)
+```
+
+First we need to change the value of `type` to the desired optimizer name supported in `torch.optim`. Next we add necessary arguments of this optimizer to the `optimizer` field. The above config will build the following optimizer:
+
+```python
+torch.optim.Adam(lr=0.001,
+ betas=(0.9, 0.999),
+ eps=1e-08,
+ weight_decay=0,
+ amsgrad=False)
+```
+
+### Parameter-wise fine configuration
+
+Some models may have parameter-specific optimization settings, for example, no weight decay for the BatchNorm layers or different learning rates for different network layers.
+To finely configure them, we can use the `paramwise_cfg` argument in `optim_wrapper`.
+
+- **Set different hyper-parameter multipliers for different types of parameters.**
+
+ For instance, we can set `norm_decay_mult=0.` in `paramwise_cfg` to change the weight decay of weight and bias of normalization layers to zero.
+
+ ```python
+ optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.8, weight_decay=1e-4),
+ paramwise_cfg=dict(norm_decay_mult=0.))
+ ```
+
+  More types of parameters can be configured, as listed below (a combined sketch follows the list):
+
+ - `lr_mult`: Multiplier for learning rate of all parameters.
+ - `decay_mult`: Multiplier for weight decay of all parameters.
+  - `bias_lr_mult`: Multiplier for the learning rate of biases (excluding the biases of normalization layers and the offsets of deformable convolution layers). Defaults to 1.
+  - `bias_decay_mult`: Multiplier for the weight decay of biases (excluding the biases of normalization layers and the offsets of deformable convolution layers). Defaults to 1.
+  - `norm_decay_mult`: Multiplier for the weight decay of the weight and bias of normalization layers. Defaults to 1.
+ - `dwconv_decay_mult`: Multiplier for weight decay of depth-wise convolution layers. Defaults to 1.
+ - `bypass_duplicate`: Whether to bypass duplicated parameters. Defaults to `False`.
+ - `dcn_offset_lr_mult`: Multiplier for learning rate of deformable convolution layers. Defaults to 1.
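+
+  As a minimal sketch (the multiplier values below are purely illustrative), several of these options can be combined in a single `paramwise_cfg`:
+
+  ```python
+  optim_wrapper = dict(
+      optimizer=dict(type='SGD', lr=0.01, weight_decay=1e-4),
+      paramwise_cfg=dict(
+          norm_decay_mult=0.,     # no weight decay for normalization layers
+          bias_lr_mult=2.,        # double the learning rate of bias terms
+          dwconv_decay_mult=0.))  # no weight decay for depth-wise conv layers
+  ```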
+
+- **Set different hyper-parameter multipliers for specific parameters.**
+
+  MMAction2 can use `custom_keys` in `paramwise_cfg` to specify different learning rates or weight decays for specific parameters.
+
+  For example, to set all learning rates and weight decays of `backbone.layer0` to 0, keep the rest of `backbone` the same as the optimizer settings, and set the learning rate of `head` to 0.001, use the config below.
+
+ ```python
+ optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001),
+ paramwise_cfg=dict(
+ custom_keys={
+ 'backbone.layer0': dict(lr_mult=0, decay_mult=0),
+ 'backbone': dict(lr_mult=1),
+ 'head': dict(lr_mult=0.1)
+ }))
+ ```
+
+### Gradient clipping
+
+During the training process, the loss function may approach a steep region and cause gradient explosion. Gradient clipping helps stabilize the training process. More introduction can be found on [this page](https://paperswithcode.com/method/gradient-clipping).
+
+Currently, we support the `clip_grad` option in `optim_wrapper` for gradient clipping; refer to the [PyTorch documentation](torch.nn.utils.clip_grad_norm_).
+
+Here is an example:
+
+```python
+optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001),
+ # norm_type: type of the used p-norm, here norm_type is 2.
+ clip_grad=dict(max_norm=35, norm_type=2))
+```
+
+### Gradient accumulation
+
+When computing resources are limited, the batch size can only be set to a small value, which may degrade model performance. Gradient accumulation can be used to solve this problem. We support the `accumulative_counts` option in `optim_wrapper` for gradient accumulation.
+
+Here is an example:
+
+```python
+train_dataloader = dict(batch_size=64)
+optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001),
+ accumulative_counts=4)
+```
+
+This indicates that during training, the optimizer performs one parameter update every 4 iterations while gradients are accumulated in between. The above is equivalent to:
+
+```python
+train_dataloader = dict(batch_size=256)
+optim_wrapper = dict(
+ optimizer=dict(type='SGD', lr=0.01, weight_decay=0.0001))
+```
+
+## Customize parameter schedules
+
+In training, optimization parameters such as the learning rate and momentum are usually not fixed but change over iterations or epochs. PyTorch supports several learning rate schedulers, but they are not sufficient for complex strategies. In MMAction2, we provide `param_scheduler` for better control of different parameter schedules.
+
+### Customize learning rate schedules
+
+Learning rate schedulers are widely used to improve performance. We support most of the PyTorch schedulers, including `ExponentialLR`, `LinearLR`, `StepLR`, `MultiStepLR`, etc.
+
+All available learning rate schedulers can be found {external+mmengine:ref}`here `, and the
+names of learning rate schedulers end with `LR`.
+
+- **Single learning rate schedule**
+
+ In most cases, we use only one learning rate schedule for simplicity. For instance, [`MultiStepLR`](mmengine.optim.MultiStepLR) is used as the default learning rate schedule for ResNet. Here, `param_scheduler` is a dictionary.
+
+ ```python
+ param_scheduler = dict(
+ type='MultiStepLR',
+ by_epoch=True,
+ milestones=[100, 150],
+ gamma=0.1)
+ ```
+
+  Or, we can use the [`CosineAnnealingLR`](mmengine.optim.CosineAnnealingLR) scheduler to decay the learning rate:
+
+ ```python
+ param_scheduler = dict(
+ type='CosineAnnealingLR',
+ by_epoch=True,
+ T_max=num_epochs)
+ ```
+
+- **Multiple learning rate schedules**
+
+  In some training cases, multiple learning rate schedules are applied for higher accuracy. For example, training tends to be volatile in the early stage, and warmup is a technique to reduce this volatility.
+  The learning rate increases gradually from a small value to the expected value during warmup and decays afterwards according to other schedules.
+
+  In MMAction2, simply combining the desired schedules into a list in `param_scheduler` achieves the warmup strategy.
+
+ Here are some examples:
+
+ 1. linear warmup during the first 50 iters.
+
+ ```python
+ param_scheduler = [
+ # linear warm-up by iters
+ dict(type='LinearLR',
+ start_factor=0.001,
+ by_epoch=False, # by iters
+ end=50), # only warm up for first 50 iters
+      # main learning rate schedule
+ dict(type='MultiStepLR',
+ by_epoch=True,
+ milestones=[8, 11],
+ gamma=0.1)
+ ]
+ ```
+
+ 2. linear warmup and update lr by iter during the first 10 epochs.
+
+ ```python
+ param_scheduler = [
+ # linear warm-up by epochs in [0, 10) epochs
+ dict(type='LinearLR',
+ start_factor=0.001,
+ by_epoch=True,
+ end=10,
+ convert_to_iter_based=True, # Update learning rate by iter.
+ ),
+ # use CosineAnnealing schedule after 10 epochs
+ dict(type='CosineAnnealingLR', by_epoch=True, begin=10)
+ ]
+ ```
+
+  Notice that we use the `begin` and `end` arguments here to specify the valid range, which is \[`begin`, `end`) for this schedule. The range unit is defined by the `by_epoch` argument. If not specified, `begin` defaults to 0 and `end` to the maximum number of epochs or iterations.
+
+  If the ranges of all schedules are not continuous, the learning rate stays constant in the uncovered range; otherwise, all valid schedulers are executed in order within a given stage, which behaves the same as PyTorch [`ChainedScheduler`](torch.optim.lr_scheduler.ChainedScheduler).
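+
+  For instance, the following sketch (the epoch numbers are purely illustrative) leaves epochs \[5, 10) uncovered by any schedule, so the learning rate stays constant in that range:
+
+  ```python
+  param_scheduler = [
+      # linear warm-up in epochs [0, 5)
+      dict(type='LinearLR', start_factor=0.001, by_epoch=True, begin=0, end=5),
+      # nothing covers epochs [5, 10), so the learning rate stays constant there;
+      # step decay takes over from epoch 10 on
+      dict(type='MultiStepLR', by_epoch=True, begin=10, milestones=[30, 40], gamma=0.1)
+  ]
+  ```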
+
+### Customize momentum schedules
+
+We support using momentum schedulers to modify the optimizer's momentum in step with the learning rate, which could make the loss converge faster. The usage is the same as for learning rate schedulers.
+
+All available momentum schedulers can be found {external+mmengine:ref}`here `, and the
+names of momentum schedulers end with `Momentum`.
+
+Here is an example:
+
+```python
+param_scheduler = [
+ # the lr scheduler
+ dict(type='LinearLR', ...),
+ # the momentum scheduler
+ dict(type='LinearMomentum',
+ start_factor=0.001,
+ by_epoch=False,
+ begin=0,
+ end=1000)
+]
+```
+
+## Add new optimizers or constructors
+
+This part modifies the MMAction2 source code or adds code to the MMAction2 framework, so beginners can skip it.
+
+### Add new optimizers
+
+In academic research and industrial practice, it may be necessary to use optimization methods not implemented by MMAction2, and you can add them through the following steps.
+
+#### 1. Implement a new optimizer
+
+Assume you want to add an optimizer named `MyOptimizer`, which has arguments `a`, `b`, and `c`.
+You need to create a new file under `mmaction/engine/optimizers`, and implement the new optimizer in the file, for example, in `mmaction/engine/optimizers/my_optimizer.py`:
+
+```python
+from torch.optim import Optimizer
+from mmaction.registry import OPTIMIZERS
+
+
+@OPTIMIZERS.register_module()
+class MyOptimizer(Optimizer):
+
+ def __init__(self, a, b, c):
+ ...
+
+ def step(self, closure=None):
+ ...
+```
+
+#### 2. Import the optimizer
+
+For the module defined above to be found, it should be imported at runtime. First, import it in `mmaction/engine/optimizers/__init__.py` to add it to the `mmaction.engine` package.
+
+```python
+# In mmaction/engine/optimizers/__init__.py
+...
+from .my_optimizer import MyOptimizer  # MyOptimizer may be any other class name
+
+__all__ = [..., 'MyOptimizer']
+```
+
+At runtime, the `mmaction.engine` package will be imported automatically, registering `MyOptimizer` at the same time.
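+
+Alternatively, if you prefer not to touch `__init__.py`, MMEngine's `custom_imports` mechanism can usually import the module directly from the config file. A minimal sketch, assuming the file path used above:
+
+```python
+# In the config file: import the module so that `MyOptimizer` gets registered,
+# without modifying mmaction/engine/optimizers/__init__.py
+custom_imports = dict(
+    imports=['mmaction.engine.optimizers.my_optimizer'],
+    allow_failed_imports=False)
+```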
+
+#### 3. Specify the optimizer in the config file
+
+Then you can use `MyOptimizer` in the `optim_wrapper.optimizer` field of config files.
+
+```python
+optim_wrapper = dict(
+ optimizer=dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value))
+```
+
+### Add new optimizer constructors
+
+Some models may have parameter-specific optimization settings, like a different weight decay rate for all `BatchNorm` layers.
+
+Although we can already use [the `optim_wrapper.paramwise_cfg` field](#parameter-wise-fine-configuration) to
+configure various parameter-specific optimizer settings, it may still not cover your needs.
+
+Of course, you can modify it. By default, we use the [`DefaultOptimWrapperConstructor`](mmengine.optim.DefaultOptimWrapperConstructor)
+class to handle the construction of the optimizer. During the construction, it configures the optimizer settings of
+different parameters in a fine-grained way according to `paramwise_cfg`, which could also serve as a template for a new optimizer constructor.
+
+You can overwrite these behaviors by adding new optimizer constructors.
+
+```python
+# In mmaction/engine/optimizers/my_optim_constructor.py
+from mmengine.optim import DefaultOptimWrapperConstructor
+from mmaction.registry import OPTIM_WRAPPER_CONSTRUCTORS
+
+
+@OPTIM_WRAPPER_CONSTRUCTORS.register_module()
+class MyOptimWrapperConstructor:
+
+ def __init__(self, optim_wrapper_cfg, paramwise_cfg=None):
+ ...
+
+ def __call__(self, model):
+ ...
+```
+
+Then, import it and use it in almost the same way as in [the optimizer section](#add-new-optimizers).
+
+1. Import it in the `mmaction/engine/optimizers/__init__.py` to add it into the `mmaction.engine` package.
+
+ ```python
+ # In mmaction/engine/optimizers/__init__.py
+ ...
+ from .my_optim_constructor import MyOptimWrapperConstructor
+
+ __all__ = [..., 'MyOptimWrapperConstructor']
+ ```
+
+2. Use `MyOptimWrapperConstructor` in the `optim_wrapper.constructor` field of config files.
+
+ ```python
+ optim_wrapper = dict(
+ constructor=dict(type='MyOptimWrapperConstructor'),
+ optimizer=...,
+ paramwise_cfg=...,
+ )
+ ```
diff --git a/docs/en/advanced_guides/customize_pipeline.md b/docs/en/advanced_guides/customize_pipeline.md
new file mode 100644
index 0000000000..632216ba10
--- /dev/null
+++ b/docs/en/advanced_guides/customize_pipeline.md
@@ -0,0 +1,155 @@
+# Customize Data Pipeline
+
+In this tutorial, we will introduce how to build the data pipeline (i.e., data transformations) for your tasks.
+
+- [Customize Data Pipeline](#customize-data-pipeline)
+ - [Design of Data pipelines](#design-of-data-pipelines)
+ - [Modify the training/test pipeline](#modify-the-trainingtest-pipeline)
+ - [Loading](#loading)
+ - [Sampling frames and other processing](#sampling-frames-and-other-processing)
+ - [Formatting](#formatting)
+ - [Add new data transforms](#add-new-data-transforms)
+
+## Design of Data pipelines
+
+The data pipeline defines how to process the sample dict when indexing a sample from the dataset. It
+consists of a sequence of data transforms. Each data transform takes a dict as input, processes it, and outputs a dict for the next data transform.
+
+Here is a data pipeline example for SlowFast training on Kinetics with `VideoDataset`. It first uses [`decord`](https://github.com/dmlc/decord) to read the raw videos and randomly samples one video clip (the clip has 32 frames, and the interval between frames is 2). Next, it applies random resized crop and random horizontal flip to all frames. Finally, the data shape is formatted as `NCTHW`.
+
+```python
+train_pipeline = [
+ dict(type='DecordInit',),
+ dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
+ dict(type='DecordDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='RandomResizedCrop'),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+```
+
+All available data transforms in MMAction2 can be found in the [data transforms docs](mmaction.datasets.transforms).
+
+## Modify the training/test pipeline
+
+The data pipeline in MMAction2 is pretty flexible. You can control almost every step of the data
+preprocessing from the config file, but on the other hand, you may be confused when facing so many options.
+
+Here is a common practice and guidance for action recognition tasks.
+
+### Loading
+
+At the beginning of a data pipeline, we usually need to load videos. But if you have already extracted the frames, you should use `RawFrameDecode` and change the dataset type to `RawframeDataset`:
+
+```python
+train_pipeline = [
+ dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
+ dict(type='RawFrameDecode'),
+ dict(type='Resize', scale=(-1, 256)),
+ dict(type='RandomResizedCrop'),
+ dict(type='Resize', scale=(224, 224), keep_ratio=False),
+ dict(type='Flip', flip_ratio=0.5),
+ dict(type='FormatShape', input_format='NCTHW'),
+ dict(type='PackActionInputs')
+]
+```
+
+If you want to load data from files with special formats or special locations, you can [implement a new loading
+transform](#add-new-data-transforms) and add it at the beginning of the data pipeline.
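+
+For example, a hypothetical loading transform that reads pre-decoded frames stored as one `.npy` array per video could look like the sketch below (`NpyDecode` and the `.npy` layout are illustrative assumptions, not part of MMAction2):
+
+```python
+import numpy as np
+
+from mmcv.transforms import BaseTransform
+from mmaction.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class NpyDecode(BaseTransform):
+    """Hypothetical transform: load frames from a per-video .npy array."""
+
+    def transform(self, results):
+        # `filename` points to a .npy file of shape [T, H, W, C]
+        frames = np.load(results['filename'])
+        # keep only the frames selected by a preceding `SampleFrames` step
+        results['imgs'] = list(frames[results['frame_inds']])
+        results['img_shape'] = frames.shape[1:3]
+        return results
+```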
+
+### Sampling frames and other processing
+
+During training and testing, we may have different strategies to sample frames from the video.
+
+For example, during testing of SlowFast, we sample multiple clips uniformly:
+
+```python
+test_pipeline = [
+ ...
+ dict(
+ type='SampleFrames',
+ clip_len=32,
+ frame_interval=2,
+ num_clips=10,
+ test_mode=True),
+ ...
+]
+```
+
+In the above example, 10 clips of 32 frames each will be sampled for each video. We use `test_mode=True` to sample these clips uniformly (as opposed to random sampling during training).
+
+Another example is that TSN/TSM models sample multiple segments from the video:
+
+```python
+train_pipeline = [
+ ...
+ dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
+ ...
+]
+```
+
+```{note}
+Usually, the data augmentation part in the data pipeline handles only video-wise transforms, but not transforms
+like video normalization or mixup/cutmix. This is because we can do normalization and mixup/cutmix on batched data
+to accelerate them with GPUs. To configure video normalization and mixup/cutmix, please use the
+[data preprocessor](mmaction.models.utils.data_preprocessor).
+```
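+
+For reference, a rough sketch of moving normalization and mixup from the pipeline into the data preprocessor could look like the following, assuming the `ActionDataPreprocessor` and `MixupBlending` classes shipped with MMAction2; the concrete values (including `num_classes=400`) are illustrative:
+
+```python
+model = dict(
+    data_preprocessor=dict(
+        type='ActionDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        format_shape='NCTHW',
+        # mixup is applied to batched data on the GPU; num_classes is illustrative
+        blending=dict(type='MixupBlending', alpha=0.2, num_classes=400)))
+```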
+
+### Formatting
+
+The formatting step collects training data from the data information dict and converts it to a
+model-friendly format.
+
+In most cases, you can simply use [`PackActionInputs`](mmaction.datasets.transforms.PackActionInputs), which will
+convert the images from NumPy arrays to a PyTorch tensor, and pack the ground-truth category information and
+other meta information into a dict-like object, [`ActionDataSample`](mmaction.structures.ActionDataSample).
+
+```python
+train_pipeline = [
+ ...
+ dict(type='PackActionInputs'),
+]
+```
+
+## Add new data transforms
+
+1. Write a new data transform in any file, e.g., `my_transform.py`, and place it in
+ the folder `mmaction/datasets/transforms/`. The data transform class needs to inherit
+ the [`mmcv.transforms.BaseTransform`](mmcv.transforms.BaseTransform) class and override
+ the `transform` method which takes a dict as input and returns a dict.
+
+ ```python
+ from mmcv.transforms import BaseTransform
+ from mmaction.datasets import TRANSFORMS
+
+ @TRANSFORMS.register_module()
+ class MyTransform(BaseTransform):
+
+ def transform(self, results):
+ # Modify the data information dict `results`.
+ return results
+ ```
+
+2. Import the new class in the `mmaction/datasets/transforms/__init__.py`.
+
+ ```python
+ ...
+ from .my_transform import MyTransform
+
+ __all__ = [
+ ..., 'MyTransform'
+ ]
+ ```
+
+3. Use it in config files.
+
+ ```python
+ train_pipeline = [
+ ...
+ dict(type='MyTransform'),
+ ...
+ ]
+ ```
diff --git a/docs/en/advanced_guides/dataflow.md b/docs/en/advanced_guides/dataflow.md
new file mode 100644
index 0000000000..5723cc1557
--- /dev/null
+++ b/docs/en/advanced_guides/dataflow.md
@@ -0,0 +1,3 @@
+# Dataflow in MMAction2
+
+coming soon...
diff --git a/docs/en/advanced_guides/depoly.md b/docs/en/advanced_guides/depoly.md
new file mode 100644
index 0000000000..58e9f58ea4
--- /dev/null
+++ b/docs/en/advanced_guides/depoly.md
@@ -0,0 +1,3 @@
+# How to deploy MMAction2 models
+
+coming soon...
diff --git a/docs/en/api.rst b/docs/en/api.rst
new file mode 100644
index 0000000000..4431c7734b
--- /dev/null
+++ b/docs/en/api.rst
@@ -0,0 +1,140 @@
+mmaction.apis
+--------------
+.. automodule:: mmaction.apis
+ :members:
+
+mmaction.datasets
+-----------------
+
+datasets
+^^^^^^^^^^
+.. automodule:: mmaction.datasets
+ :members:
+
+transforms
+^^^^^^^^^^^^
+.. automodule:: mmaction.datasets.transforms
+ :members:
+
+mmaction.engine
+---------------
+
+hooks
+^^^^^^^^^^
+.. automodule:: mmaction.engine.hooks
+ :members:
+
+optimizers
+^^^^^^^^^^^^^^^
+.. automodule:: mmaction.engine.optimizers
+ :members:
+
+runner
+^^^^^^^^^^
+.. automodule:: mmaction.engine.runner
+ :members:
+
+
+mmaction.evaluation
+--------------------
+
+functional
+^^^^^^^^^^^^^^^^^
+.. automodule:: mmaction.evaluation.functional
+ :members:
+
+metrics
+^^^^^^^^^^
+.. automodule:: mmaction.evaluation.metrics
+ :members:
+
+
+mmaction.models
+---------------
+
+backbones
+^^^^^^^^^^^^^^^^^^
+.. automodule:: mmaction.models.backbones
+ :members:
+
+common
+^^^^^^^^^^^^^^^^^^
+.. automodule:: mmaction.models.common
+ :members:
+
+data_preprocessors
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. automodule:: mmaction.models.data_preprocessors
+ :members:
+
+heads
+^^^^^^^^^^^^^^^
+.. automodule:: mmaction.models.heads
+ :members:
+
+localizers
+^^^^^^^^^^
+.. automodule:: mmaction.models.localizers
+ :members:
+
+
+losses
+^^^^^^^^^^
+.. automodule:: mmaction.models.losses
+ :members:
+
+necks
+^^^^^^^^^^^^
+.. automodule:: mmaction.models.necks
+ :members:
+
+roi_heads
+^^^^^^^^^^^^^
+.. automodule:: mmaction.models.roi_heads
+ :members:
+
+recognizers
+^^^^^^^^^^^^^
+.. automodule:: mmaction.models.recognizers
+ :members:
+
+task_modules
+^^^^^^^^^^^^^
+.. automodule:: mmaction.models.task_modules
+ :members:
+
+
+utils
+^^^^^^^^^^
+.. automodule:: mmaction.models.utils
+ :members:
+
+
+mmaction.structures
+--------------------
+
+structures
+^^^^^^^^^^^^^^^^^
+.. automodule:: mmaction.structures
+ :members:
+
+bbox
+^^^^^^^^^^
+.. automodule:: mmaction.structures.bbox
+ :members:
+
+
+mmaction.testing
+----------------
+.. automodule:: mmaction.testing
+ :members:
+
+mmaction.visualization
+----------------------
+.. automodule:: mmaction.visualization
+ :members:
+
+mmaction.utils
+--------------
+.. automodule:: mmaction.utils
+ :members:
diff --git a/docs/en/conf.py b/docs/en/conf.py
index 6ff7f10029..6623d99b45 100644
--- a/docs/en/conf.py
+++ b/docs/en/conf.py
@@ -17,7 +17,7 @@
import pytorch_sphinx_theme
-sys.path.insert(0, os.path.abspath('..'))
+sys.path.insert(0, os.path.abspath('../..'))
# -- Project information -----------------------------------------------------
@@ -124,6 +124,7 @@ def get_version():
html_css_files = ['css/readthedocs.css']
myst_enable_extensions = ['colon_fence']
+myst_heading_anchors = 3
def builder_inited_handler(app):
diff --git a/docs/en/notes/contribution_guide.md b/docs/en/get_started/contribution_guide.md
similarity index 81%
rename from docs/en/notes/contribution_guide.md
rename to docs/en/get_started/contribution_guide.md
index 92548868d2..02f2aa35d4 100644
--- a/docs/en/notes/contribution_guide.md
+++ b/docs/en/get_started/contribution_guide.md
@@ -1,10 +1,11 @@
-# Contributing to MMAction2
+# How to contribute to MMAction2
All kinds of contributions are welcome, including but not limited to the following.
- Fixes (typo, bugs)
- New features and components
- Add documentation or translate the documentation into other languages
+- Add a new project (Recommended) about a video understanding algorithm with fewer restrictions; refer to [here](/projects/README.md) for details
## Workflow
@@ -33,10 +34,11 @@ We use the following tools for linting and formatting:
- [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files.
- [docformatter](https://github.com/myint/docformatter): A formatter to format docstring.
-Style configurations of yapf and isort can be found in [setup.cfg](../../../setup.cfg).
+Style configurations of yapf and isort can be found in [setup.cfg](https://github.com/open-mmlab/mmaction2/blob/1.x/setup.cfg).
-We use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`, fixes `end-of-files`, sorts `requirments.txt` automatically on every commit.
-The config for a pre-commit hook is stored in [.pre-commit-config](../../../.pre-commit-config.yaml).
+We use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`,
+fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, and sorts `requirements.txt` automatically on every commit.
+The config for a pre-commit hook is stored in [.pre-commit-config](https://github.com/open-mmlab/mmaction2/blob/1.x/.pre-commit-config.yaml).
After you clone the repository, you will need to install initialize pre-commit hook.
diff --git a/docs/en/notes/faq.md b/docs/en/get_started/faq.md
similarity index 98%
rename from docs/en/notes/faq.md
rename to docs/en/get_started/faq.md
index 4f028d5b4c..7ef9cdd53e 100644
--- a/docs/en/notes/faq.md
+++ b/docs/en/get_started/faq.md
@@ -30,7 +30,7 @@ If the contents here do not cover your issue, please create an issue using the [
- **"Why I got the error message 'Please install XXCODEBASE to use XXX' even if I have already installed XXCODEBASE?"**
- You got that error message because our project failed to import a function or a class from XXCODEBASE. You can try to run the corresponding line to see what happens. One possible reason is, for some codebases in OpenMMLAB, you need to install mmcv and mmengine before you install them. You could follow this [tutorial](https://mmaction2.readthedocs.io/en/1.x/get_started.html#installation) to install them.
+ You got that error message because our project failed to import a function or a class from XXCODEBASE. You can try to run the corresponding line to see what happens. One possible reason is, for some codebases in OpenMMLAB, you need to install mmcv and mmengine before you install them. You could follow this [tutorial](https://mmaction2.readthedocs.io/en/latest/get_started.html#installation) to install them.
## Data
@@ -88,7 +88,7 @@ If the contents here do not cover your issue, please create an issue using the [
- **How to set `load_from` value in config files to finetune models?**
- In MMAction2, We set `load_from=None` as default in `configs/_base_/default_runtime.py` and owing to [inheritance design](/docs/en/user_guides/1_config.md),
+ In MMAction2, We set `load_from=None` as default in `configs/_base_/default_runtime.py` and owing to [inheritance design](/docs/en/user_guides/config.md),
users can directly change it by setting `load_from` in their configs.
## Testing
diff --git a/docs/en/get_started/guide_to_framework.md b/docs/en/get_started/guide_to_framework.md
new file mode 100644
index 0000000000..ab66ba196f
--- /dev/null
+++ b/docs/en/get_started/guide_to_framework.md
@@ -0,0 +1,760 @@
+# A 20-Minute Guide to MMAction2 Framework
+
+In this tutorial, we will demonstrate the overall architecture of our `MMACTION2 1.0` through a step-by-step example of video action recognition.
+
+The structure of this tutorial is as follows:
+
+- [A 20-Minute Guide to MMAction2 Framework](#a-20-minute-guide-to-mmaction2-framework)
+ - [Step0: Prepare Data](#step0-prepare-data)
+ - [Step1: Build a Pipeline](#step1-build-a-pipeline)
+ - [Step2: Build a Dataset and DataLoader](#step2-build-a-dataset-and-dataloader)
+ - [Step3: Build a Recognizer](#step3-build-a-recognizer)
+  - [Step4: Build an Evaluation Metric](#step4-build-an-evaluation-metric)
+ - [Step5: Train and Test with Native PyTorch](#step5-train-and-test-with-native-pytorch)
+ - [Step6: Train and Test with MMEngine (Recommended)](#step6-train-and-test-with-mmengine-recommended)
+
+First, we need to initialize the `scope` for registry, to ensure that each module is registered under the scope of `mmaction`. For more detailed information about registry, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/registry.html).
+
+```python
+from mmaction.utils import register_all_modules
+
+register_all_modules(init_default_scope=True)
+```
+
+## Step0: Prepare Data
+
+Please download our self-made [kinetics400_tiny](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) dataset and extract it to the `$MMACTION2/data` directory.
+The directory structure after extraction should be as follows:
+
+```
+mmaction2
+├── data
+│ ├── kinetics400_tiny
+│ │ ├── kinetics_tiny_train_video.txt
+│ │ ├── kinetics_tiny_val_video.txt
+│ │ ├── train
+│ │ │ ├── 27_CSXByd3s.mp4
+│ │ │ ├── 34XczvTaRiI.mp4
+│ │ │ ├── A-wiliK50Zw.mp4
+│ │ │ ├── ...
+│ │ └── val
+│ │ ├── 0pVGiAU6XEA.mp4
+│ │ ├── AQrbRSnRt8M.mp4
+│ │ ├── ...
+```
+
+Here are some examples from the annotation file `kinetics_tiny_train_video.txt`:
+
+```
+D32_1gwq35E.mp4 0
+iRuyZSKhHRg.mp4 1
+oXy-e_P_cAI.mp4 0
+34XczvTaRiI.mp4 1
+h2YqqUhnR34.mp4 0
+```
+
+Each line in the file represents the annotation of a video, where the first item denotes the video filename (e.g., `D32_1gwq35E.mp4`), and the second item represents the corresponding label (e.g., label `0` for `D32_1gwq35E.mp4`). In this dataset, there are only `two` categories.
+
+## Step1: Build a Pipeline
+
+In order to `decode`, `sample`, `resize`, `crop`, `format`, and `pack` the input video and corresponding annotation, we need to design a pipeline to handle these processes. Specifically, we design seven `Transform` classes to build this video processing pipeline. Note that all `Transform` classes in OpenMMLab must inherit from the `BaseTransform` class in `mmcv`, implement the abstract method `transform`, and be registered to the `TRANSFORMS` registry. For more detailed information about data transform, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/data_transform.html).
+
+```python
+import mmcv
+import decord
+import numpy as np
+from mmcv.transforms import TRANSFORMS, BaseTransform, to_tensor
+from mmaction.structures import ActionDataSample
+
+
+@TRANSFORMS.register_module()
+class VideoInit(BaseTransform):
+ def transform(self, results):
+ container = decord.VideoReader(results['filename'])
+ results['total_frames'] = len(container)
+ results['video_reader'] = container
+ return results
+
+
+@TRANSFORMS.register_module()
+class VideoSample(BaseTransform):
+ def __init__(self, clip_len, num_clips, test_mode=False):
+ self.clip_len = clip_len
+ self.num_clips = num_clips
+ self.test_mode = test_mode
+
+ def transform(self, results):
+ total_frames = results['total_frames']
+ interval = total_frames // self.clip_len
+
+ if self.test_mode:
+ # Make the sampling during testing deterministic
+ np.random.seed(42)
+
+ inds_of_all_clips = []
+ for i in range(self.num_clips):
+ bids = np.arange(self.clip_len) * interval
+ offset = np.random.randint(interval, size=bids.shape)
+ inds = bids + offset
+ inds_of_all_clips.append(inds)
+
+ results['frame_inds'] = np.concatenate(inds_of_all_clips)
+ results['clip_len'] = self.clip_len
+ results['num_clips'] = self.num_clips
+ return results
+
+
+@TRANSFORMS.register_module()
+class VideoDecode(BaseTransform):
+ def transform(self, results):
+ frame_inds = results['frame_inds']
+ container = results['video_reader']
+
+ imgs = container.get_batch(frame_inds).asnumpy()
+ imgs = list(imgs)
+
+ results['video_reader'] = None
+ del container
+
+ results['imgs'] = imgs
+ results['img_shape'] = imgs[0].shape[:2]
+ return results
+
+
+@TRANSFORMS.register_module()
+class VideoResize(BaseTransform):
+ def __init__(self, r_size):
+ self.r_size = (np.inf, r_size)
+
+ def transform(self, results):
+ img_h, img_w = results['img_shape']
+ new_w, new_h = mmcv.rescale_size((img_w, img_h), self.r_size)
+
+ imgs = [mmcv.imresize(img, (new_w, new_h))
+ for img in results['imgs']]
+ results['imgs'] = imgs
+ results['img_shape'] = imgs[0].shape[:2]
+ return results
+
+
+@TRANSFORMS.register_module()
+class VideoCrop(BaseTransform):
+ def __init__(self, c_size):
+ self.c_size = c_size
+
+ def transform(self, results):
+ img_h, img_w = results['img_shape']
+ center_x, center_y = img_w // 2, img_h // 2
+ x1, x2 = center_x - self.c_size // 2, center_x + self.c_size // 2
+ y1, y2 = center_y - self.c_size // 2, center_y + self.c_size // 2
+ imgs = [img[y1:y2, x1:x2] for img in results['imgs']]
+ results['imgs'] = imgs
+ results['img_shape'] = imgs[0].shape[:2]
+ return results
+
+
+@TRANSFORMS.register_module()
+class VideoFormat(BaseTransform):
+ def transform(self, results):
+ num_clips = results['num_clips']
+ clip_len = results['clip_len']
+ imgs = results['imgs']
+
+ # [num_clips*clip_len, H, W, C]
+ imgs = np.array(imgs)
+ # [num_clips, clip_len, H, W, C]
+ imgs = imgs.reshape((num_clips, clip_len) + imgs.shape[1:])
+ # [num_clips, C, clip_len, H, W]
+ imgs = imgs.transpose(0, 4, 1, 2, 3)
+
+ results['imgs'] = imgs
+ return results
+
+
+@TRANSFORMS.register_module()
+class VideoPack(BaseTransform):
+ def __init__(self, meta_keys=('img_shape', 'num_clips', 'clip_len')):
+ self.meta_keys = meta_keys
+
+ def transform(self, results):
+ packed_results = dict()
+ inputs = to_tensor(results['imgs'])
+ data_sample = ActionDataSample().set_gt_labels(results['label'])
+ metainfo = {k: results[k] for k in self.meta_keys if k in results}
+ data_sample.set_metainfo(metainfo)
+ packed_results['inputs'] = inputs
+ packed_results['data_samples'] = data_sample
+ return packed_results
+```
+
+Below, we provide a code snippet (using `D32_1gwq35E.mp4 0` from the annotation file) to demonstrate how to use the pipeline.
+
+```python
+import os.path as osp
+from mmengine.dataset import Compose
+
+pipeline_cfg = [
+ dict(type='VideoInit'),
+ dict(type='VideoSample', clip_len=16, num_clips=1, test_mode=False),
+ dict(type='VideoDecode'),
+ dict(type='VideoResize', r_size=256),
+ dict(type='VideoCrop', c_size=224),
+ dict(type='VideoFormat'),
+ dict(type='VideoPack')
+]
+
+pipeline = Compose(pipeline_cfg)
+data_prefix = 'data/kinetics400_tiny/train'
+results = dict(filename=osp.join(data_prefix, 'D32_1gwq35E.mp4'), label=0)
+packed_results = pipeline(results)
+
+inputs = packed_results['inputs']
+data_sample = packed_results['data_samples']
+
+print('shape of the inputs: ', inputs.shape)
+
+# Get metainfo of the inputs
+print('image_shape: ', data_sample.img_shape)
+print('num_clips: ', data_sample.num_clips)
+print('clip_len: ', data_sample.clip_len)
+
+# Get label of the inputs
+print('label: ', data_sample.gt_labels.item)
+```
+
+```
+shape of the inputs: torch.Size([1, 3, 16, 224, 224])
+image_shape: (224, 224)
+num_clips: 1
+clip_len: 16
+label: tensor([0])
+```
+
+## Step2: Build a Dataset and DataLoader
+
+All `Dataset` classes in OpenMMLab must inherit from the `BaseDataset` class in `mmengine`. We can customize the annotation loading process by overriding the `load_data_list` method. Additionally, we can add more information to the `results` dict that is passed as input to the `pipeline` by overriding the `get_data_info` method. For more detailed information about the `BaseDataset` class, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html).
+
+```python
+import os.path as osp
+from mmengine.fileio import list_from_file
+from mmengine.dataset import BaseDataset
+from mmaction.registry import DATASETS
+
+
+@DATASETS.register_module()
+class DatasetZelda(BaseDataset):
+ def __init__(self, ann_file, pipeline, data_root, data_prefix=dict(video=''),
+ test_mode=False, modality='RGB', **kwargs):
+ self.modality = modality
+ super(DatasetZelda, self).__init__(ann_file=ann_file, pipeline=pipeline, data_root=data_root,
+ data_prefix=data_prefix, test_mode=test_mode,
+ **kwargs)
+
+ def load_data_list(self):
+ data_list = []
+ fin = list_from_file(self.ann_file)
+ for line in fin:
+ line_split = line.strip().split()
+ filename, label = line_split
+ label = int(label)
+ filename = osp.join(self.data_prefix['video'], filename)
+ data_list.append(dict(filename=filename, label=label))
+ return data_list
+
+ def get_data_info(self, idx: int) -> dict:
+ data_info = super().get_data_info(idx)
+ data_info['modality'] = self.modality
+ return data_info
+```
+
+Next, we will demonstrate how to use dataset and dataloader to index data. We will use the `Runner.build_dataloader` method to construct the dataloader. For more detailed information about dataloader, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/dataset.html#details-on-dataloader).
+
+```python
+from mmaction.registry import DATASETS
+
+train_pipeline_cfg = [
+ dict(type='VideoInit'),
+ dict(type='VideoSample', clip_len=16, num_clips=1, test_mode=False),
+ dict(type='VideoDecode'),
+ dict(type='VideoResize', r_size=256),
+ dict(type='VideoCrop', c_size=224),
+ dict(type='VideoFormat'),
+ dict(type='VideoPack')
+]
+
+val_pipeline_cfg = [
+ dict(type='VideoInit'),
+ dict(type='VideoSample', clip_len=16, num_clips=5, test_mode=True),
+ dict(type='VideoDecode'),
+ dict(type='VideoResize', r_size=256),
+ dict(type='VideoCrop', c_size=224),
+ dict(type='VideoFormat'),
+ dict(type='VideoPack')
+]
+
+train_dataset_cfg = dict(
+ type='DatasetZelda',
+ ann_file='kinetics_tiny_train_video.txt',
+ pipeline=train_pipeline_cfg,
+ data_root='data/kinetics400_tiny/',
+ data_prefix=dict(video='train'))
+
+val_dataset_cfg = dict(
+ type='DatasetZelda',
+ ann_file='kinetics_tiny_val_video.txt',
+ pipeline=val_pipeline_cfg,
+ data_root='data/kinetics400_tiny/',
+ data_prefix=dict(video='val'))
+
+train_dataset = DATASETS.build(train_dataset_cfg)
+
+packed_results = train_dataset[0]
+
+inputs = packed_results['inputs']
+data_sample = packed_results['data_samples']
+
+print('shape of the inputs: ', inputs.shape)
+
+# Get metainfo of the inputs
+print('image_shape: ', data_sample.img_shape)
+print('num_clips: ', data_sample.num_clips)
+print('clip_len: ', data_sample.clip_len)
+
+# Get label of the inputs
+print('label: ', data_sample.gt_labels.item)
+
+from mmengine.runner import Runner
+
+BATCH_SIZE = 2
+
+train_dataloader_cfg = dict(
+ batch_size=BATCH_SIZE,
+ num_workers=0,
+ persistent_workers=False,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=train_dataset_cfg)
+
+val_dataloader_cfg = dict(
+ batch_size=BATCH_SIZE,
+ num_workers=0,
+ persistent_workers=False,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=val_dataset_cfg)
+
+train_data_loader = Runner.build_dataloader(dataloader=train_dataloader_cfg)
+val_data_loader = Runner.build_dataloader(dataloader=val_dataloader_cfg)
+
+batched_packed_results = next(iter(train_data_loader))
+
+batched_inputs = batched_packed_results['inputs']
+batched_data_sample = batched_packed_results['data_samples']
+
+assert len(batched_inputs) == BATCH_SIZE
+assert len(batched_data_sample) == BATCH_SIZE
+```
+
+The terminal output should be the same as the one shown in the [Step1: Build a Pipeline](#step1-build-a-pipeline).
+
+## Step3: Build a Recognizer
+
+Next, we will construct the `recognizer`, which mainly consists of three parts: `data preprocessor` for batching and normalizing the data, `backbone` for feature extraction, and `cls_head` for classification.
+
+The implementation of `data_preprocessor` is as follows:
+
+```python
+import torch
+from mmengine.model import BaseDataPreprocessor, stack_batch
+from mmaction.registry import MODELS
+
+
+@MODELS.register_module()
+class DataPreprocessorZelda(BaseDataPreprocessor):
+ def __init__(self, mean, std):
+ super().__init__()
+
+ self.register_buffer(
+ 'mean',
+ torch.tensor(mean, dtype=torch.float32).view(-1, 1, 1, 1),
+ False)
+ self.register_buffer(
+ 'std',
+ torch.tensor(std, dtype=torch.float32).view(-1, 1, 1, 1),
+ False)
+
+ def forward(self, data, training=False):
+ data = self.cast_data(data)
+ inputs = data['inputs']
+ batch_inputs = stack_batch(inputs) # Batching
+ batch_inputs = (batch_inputs - self.mean) / self.std # Normalization
+ data['inputs'] = batch_inputs
+ return data
+```
+
+Here is how to use the `data_preprocessor`: feed the `batched_packed_results` obtained in [Step2: Build a Dataset and DataLoader](#step2-build-a-dataset-and-dataloader) into the `data_preprocessor` for batching and normalization.
+
+```python
+from mmaction.registry import MODELS
+
+data_preprocessor_cfg = dict(
+ type='DataPreprocessorZelda',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375])
+
+data_preprocessor = MODELS.build(data_preprocessor_cfg)
+
+preprocessed_inputs = data_preprocessor(batched_packed_results)
+print(preprocessed_inputs['inputs'].shape)
+```
+
+```
+torch.Size([2, 1, 3, 16, 224, 224])
+```
+
+The implementations of `backbone`, `cls_head` and `recognizer` are as follows:
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.model import BaseModel, BaseModule, Sequential
+from mmengine.structures import LabelData
+from mmaction.registry import MODELS
+
+
+@MODELS.register_module()
+class BackBoneZelda(BaseModule):
+ def __init__(self, init_cfg=None):
+ if init_cfg is None:
+ init_cfg = [dict(type='Kaiming', layer='Conv3d', mode='fan_out', nonlinearity="relu"),
+ dict(type='Constant', layer='BatchNorm3d', val=1, bias=0)]
+
+ super(BackBoneZelda, self).__init__(init_cfg=init_cfg)
+
+ self.conv1 = Sequential(nn.Conv3d(3, 64, kernel_size=(3, 7, 7),
+ stride=(1, 2, 2), padding=(1, 3, 3)),
+ nn.BatchNorm3d(64), nn.ReLU())
+ self.maxpool = nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2),
+ padding=(0, 1, 1))
+
+ self.conv = Sequential(nn.Conv3d(64, 128, kernel_size=3, stride=2, padding=1),
+ nn.BatchNorm3d(128), nn.ReLU())
+
+ def forward(self, imgs):
+ # imgs: [batch_size*num_views, 3, T, H, W]
+ # features: [batch_size*num_views, 128, T/2, H//8, W//8]
+ features = self.conv(self.maxpool(self.conv1(imgs)))
+ return features
+
+
+@MODELS.register_module()
+class ClsHeadZelda(BaseModule):
+ def __init__(self, num_classes, in_channels, dropout=0.5, average_clips='prob', init_cfg=None):
+ if init_cfg is None:
+ init_cfg = dict(type='Normal', layer='Linear', std=0.01)
+
+ super(ClsHeadZelda, self).__init__(init_cfg=init_cfg)
+
+ self.num_classes = num_classes
+ self.in_channels = in_channels
+ self.average_clips = average_clips
+
+ if dropout != 0:
+ self.dropout = nn.Dropout(dropout)
+ else:
+ self.dropout = None
+
+ self.fc = nn.Linear(self.in_channels, self.num_classes)
+ self.pool = nn.AdaptiveAvgPool3d(1)
+ self.loss_fn = nn.CrossEntropyLoss()
+
+ def forward(self, x):
+ N, C, T, H, W = x.shape
+ x = self.pool(x)
+ x = x.view(N, C)
+ assert x.shape[1] == self.in_channels
+
+ if self.dropout is not None:
+ x = self.dropout(x)
+
+ cls_scores = self.fc(x)
+ return cls_scores
+
+ def loss(self, feats, data_samples):
+ cls_scores = self(feats)
+ labels = torch.stack([x.gt_labels.item for x in data_samples])
+ labels = labels.squeeze()
+
+ if labels.shape == torch.Size([]):
+ labels = labels.unsqueeze(0)
+
+ loss_cls = self.loss_fn(cls_scores, labels)
+ return dict(loss_cls=loss_cls)
+
+ def predict(self, feats, data_samples):
+ cls_scores = self(feats)
+ num_views = cls_scores.shape[0] // len(data_samples)
+ # assert num_views == data_samples[0].num_clips
+ cls_scores = self.average_clip(cls_scores, num_views)
+
+ for ds, sc in zip(data_samples, cls_scores):
+ pred = LabelData(item=sc)
+ ds.pred_scores = pred
+ return data_samples
+
+ def average_clip(self, cls_scores, num_views):
+ if self.average_clips not in ['score', 'prob', None]:
+ raise ValueError(f'{self.average_clips} is not supported. '
+ f'Currently supported ones are '
+ f'["score", "prob", None]')
+
+ total_views = cls_scores.shape[0]
+ cls_scores = cls_scores.view(total_views // num_views, num_views, -1)
+
+ if self.average_clips is None:
+ return cls_scores
+ elif self.average_clips == 'prob':
+ cls_scores = F.softmax(cls_scores, dim=2).mean(dim=1)
+ elif self.average_clips == 'score':
+ cls_scores = cls_scores.mean(dim=1)
+
+ return cls_scores
+
+
+@MODELS.register_module()
+class RecognizerZelda(BaseModel):
+ def __init__(self, backbone, cls_head, data_preprocessor):
+ super().__init__(data_preprocessor=data_preprocessor)
+
+ self.backbone = MODELS.build(backbone)
+ self.cls_head = MODELS.build(cls_head)
+
+ def extract_feat(self, inputs):
+ inputs = inputs.view((-1, ) + inputs.shape[2:])
+ return self.backbone(inputs)
+
+ def loss(self, inputs, data_samples):
+ feats = self.extract_feat(inputs)
+ loss = self.cls_head.loss(feats, data_samples)
+ return loss
+
+ def predict(self, inputs, data_samples):
+ feats = self.extract_feat(inputs)
+ predictions = self.cls_head.predict(feats, data_samples)
+ return predictions
+
+ def forward(self, inputs, data_samples=None, mode='tensor'):
+ if mode == 'tensor':
+ return self.extract_feat(inputs)
+ elif mode == 'loss':
+ return self.loss(inputs, data_samples)
+ elif mode == 'predict':
+ return self.predict(inputs, data_samples)
+ else:
+ raise RuntimeError(f'Invalid mode: {mode}')
+```
+
+The `init_cfg` is used for model weight initialization. For more information on model weight initialization, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/initialize.html). The usage of the above modules is as follows:
+
+```python
+import torch
+import copy
+from mmaction.registry import MODELS
+
+model_cfg = dict(
+ type='RecognizerZelda',
+ backbone=dict(type='BackBoneZelda'),
+ cls_head=dict(
+ type='ClsHeadZelda',
+ num_classes=2,
+ in_channels=128,
+ average_clips='prob'),
+ data_preprocessor = dict(
+ type='DataPreprocessorZelda',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375]))
+
+model = MODELS.build(model_cfg)
+
+# Train
+model.train()
+model.init_weights()
+data_batch_train = copy.deepcopy(batched_packed_results)
+data = model.data_preprocessor(data_batch_train, training=True)
+loss = model(**data, mode='loss')
+print('loss dict: ', loss)
+
+# Test
+with torch.no_grad():
+ model.eval()
+ data_batch_test = copy.deepcopy(batched_packed_results)
+ data = model.data_preprocessor(data_batch_test, training=False)
+ predictions = model(**data, mode='predict')
+print('Label of Sample[0]', predictions[0].gt_labels.item)
+print('Scores of Sample[0]', predictions[0].pred_scores.item)
+```
+
+```shell
+04/03 23:28:01 - mmengine - INFO -
+backbone.conv1.0.weight - torch.Size([64, 3, 3, 7, 7]):
+KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0
+
+04/03 23:28:01 - mmengine - INFO -
+backbone.conv1.0.bias - torch.Size([64]):
+KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0
+
+04/03 23:28:01 - mmengine - INFO -
+backbone.conv1.1.weight - torch.Size([64]):
+The value is the same before and after calling `init_weights` of RecognizerZelda
+
+04/03 23:28:01 - mmengine - INFO -
+backbone.conv1.1.bias - torch.Size([64]):
+The value is the same before and after calling `init_weights` of RecognizerZelda
+
+04/03 23:28:01 - mmengine - INFO -
+backbone.conv.0.weight - torch.Size([128, 64, 3, 3, 3]):
+KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0
+
+04/03 23:28:01 - mmengine - INFO -
+backbone.conv.0.bias - torch.Size([128]):
+KaimingInit: a=0, mode=fan_out, nonlinearity=relu, distribution =normal, bias=0
+
+04/03 23:28:01 - mmengine - INFO -
+backbone.conv.1.weight - torch.Size([128]):
+The value is the same before and after calling `init_weights` of RecognizerZelda
+
+04/03 23:28:01 - mmengine - INFO -
+backbone.conv.1.bias - torch.Size([128]):
+The value is the same before and after calling `init_weights` of RecognizerZelda
+
+04/03 23:28:01 - mmengine - INFO -
+cls_head.fc.weight - torch.Size([2, 128]):
+NormalInit: mean=0, std=0.01, bias=0
+
+04/03 23:28:01 - mmengine - INFO -
+cls_head.fc.bias - torch.Size([2]):
+NormalInit: mean=0, std=0.01, bias=0
+
+loss dict: {'loss_cls': tensor(0.6853, grad_fn=)}
+Label of Sample[0] tensor([0])
+Scores of Sample[0] tensor([0.5240, 0.4760])
+```
+
+## Step4: Build an Evaluation Metric
+
+Note that all `Metric` classes in `OpenMMLab` must inherit from the `BaseMetric` class in `mmengine` and implement the abstract methods, `process` and `compute_metrics`. For more information on evaluation, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/evaluation.html).
+
+```python
+import copy
+from collections import OrderedDict
+from mmengine.evaluator import BaseMetric
+from mmaction.evaluation import top_k_accuracy
+from mmaction.registry import METRICS
+
+
+@METRICS.register_module()
+class AccuracyMetric(BaseMetric):
+ def __init__(self, topk=(1, 5), collect_device='cpu', prefix='acc'):
+ super().__init__(collect_device=collect_device, prefix=prefix)
+ self.topk = topk
+
+ def process(self, data_batch, data_samples):
+ data_samples = copy.deepcopy(data_samples)
+ for data_sample in data_samples:
+ result = dict()
+ scores = data_sample['pred_scores']['item'].cpu().numpy()
+ label = data_sample['gt_labels']['item'].item()
+ result['scores'] = scores
+ result['label'] = label
+ self.results.append(result)
+
+ def compute_metrics(self, results: list) -> dict:
+ eval_results = OrderedDict()
+ labels = [res['label'] for res in results]
+ scores = [res['scores'] for res in results]
+ topk_acc = top_k_accuracy(scores, labels, self.topk)
+ for k, acc in zip(self.topk, topk_acc):
+ eval_results[f'topk{k}'] = acc
+ return eval_results
+```
+
+```python
+from mmaction.registry import METRICS
+
+metric_cfg = dict(type='AccuracyMetric', topk=(1, 5))
+
+metric = METRICS.build(metric_cfg)
+
+data_samples = [d.to_dict() for d in predictions]
+
+metric.process(batched_packed_results, data_samples)
+acc = metric.compute_metrics(metric.results)
+print(acc)
+```
+
+```shell
+OrderedDict([('topk1', 0.5), ('topk5', 1.0)])
+```
+
+## Step5: Train and Test with Native PyTorch
+
+```python
+import torch.optim as optim
+from mmengine import track_iter_progress
+
+
+device = 'cuda' # or 'cpu'
+max_epochs = 10
+
+optimizer = optim.Adam(model.parameters(), lr=0.01)
+
+for epoch in range(max_epochs):
+ model.train()
+ losses = []
+ for data_batch in track_iter_progress(train_data_loader):
+ data = model.data_preprocessor(data_batch, training=True)
+ loss_dict = model(**data, mode='loss')
+ loss = loss_dict['loss_cls']
+
+ optimizer.zero_grad()
+ loss.backward()
+ optimizer.step()
+
+ losses.append(loss.item())
+
+ print(f'Epoch[{epoch}]: loss ', sum(losses) / len(train_data_loader))
+
+ with torch.no_grad():
+ model.eval()
+ for data_batch in track_iter_progress(val_data_loader):
+ data = model.data_preprocessor(data_batch, training=False)
+ predictions = model(**data, mode='predict')
+ data_samples = [d.to_dict() for d in predictions]
+ metric.process(data_batch, data_samples)
+
+    acc = metric.compute_metrics(metric.results)
+ for name, topk in acc.items():
+ print(f'{name}: ', topk)
+```
+
+## Step6: Train and Test with MMEngine (Recommended)
+
+For more details on training and testing, you can refer to [MMAction2 Tutorial](https://mmaction2.readthedocs.io/en/latest/user_guides/4_train_test.html). For more information on `Runner`, please refer to [MMEngine Tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html).
+
+```python
+from mmengine.runner import Runner
+
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=1)
+val_cfg = dict(type='ValLoop')
+
+optim_wrapper = dict(optimizer=dict(type='Adam', lr=0.01))
+
+runner = Runner(model=model_cfg, work_dir='./work_dirs/guide',
+ train_dataloader=train_dataloader_cfg,
+ train_cfg=train_cfg,
+ val_dataloader=val_dataloader_cfg,
+ val_cfg=val_cfg,
+ optim_wrapper=optim_wrapper,
+ val_evaluator=[metric_cfg],
+ default_scope='mmaction')
+runner.train()
+```
diff --git a/docs/en/get_started.md b/docs/en/get_started/installation.md
similarity index 93%
rename from docs/en/get_started.md
rename to docs/en/get_started/installation.md
index 0f0ac1c5ec..294bf17443 100644
--- a/docs/en/get_started.md
+++ b/docs/en/get_started/installation.md
@@ -1,4 +1,6 @@
-# Prerequisites
+# Installation
+
+## Prerequisites
In this section we demonstrate how to prepare an environment with PyTorch.
@@ -35,27 +37,25 @@ On CPU platforms:
conda install pytorch torchvision cpuonly -c pytorch
```
-# Installation
+## Best Practices
We recommend that users follow our best practices to install MMAction2. However, the whole process is highly customizable. See [Customize Installation](#customize-installation) section for more information.
-## Best Practices
-
**Step 1.** Install [MMEngine](https://github.com/open-mmlab/mmengine) and [MMCV](https://github.com/open-mmlab/mmcv) using [MIM](https://github.com/open-mmlab/mim).
```shell
pip install -U openmim
-mim install mmengine 'mmcv>=2.0.0rc1'
+mim install mmengine 'mmcv>=2.0.0'
```
**Step 2.** Install MMAction2.
According to your needs, we support two install modes:
-- [Install from source (Recommended)](#install-from-source): You want to develop your own action recognition task or new features on MMAction2 framework. For example, adding new dataset or new models. Thus, you can use all tools we provided.
+- [Install from source (Recommended)](#build-mmaction2-from-source): You want to develop your own action recognition task or new features on MMAction2 framework. For example, adding new dataset or new models. Thus, you can use all tools we provided.
- [Install as a Python package](#install-as-a-python-package): You just want to call MMAction2's APIs or import MMAction2's modules in your project.
-### Install from source
+### Build MMAction2 from source
In this case, install mmaction2 from source:
@@ -80,7 +80,7 @@ git checkout dev-1.x
Just install with pip.
```shell
-pip install "mmaction2>=1.0rc0"
+pip install "mmaction2>=1.0.0"
```
## Verify the installation
@@ -167,7 +167,7 @@ This requires manually specifying a find-url based on PyTorch version and its CU
For example, the following command install mmcv built for PyTorch 1.10.x and CUDA 11.3.
```shell
-pip install 'mmcv>=2.0.0rc1' -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html
+pip install 'mmcv>=2.0.0' -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html
```
### Install on CPU-only platforms
@@ -193,3 +193,7 @@ Run it with
```shell
docker run --gpus all --shm-size=8g -it -v {DATA_DIR}:/mmaction2/data mmaction2
```
+
+## Troubleshooting
+
+coming soon...
diff --git a/docs/en/get_started/overview.md b/docs/en/get_started/overview.md
new file mode 100644
index 0000000000..0ddc07b275
--- /dev/null
+++ b/docs/en/get_started/overview.md
@@ -0,0 +1,97 @@
+# Overview
+
+## What is MMAction2
+
+MMAction2 is an open source toolkit based on PyTorch, supporting numerous video understanding models, including action recognition, skeleton-based action recognition, spatio-temporal action detection and temporal action localization. In addition, it supports widely-used academic datasets and provides many useful tools, assisting users in exploring various aspects of models and datasets and implementing high-quality algorithms. Generally, it has the following features.
+
+- **One-stop, Multi-model**: MMAction2 supports various video understanding tasks and implements the latest models for action recognition, localization, detection.
+- **Modular Design**: MMAction2’s modular design allows users to define and reuse modules in the model on demand.
+- **Various Useful Tools**: MMAction2 provides many analysis tools, including visualizers, validation scripts, evaluators, etc., to help users troubleshoot, finetune or compare models.
+- **Powered by OpenMMLab**: Like other algorithm libraries in OpenMMLab family, MMAction2 follows OpenMMLab’s rigorous development guidelines and interface conventions, significantly reducing the learning cost of users familiar with other projects in OpenMMLab family. In addition, benefiting from the unified interfaces among OpenMMLab, you can easily call the models implemented in other OpenMMLab projects (e.g. MMClassification) in MMAction2, facilitating cross-domain research and real-world applications.
+
+*(Demo gallery: Action Recognition | Skeleton-based Action Recognition | Spatio-Temporal Action Detection | Spatio-Temporal Action Detection)*
+
+## How to use the documentation
+
+We have prepared a wealth of documents to meet your various needs:
+
+
+**For the basic usage of MMAction2**
+
+- [Installation](installation.md)
+- [Quick Run](quick_run.md)
+- [Inference](../user_guides/Inference.md)
+
+
+
+
+**For training on supported datasets**
+
+- [learn about configs](../user_guides/config.md)
+- [prepare dataset](../user_guides/prepare_dataset.md)
+- [Training and testing](../user_guides/train_test.md)
+
+
+
+
+**For looking up common issues**
+
+- [FAQs](faq.md)
+- [Useful tools](../useful_tools.md)
+
+
+
+
+**For a general understanding of MMAction2**
+
+- [20-minute tour to MMAction2](guide_to_framework.md)
+- [Data flow in MMAction2](../advanced_guides/dataflow.md)
+
+
+
+
+**For advanced usage about custom training**
+
+- [Customize models](../advanced_guides/customize_models.md)
+- [Customize datasets](../advanced_guides/customize_dataset.md)
+- [Customize data transformation and augmentation](../advanced_guides/customize_pipeline.md)
+- [Customize optimizer and scheduler](../advanced_guides/customize_optimizer.md)
+- [Customize logging](../advanced_guides/customize_logging.md)
+
+
+
+
+**For supported model zoo and dataset zoo**
+
+- [Model Zoo](../model_zoo/modelzoo.md)
+- [Dataset Zoo](../datasetzoo.md)
+
+
+
+
+**For migration from MMAction2 0.x**
+
+- [Migration](../migration.md)
+
+
+
+
+**For researchers and developers who are willing to contribute to MMAction2**
+
+- [Contribution Guide](contribution_guide.md)
+
+
diff --git a/docs/en/get_started/quick_run.md b/docs/en/get_started/quick_run.md
new file mode 100644
index 0000000000..84ae5b985f
--- /dev/null
+++ b/docs/en/get_started/quick_run.md
@@ -0,0 +1,221 @@
+# Quick Run
+
+This chapter will take you through the basic functions of MMAction2, and we assume you [installed MMAction2 from source](installation.md#best-practices).
+
+- [Quick Run](#quick-run)
+ - [Inference](#inference)
+ - [Prepare a Dataset](#prepare-a-dataset)
+ - [Modify the Config](#modify-the-config)
+ - [Modify Dataset](#modify-dataset)
+ - [Modify Runtime Config](#modify-runtime-config)
+ - [Modify Model Config](#modify-model-config)
+ - [Browse the Dataset](#browse-the-dataset)
+ - [Training](#training)
+ - [Testing](#testing)
+
+## Inference
+
+Run the following in MMAction2's root directory:
+
+```shell
+python demo/demo_inferencer.py demo/demo.mp4 \
+ --rec tsn --print-result \
+ --label-file tools/data/kinetics/label_map_k400.txt
+```
+
+You should be able to see a pop-up video and the inference result printed out in the console.
+
+```bash
+# Inference result
+{'predictions': [{'rec_labels': [[6]], 'rec_scores': [[...]]}]}
+```
+
+```{note}
+If you are running MMAction2 on a server without GUI or via SSH tunnel with X11 forwarding disabled, you may not see the pop-up window.
+```
+
+A detailed description of MMAction2's inference interface can be found [here](/demo/README.md#inferencer).
+
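+If you prefer to call the inference API from Python rather than the demo script, a minimal sketch is shown below. The checkpoint path is a placeholder for a checkpoint you have downloaded yourself; `init_recognizer` and `inference_recognizer` come from `mmaction.apis`.
+
+```python
+from mmaction.apis import inference_recognizer, init_recognizer
+
+config_file = 'configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py'
+checkpoint_file = 'checkpoints/tsn_kinetics400.pth'  # hypothetical local path to a downloaded checkpoint
+
+# build the recognizer from the config and load the checkpoint
+model = init_recognizer(config_file, checkpoint_file, device='cpu')  # or device='cuda:0'
+
+# run inference on the demo video; the result is an `ActionDataSample`
+result = inference_recognizer(model, 'demo/demo.mp4')
+print(result)
+```
+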
+In addition to using our well-provided pre-trained models, you can also train models on your own datasets. In the next section, we will take you through the basic functions of MMAction2 by training TSN on the tiny [Kinetics](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) dataset as an example.
+
+## Prepare a Dataset
+
+Since the variety of video dataset formats is not conducive to switching between datasets, MMAction2 proposes a uniform [data format](../user_guides/prepare_dataset.md), and provides a [dataset preparer](../user_guides/data_prepare/dataset_preparer.md) for commonly used video datasets. Usually, to use those datasets in MMAction2, you just need to follow the steps to get them ready for use.
+
+```{note}
+For this quick run, however, we skip the full preparation and use a tiny pre-prepared subset to save time.
+```
+
+Here, we have prepared a lite version of the Kinetics dataset for demonstration purposes. Download our pre-prepared [zip](https://download.openmmlab.com/mmaction/kinetics400_tiny.zip) and extract it to the `data/` directory under mmaction2 to get our prepared videos and annotation files.
+
+```Bash
+wget https://download.openmmlab.com/mmaction/kinetics400_tiny.zip
+mkdir -p data/
+unzip kinetics400_tiny.zip -d data/
+```
+
+## Modify the Config
+
+Once the dataset is prepared, we will then specify the location of the training set and the training parameters by modifying the config file.
+
+In this example, we will train a TSN model using ResNet50 as its backbone. Since MMAction2 already has a config file for the full Kinetics400 dataset (`configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py`), we just need to make some modifications on top of it.
+
+### Modify Dataset
+
+We first need to modify the path to the dataset. Open `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` and replace the following keys:
+
+```Python
+data_root = 'data/kinetics400_tiny/train'
+data_root_val = 'data/kinetics400_tiny/val'
+ann_file_train = 'data/kinetics400_tiny/kinetics_tiny_train_video.txt'
+ann_file_val = 'data/kinetics400_tiny/kinetics_tiny_val_video.txt'
+```
+
+### Modify Runtime Config
+
+Also, because of the reduced dataset size, we'd better reduce the training batch size to 4 and the number of training epochs to 10 accordingly, shorten the validation interval and the checkpoint-saving interval to 3 epochs, and modify the learning rate decay strategy. Modify the corresponding keys in `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py` as shown below to take effect.
+
+```Python
+# set training batch size to 4
+train_dataloader['batch_size'] = 4
+
+# Save a checkpoint every 3 epochs, and only keep the latest checkpoint
+default_hooks = dict(
+ checkpoint=dict(type='CheckpointHook', interval=3, max_keep_ckpts=1,),
+ )
+# Set the maximum number of epochs to 10, and validate the model every 3 epochs
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=3)
+# adjust learning rate schedule according to 10 epochs
+param_scheduler = [
+ dict(
+ type='MultiStepLR',
+ begin=0,
+ end=10,
+ by_epoch=True,
+ milestones=[4, 8],
+ gamma=0.1)
+]
+```
+
+### Modify Model Config
+
+Further, due to the small size of the tiny Kinetics dataset, it is better to load a model pre-trained on the original Kinetics dataset. We also need to modify the model according to the actual number of classes. Simply put the following lines into `configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py`.
+
+```Python
+
+model = dict(
+ cls_head=dict(num_classes=2))
+load_from = 'https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20220906-cd10898e.pth'
+```
+
+Here, we have overridden the corresponding parameters of the base configuration directly through the inheritance mechanism of the config system ({external+mmengine:doc}`MMEngine: Config`). The original fields are distributed in `configs/_base_/models/tsn_r50.py`, `configs/_base_/schedules/sgd_100e.py` and `configs/_base_/default_runtime.py`.
+
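+If you want to double-check the merged configuration, you can load it with MMEngine's `Config` (a minimal sketch; the printed values assume you applied the modifications above):
+
+```python
+from mmengine.config import Config
+
+cfg = Config.fromfile(
+    'configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py')
+
+# inspect the keys we just modified
+print(cfg.train_dataloader.batch_size)   # expected: 4
+print(cfg.train_cfg.max_epochs)          # expected: 10
+print(cfg.model.cls_head.num_classes)    # expected: 2
+```
+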
+```{note}
+For a more detailed description of config, please refer to [here](../user_guides/config.md).
+```
+
+## Browse the Dataset
+
+Before we start the training, we can also visualize the frames processed by the training-time data transforms. It's quite simple: pass the config file we want to visualize to the [browse_dataset.py](/tools/analysis_tools/browse_dataset.py) script.
+
+```Bash
+python tools/visualizations/browse_dataset.py \
+ configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \
+ browse_out --mode pipeline
+```
+
+The transformed videos will be saved to `browse_out` folder.
+
+```{note}
+For details on the parameters and usage of this script, please refer to [here](../useful_tools.md).
+```
+
+```{tip}
+In addition to satisfying our curiosity, visualization can also help us check the parts that may affect the model's performance before training, such as problems in configs, datasets and data transforms.
+```
+
+We can further visualize the learning rate schedule to make sure that the config is as expected by running the following script:
+
+```Bash
+python tools/visualizations/vis_scheduler.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py
+```
+
+The training learning rate schedule will be displayed in a pop-up window.
+
+```{note}
+The learning rate is automatically scaled according to the actual batch size.
+```
+
+## Training
+
+Start the training by running the following command:
+
+```Bash
+python tools/train.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py
+```
+
+Depending on the system environment, MMAction2 will automatically use the best device for training. If a GPU is available, single-GPU training will be started by default. When you start to see the output of the losses, you have successfully started the training.
+
+```Bash
+03/24 16:36:15 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608
+03/24 16:36:15 - mmengine - INFO - Epoch(train) [1][8/8] lr: 1.5625e-04 eta: 0:00:15 time: 0.2151 data_time: 0.0845 memory: 1314 grad_norm: 8.5647 loss: 0.7267 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.7267
+03/24 16:36:16 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608
+03/24 16:36:16 - mmengine - INFO - Epoch(train) [2][8/8] lr: 1.5625e-04 eta: 0:00:12 time: 0.1979 data_time: 0.0717 memory: 1314 grad_norm: 8.4709 loss: 0.7130 top1_acc: 0.0000 top5_acc: 1.0000 loss_cls: 0.7130
+03/24 16:36:18 - mmengine - INFO - Exp name: tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb_20230324_163608
+03/24 16:36:18 - mmengine - INFO - Epoch(train) [3][8/8] lr: 1.5625e-04 eta: 0:00:10 time: 0.1691 data_time: 0.0478 memory: 1314 grad_norm: 8.2910 loss: 0.6900 top1_acc: 0.5000 top5_acc: 1.0000 loss_cls: 0.6900
+03/24 16:36:18 - mmengine - INFO - Saving checkpoint at 3 epochs
+03/24 16:36:19 - mmengine - INFO - Epoch(val) [3][1/1] acc/top1: 0.9000 acc/top5: 1.0000 acc/mean1: 0.9000data_time: 1.2716 time: 1.3658
+03/24 16:36:20 - mmengine - INFO - The best checkpoint with 0.9000 acc/top1 at 3 epoch is saved to best_acc/top1_epoch_3.pth.
+```
+
+Without extra configurations, model weights will be saved to `work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/`, while the logs will be stored in `work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/TIMESTAMP/`. Next, we just need to wait with some patience for training to finish.
+
+```{note}
+For advanced usage of training, such as CPU training, multi-GPU training, and cluster training, please refer to [Training and Testing](../user_guides/train_test.md).
+```
+
+## Testing
+
+After 10 epochs, we observe that TSN performs best in the 6th epoch, with `acc/top1` reaching 1.0000:
+
+```Bash
+03/24 16:36:25 - mmengine - INFO - Epoch(val) [6][1/1] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 1.0000data_time: 1.0210 time: 1.1091
+```
+
+```{note}
+The result is quite high because the model is pre-trained on the original Kinetics400 dataset; you may see a different result.
+```
+
+However, this value only reflects the validation performance of TSN on the mini Kinetics dataset, while test results are usually higher due to the additional augmentation in the test pipeline.
+
+Start testing:
+
+```Bash
+python tools/test.py configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py \
+ work_dirs/tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb/best_acc/top1_epoch_6.pth
+```
+
+And get the outputs like:
+
+```Bash
+03/24 17:00:59 - mmengine - INFO - Epoch(test) [10/10] acc/top1: 1.0000 acc/top5: 1.0000 acc/mean1: 0.9000data_time: 0.0420 time: 1.0795
+```
+
+The model achieves a top1 accuracy of 1.0000 on this dataset.
+
+```{note}
+For advanced usage of testing, such as CPU testing, multi-GPU testing, and cluster testing, please refer to [Training and Testing](../user_guides/train_test.md).
+```
diff --git a/docs/en/index.rst b/docs/en/index.rst
index 59e3e49b53..ed4062534e 100644
--- a/docs/en/index.rst
+++ b/docs/en/index.rst
@@ -7,23 +7,38 @@ You can switch between Chinese and English documents in the lower-left corner of
:maxdepth: 1
:caption: Get Started
- get_started.md
+ get_started/overview.md
+ get_started/installation.md
+ get_started/quick_run.md
+ get_started/guide_to_framework.md
+ get_started/contribution_guide.md
+ get_started/faq.md
.. toctree::
:maxdepth: 1
:caption: User Guides
- user_guides/1_config.md
- user_guides/2_data_prepare.md
- user_guides/3_inference.md
- user_guides/4_train_test.md
+ user_guides/Inference.md
+ user_guides/config.md
+ user_guides/train_test.md
+ user_guides/prepare_dataset.md
.. toctree::
:maxdepth: 1
- :caption: Useful Tools
+ :caption: Advanced Guides
- user_guides/useful_tools.md
- user_guides/visualization.md
+ advanced_guides/dataflow.md
+ advanced_guides/customize_models.md
+ advanced_guides/customize_dataset.md
+ advanced_guides/customize_pipeline.md
+ advanced_guides/customize_optimizer.md
+ advanced_guides/customize_logging.md
+ advanced_guides/deploy.md
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Advanced Guides
+ useful_tools.md
.. toctree::
:maxdepth: 1
@@ -31,24 +46,42 @@ You can switch between Chinese and English documents in the lower-left corner of
migration.md
+.. toctree::
+ :maxdepth: 1
+ :caption: API Reference
+
+ api.rst
+
.. toctree::
:maxdepth: 1
:caption: Model Zoo
- modelzoo.md
- recognition_models.md
- detection_models.md
- skeleton_models.md
- localization_models.md
+ model_zoo/modelzoo.md
+ model_zoo/recognition_models.md
+ model_zoo/detection_models.md
+ model_zoo/skeleton_models.md
+ model_zoo/localization_models.md
+
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Dataset Zoo
+
+ datasetzoo_overview.md
+ datasetzoo.md
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Projects
+
+ projectzoo.md
.. toctree::
:maxdepth: 1
:caption: Notes
- notes/contribution_guide.md
- notes/projects.md
+ notes/ecosystem.md
notes/changelog.md
- notes/faq.md
.. toctree::
:caption: Switch Language
diff --git a/docs/en/merge_docs.sh b/docs/en/merge_docs.sh
index aa2a9bebfd..0d3c90ef0e 100644
--- a/docs/en/merge_docs.sh
+++ b/docs/en/merge_docs.sh
@@ -1,8 +1,48 @@
#!/usr/bin/env bash
-## gather models
-cat ../../configs/localization/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Localization Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > localization_models.md
-cat ../../configs/recognition/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Recognition Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > recognition_models.md
-cat ../../configs/recognition_audio/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" >> recognition_models.md
-cat ../../configs/detection/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Spatio Temporal Action Detection Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > detection_models.md
-cat ../../configs/skeleton/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Skeleton-based Action Recognition Models' | sed 's/](\/docs\//](/g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/master/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > skeleton_models.md
+# gather models
+mkdir -p model_zoo
+cat ../../configs/localization/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Localization Models' | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/latest/=g' |sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/localization_models.md
+cat ../../configs/recognition/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Action Recognition Models' | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/latest/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/recognition_models.md
+cat ../../configs/recognition_audio/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/latest/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" >> model_zoo/recognition_models.md
+cat ../../configs/detection/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Spatio Temporal Action Detection Models' | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/latest/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/detection_models.md
+cat ../../configs/skeleton/*/README.md | sed "s/md#t/html#t/g" | sed "s/#/#&/" | sed '1i\# Skeleton-based Action Recognition Models' | sed 's/](\/docs\/en/](../g' | sed 's=](/=](https://github.com/open-mmlab/mmaction2/tree/latest/=g' | sed "s/getting_started.html##t/getting_started.html#t/g" > model_zoo/skeleton_models.md
+
+# gather projects
+# TODO: generate table of contents for project zoo
+cat ../../projects/README.md > projectzoo.md
+cat ../../projects/example_project/README.md >> projectzoo.md
+cat ../../projects/ctrgcn/README.md >> projectzoo.md
+cat ../../projects/msg3d/README.md >> projectzoo.md
+
+# gather datasets
+cat supported_datasets.md > datasetzoo.md
+cat ../../tools/data/*/README.md | sed 's/# Preparing/# /g' | sed 's/#/#&/' >> datasetzoo.md
+
+sed -i 's/(\/tools\/data\/activitynet\/README.md/(#activitynet/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/kinetics\/README.md/(#kinetics-400600700/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/mit\/README.md/(#moments-in-time/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/mmit\/README.md/(#multi-moments-in-time/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/sthv1\/README.md/(#something-something-v1/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/sthv2\/README.md/(#something-something-v2/g' datasetzoo.md
+sed -i "s/(\/tools\/data\/thumos14\/README.md/(#thumos14/g" datasetzoo.md
+sed -i 's/(\/tools\/data\/ucf101\/README.md/(#ucf-101/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/ucf101_24\/README.md/(#ucf101-24/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/jhmdb\/README.md/(#jhmdb/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/hvu\/README.md/(#hvu/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/hmdb51\/README.md/(#hmdb51/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/jester\/README.md/(#jester/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/ava\/README.md/(#ava/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/gym\/README.md/(#gym/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/omnisource\/README.md/(#omnisource/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/diving48\/README.md/(#diving48/g' datasetzoo.md
+sed -i 's/(\/tools\/data\/skeleton\/README.md/(#skeleton-dataset/g' datasetzoo.md
+
+cat prepare_data.md >> datasetzoo.md
+
+sed -i 's=](/=](https://github.com/open-mmlab/mmaction2/tree/latest/=g' *.md
+
+sed -i 's/](\/docs\/en\//](/g' datasetzoo.md
+sed -i 's/](\/docs\/en\//](/g' changelog.md
+sed -i 's/](\/docs\/en\//](..\//g' ./get_started/*.md
+sed -i 's/](\/docs\/en\//](..\//g' ./tutorials/*.md
diff --git a/docs/en/migration.md b/docs/en/migration.md
index 2917455f80..ea2ecac06c 100644
--- a/docs/en/migration.md
+++ b/docs/en/migration.md
@@ -4,10 +4,10 @@ MMAction2 1.x introduced major refactorings and modifications including some BC-
## New dependencies
-MMAction2 1.x depends on the following packages. You are recommended to prepare a new clean environment and install them according to [install tutorial](./get_started.md)
+MMAction2 1.x depends on the following packages. You are recommended to prepare a new clean environment and install them according to [install tutorial](./get_started/installation.md)
1. [MMEngine](https://github.com/open-mmlab/mmengine): MMEngine is a foundational library for training deep learning model introduced in OpenMMLab 2.0 architecture.
-2. [MMCV](https://github.com/open-mmlab/mmcv): MMCV is a foundational library for computer vision. MMAction2 1.x requires `mmcv>=2.0.0rc0` which is more compact and efficient than `mmcv-full==1.x`.
+2. [MMCV](https://github.com/open-mmlab/mmcv): MMCV is a foundational library for computer vision. MMAction2 1.x requires `mmcv>=2.0.0`, which is more compact and efficient than `mmcv-full==1.x`.
## Configuration files
diff --git a/docs/en/notes/changelog.md b/docs/en/notes/changelog.md
index bbf5b9ffbc..b4d785bc8a 100644
--- a/docs/en/notes/changelog.md
+++ b/docs/en/notes/changelog.md
@@ -1,5 +1,49 @@
# Changelog
+## 1.0.0 (4/6/2023)
+
+**Highlights**
+
+- Support RGB-PoseC3D(CVPR'2022).
+- Support training UniFormer V2(Arxiv'2022).
+- Support MSG3D(CVPR'2020) and CTRGCN(CVPR'2021) in projects.
+- Refactor and provide more user-friendly documentation.
+
+**New Features**
+
+- Support RGB-PoseC3D ([2182](https://github.com/open-mmlab/mmaction2/pull/2182))
+- Support training UniFormer V2 ([2221](https://github.com/open-mmlab/mmaction2/pull/2221))
+- Support MSG3D and CTRGCN in projects. ([2269](https://github.com/open-mmlab/mmaction2/pull/2269), [2291](https://github.com/open-mmlab/mmaction2/pull/2291))
+
+**Improvements**
+
+- Use MMEngine to calculate FLOPs ([2300](https://github.com/open-mmlab/mmaction2/pull/2300))
+- Speed up LFB training ([2294](https://github.com/open-mmlab/mmaction2/pull/2294))
+- Support multiprocessing on AVA evaluation ([2146](https://github.com/open-mmlab/mmaction2/pull/2146))
+- Add a demo for exporting spatial-temporal detection model to ONNX ([2225](https://github.com/open-mmlab/mmaction2/pull/2225))
+- Update spatial-temporal detection related folders ([2262](https://github.com/open-mmlab/mmaction2/pull/2262))
+
+**Bug Fixes**
+
+- Fix flip config of TSM for sth v1/v2 dataset ([#2247](https://github.com/open-mmlab/mmaction2/pull/2247))
+- Fix circle ci ([2336](https://github.com/open-mmlab/mmaction2/pull/2336), [2334](https://github.com/open-mmlab/mmaction2/pull/2334))
+- Fix accepting an unexpected argument local-rank in PyTorch 2.0 ([2320](https://github.com/open-mmlab/mmaction2/pull/2320))
+- Fix TSM config link ([2315](https://github.com/open-mmlab/mmaction2/pull/2315))
+- Fix numpy version requirement in CI ([2284](https://github.com/open-mmlab/mmaction2/pull/2284))
+- Fix NTU pose extraction script ([2246](https://github.com/open-mmlab/mmaction2/pull/2246))
+- Fix TSM-MobileNet V2 ([2332](https://github.com/open-mmlab/mmaction2/pull/2332))
+- Fix command bugs in localization tasks' README ([2244](https://github.com/open-mmlab/mmaction2/pull/2244))
+- Fix duplicate name in DecordInit and SampleAVAFrame ([2251](https://github.com/open-mmlab/mmaction2/pull/2251))
+- Fix channel order when showing video ([2308](https://github.com/open-mmlab/mmaction2/pull/2308))
+- Specify map_location to cpu when using \_load_checkpoint ([2252](https://github.com/open-mmlab/mmaction2/pull/2254))
+
+**Documentation**
+
+- Refactor and provide more user-friendly documentation ([2341](https://github.com/open-mmlab/mmaction2/pull/2341), [2312](https://github.com/open-mmlab/mmaction2/pull/2312), [2325](https://github.com/open-mmlab/mmaction2/pull/2325))
+- Add README_zh-CN ([2252](https://github.com/open-mmlab/mmaction2/pull/2252))
+- Add social networking links ([2294](https://github.com/open-mmlab/mmaction2/pull/2294))
+- Fix sthv2 dataset annotations preparation document ([2248](https://github.com/open-mmlab/mmaction2/pull/2248))
+
## 1.0.0rc3 (2/10/2023)
**Highlights**
diff --git a/docs/en/notes/projects.md b/docs/en/notes/ecosystem.md
similarity index 98%
rename from docs/en/notes/projects.md
rename to docs/en/notes/ecosystem.md
index f4bc5ac9e6..73b0fd6aaf 100644
--- a/docs/en/notes/projects.md
+++ b/docs/en/notes/ecosystem.md
@@ -1,4 +1,4 @@
-# Projects based on MMAction2
+# Ecosystem Projects based on MMAction2
There are many research works and projects built on MMAction2.
We list some of them as examples of how to extend MMAction2 for your own projects.
diff --git a/docs/en/notes/pytorch2.0.md b/docs/en/notes/pytorch2.0.md
new file mode 100644
index 0000000000..d50101490b
--- /dev/null
+++ b/docs/en/notes/pytorch2.0.md
@@ -0,0 +1,21 @@
+# PyTorch 2.0 Compatibility and Benchmark
+
+PyTorch introduced `torch.compile` in its 2.0 release. It compiles your model to speed up training and validation. We provide benchmark results and the compatibility status of typical models in MMAction2. Except for one model (MViT) that fails to compile, the performance of the other models remains consistent before and after compilation.
+
+| Config | compiled | Train time / iter (s) | GPU memory (M) | test metric |
+| ------------------------------------------------------------------------- | -------- | --------------------- | -------------- | ------------ |
+| tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb | False | 0.50 | 42537 | 36.55 |
+| tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb | True | 0.61 | 53149 | 36.72 |
+| timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb | False | 0.688 | 14263 | 77.69 |
+| timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb | True | 0.691 | 13863 | 77.57 |
+| stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d | False | 0.0305 | 1184 | 91.69 |
+| stgcn_8xb16-bone-u100-80e_ntu60-xsub-keypoint-2d | True | 0.0298 | 1273 | 91.64 |
+| slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint | False | 0.498 | 9581 | 93.6 |
+| slowonly_r50_8xb16-u48-240e_ntu60-xsub-keypoint | True | 0.505 | 11968 | 93.49 |
+| slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb | False | 0.17 | 8278 | 20.76 |
+| slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb | True | 0.1835 | 12004 | 21.67 |
+| swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb | False | 0.323 | 21651 | 78.90 |
+| swin-tiny-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb | True | 0.262 | 20905 | 78.70 |
+| slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb | False | 0.098 | 5777 | 75.12 |
+| slowonly_imagenet-pretrained-r50_8xb16-4x16x1-steplr-150e_kinetics400-rgb | True | 0.0942 | 7095 | 75.15 |
+| mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb | Fail | incompatible | incompatible | incompatible |
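+
+If you want to run a similar comparison yourself, recent MMEngine versions expose an experimental `compile` switch in the runner configuration. The snippet below is a rough sketch rather than the exact benchmark script used above; whether `cfg.compile` is honored depends on your MMEngine version, and the config path is assumed from the config name in the table.
+
+```python
+from mmengine.config import Config
+from mmengine.runner import Runner
+
+# path assumed from the config name in the table above
+cfg = Config.fromfile(
+    'configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x16-50e_sthv2-rgb.py')
+cfg.work_dir = './work_dirs/compile_benchmark'  # hypothetical output directory
+cfg.compile = True  # ask the runner to wrap the model with torch.compile (MMEngine >= 0.7)
+
+runner = Runner.from_cfg(cfg)
+runner.train()
+```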
diff --git a/docs/en/stat.py b/docs/en/stat.py
index 80263653dc..b07d123fa8 100644
--- a/docs/en/stat.py
+++ b/docs/en/stat.py
@@ -16,7 +16,7 @@ def anchor(name):
# Count algorithms
-files = sorted(glob.glob('*_models.md'))
+files = sorted(glob.glob('model_zoo/*_models.md'))
# files = sorted(glob.glob('docs/*_models.md'))
stats = []
@@ -99,76 +99,76 @@ def anchor(name):
{msglist}
"""
-with open('modelzoo.md', 'w') as f:
+with open('model_zoo/modelzoo.md', 'w') as f:
f.write(modelzoo)
-# # Count datasets
-#
-# files = ['supported_datasets.md']
-# # files = sorted(glob.glob('docs/tasks/*.md'))
-#
-# datastats = []
-#
-# for f in files:
-# with open(f, 'r') as content_file:
-# content = content_file.read()
-#
-# # title
-# title = content.split('\n')[0].replace('#', '')
-#
-# # count papers
-# papers = set(
-# (papertype, titlecase.titlecase(paper.lower().strip()))
-# for (papertype, paper) in re.findall(
-# r'\s*\n.*?\btitle\s*=\s*{(.*?)}',
-# content, re.DOTALL))
-# # paper links
-# revcontent = '\n'.join(list(reversed(content.splitlines())))
-# paperlinks = {}
-# for _, p in papers:
-# print(p)
-# q = p.replace('\\', '\\\\').replace('?', '\\?')
-# paperlinks[p] = ', '.join(
-# (f'[{p.strip()} ->]({splitext(basename(f))[0]}.html#{anchor(p)})'
-# for p in re.findall(
-# rf'\btitle\s*=\s*{{\s*{q}\s*}}.*?\n## (.*?)\s*[,;]?\s*\n',
-# revcontent, re.DOTALL | re.IGNORECASE)))
-# print(' ', paperlinks[p])
-# paperlist = '\n'.join(
-# sorted(f' - [{t}] {x} ({paperlinks[x]})' for t, x in papers))
-#
-# statsmsg = f"""
-# ## [{title}]({f})
-#
-# * Number of papers: {len(papers)}
-# {paperlist}
-#
-# """
-#
-# datastats.append((papers, configs, ckpts, statsmsg))
-#
-# alldatapapers = func.reduce(lambda a, b: a.union(b),
-# [p for p, _, _, _ in datastats])
-#
-# # Summarize
-#
-# msglist = '\n'.join(x for _, _, _, x in stats)
-# datamsglist = '\n'.join(x for _, _, _, x in datastats)
-# papertypes, papercounts = np.unique([t for t, _ in alldatapapers],
-# return_counts=True)
-# countstr = '\n'.join(
-# [f' - {t}: {c}' for t, c in zip(papertypes, papercounts)])
-#
-# modelzoo = f"""
-# # Overview
-#
-# * Number of papers: {len(alldatapapers)}
-# {countstr}
-#
-# For supported action algorithms, see [modelzoo overview](modelzoo.md).
-#
-# {datamsglist}
-# """
-#
-# with open('datasets.md', 'w') as f:
-# f.write(modelzoo)
+# Count datasets
+
+files = ['datasetzoo.md']
+# files = sorted(glob.glob('docs/tasks/*.md'))
+
+datastats = []
+
+for f in files:
+ with open(f, 'r') as content_file:
+ content = content_file.read()
+
+ # title
+ title = content.split('\n')[0].replace('#', '')
+
+ # count papers
+ papers = set(
+ (papertype, titlecase.titlecase(paper.lower().strip()))
+ for (papertype, paper) in re.findall(
+ r'\s*\n.*?\btitle\s*=\s*{(.*?)}',
+ content, re.DOTALL))
+ # paper links
+ revcontent = '\n'.join(list(reversed(content.splitlines())))
+ paperlinks = {}
+ for _, p in papers:
+ print(p)
+ q = p.replace('\\', '\\\\').replace('?', '\\?')
+ paperlinks[p] = ', '.join(
+ (f'[{p.strip()} ->]({splitext(basename(f))[0]}.html#{anchor(p)})'
+ for p in re.findall(
+ rf'\btitle\s*=\s*{{\s*{q}\s*}}.*?\n## (.*?)\s*[,;]?\s*\n',
+ revcontent, re.DOTALL | re.IGNORECASE)))
+ print(' ', paperlinks[p])
+ paperlist = '\n'.join(
+ sorted(f' - [{t}] {x} ({paperlinks[x]})' for t, x in papers))
+
+ statsmsg = f"""
+## [{title}]({f})
+
+* Number of papers: {len(papers)}
+{paperlist}
+
+ """
+
+ datastats.append((papers, configs, ckpts, statsmsg))
+
+alldatapapers = func.reduce(lambda a, b: a.union(b),
+ [p for p, _, _, _ in datastats])
+
+# Summarize
+
+msglist = '\n'.join(x for _, _, _, x in stats)
+datamsglist = '\n'.join(x for _, _, _, x in datastats)
+papertypes, papercounts = np.unique([t for t, _ in alldatapapers],
+ return_counts=True)
+countstr = '\n'.join(
+ [f' - {t}: {c}' for t, c in zip(papertypes, papercounts)])
+
+datasetzoo = f"""
+# Overview
+
+* Number of papers: {len(alldatapapers)}
+{countstr}
+
+For supported action algorithms, see [modelzoo overview](modelzoo.md).
+
+{datamsglist}
+"""
+
+with open('datasetzoo_overview.md', 'w') as f:
+ f.write(datasetzoo)
diff --git a/docs/en/supported_datasets.md b/docs/en/supported_datasets.md
new file mode 100644
index 0000000000..42911fc8ff
--- /dev/null
+++ b/docs/en/supported_datasets.md
@@ -0,0 +1,36 @@
+# Supported Datasets
+
+- Action Recognition
+
+ - [UCF101](/tools/data/ucf101/README.md) \[ [Homepage](https://www.crcv.ucf.edu/research/data-sets/ucf101/) \].
+ - [HMDB51](/tools/data/hmdb51/README.md) \[ [Homepage](https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/) \].
+ - [Kinetics-\[400/600/700\]](/tools/data/kinetics/README.md) \[ [Homepage](https://deepmind.com/research/open-source/kinetics) \]
+ - [Something-Something V1](/tools/data/sthv1/README.md) \[ [Homepage](https://20bn.com/datasets/something-something/v1) \]
+ - [Something-Something V2](/tools/data/sthv2/README.md) \[ [Homepage](https://20bn.com/datasets/something-something) \]
+ - [Moments in Time](/tools/data/mit/README.md) \[ [Homepage](http://moments.csail.mit.edu/) \]
+ - [Multi-Moments in Time](/tools/data/mmit/README.md) \[ [Homepage](http://moments.csail.mit.edu/challenge_iccv_2019.html) \]
+ - [HVU](/tools/data/hvu/README.md) \[ [Homepage](https://github.com/holistic-video-understanding/HVU-Dataset) \]
+ - [Jester](/tools/data/jester/README.md) \[ [Homepage](https://developer.qualcomm.com/software/ai-datasets/jester) \]
+ - [GYM](/tools/data/gym/README.md) \[ [Homepage](https://sdolivia.github.io/FineGym/) \]
+ - [ActivityNet](/tools/data/activitynet/README.md) \[ [Homepage](http://activity-net.org/) \]
+ - [Diving48](/tools/data/diving48/README.md) \[ [Homepage](http://www.svcl.ucsd.edu/projects/resound/dataset.html) \]
+ - [OmniSource](/tools/data/omnisource/README.md) \[ [Homepage](https://kennymckormick.github.io/omnisource/) \]
+
+- Temporal Action Detection
+
+ - [ActivityNet](/tools/data/activitynet/README.md) \[ [Homepage](http://activity-net.org/) \]
+ - [THUMOS14](/tools/data/thumos14/README.md) \[ [Homepage](https://www.crcv.ucf.edu/THUMOS14/download.html) \]
+
+- Spatial Temporal Action Detection
+
+ - [AVA](/tools/data/ava/README.md) \[ [Homepage](https://research.google.com/ava/index.html) \]
+ - [UCF101-24](/tools/data/ucf101_24/README.md) \[ [Homepage](http://www.thumos.info/download.html) \]
+ - [JHMDB](/tools/data/jhmdb/README.md) \[ [Homepage](http://jhmdb.is.tue.mpg.de/) \]
+
+- Skeleton-based Action Recognition
+
+ - [PoseC3D Skeleton Dataset](/tools/data/skeleton/README.md) \[ [Homepage](https://kennymckormick.github.io/posec3d/) \]
+
+The supported datasets are listed above.
+We provide shell scripts for data preparation under the path `$MMACTION2/tools/data/`.
+Below are the detailed tutorials of data preparation for each dataset.
diff --git a/docs/en/switch_language.md b/docs/en/switch_language.md
index 0009eafa9e..80cf0dc571 100644
--- a/docs/en/switch_language.md
+++ b/docs/en/switch_language.md
@@ -1,3 +1,3 @@
-## English
+## English
## 简体中文
diff --git a/docs/en/user_guides/useful_tools.md b/docs/en/useful_tools.md
similarity index 98%
rename from docs/en/user_guides/useful_tools.md
rename to docs/en/useful_tools.md
index 2fe3b1977a..943303b82c 100644
--- a/docs/en/user_guides/useful_tools.md
+++ b/docs/en/useful_tools.md
@@ -1,4 +1,4 @@
-# Other Useful Tools
+# Useful Tools
Apart from training/testing scripts, We provide lots of useful tools under the `tools/` directory.
@@ -6,7 +6,7 @@ Apart from training/testing scripts, We provide lots of useful tools under the `
-- [Other Useful Tools](#other-useful-tools)
+- [Useful Tools](#useful-tools)
- [Useful Tools Link](#useful-tools-link)
- [Model Conversion](#model-conversion)
- [Prepare a model for publishing](#prepare-a-model-for-publishing)
diff --git a/docs/en/user_guides/2_data_prepare.md b/docs/en/user_guides/2_data_prepare.md
deleted file mode 100644
index e3bcc9f0e0..0000000000
--- a/docs/en/user_guides/2_data_prepare.md
+++ /dev/null
@@ -1,152 +0,0 @@
-# Tutorial 2: Prepare Datasets
-
-We provide some tips for MMAction2 data preparation in this file.
-
-
-
-- [Notes on Video Data Format](#notes-on-video-data-format)
-- [Getting Data](#getting-data)
- - [Prepare videos](#prepare-videos)
- - [Extract frames](#extract-frames)
- - [Alternative to denseflow](#alternative-to-denseflow)
- - [Generate file list](#generate-file-list)
- - [Prepare audio](#prepare-audio)
-
-
-
-## Notes on Video Data Format
-
-MMAction2 supports two types of data format: raw frames and video. The former is widely used in previous projects such as [TSN](https://github.com/yjxiong/temporal-segment-networks).
-This is fast when SSD is available but fails to scale to the fast-growing datasets.
-(For example, the newest edition of [Kinetics](https://www.deepmind.com/open-source/kinetics) has 650K videos and the total frames will take up several TBs.)
-The latter saves much space but has to do the computation intensive video decoding at execution time.
-To make video decoding faster, we support several efficient video loading libraries, such as [decord](https://github.com/zhreshold/decord), [PyAV](https://github.com/PyAV-Org/PyAV), etc.
-
-## Getting Data
-
-The following guide is helpful when you want to experiment with custom dataset.
-Similar to the datasets stated above, it is recommended organizing in `$MMACTION2/data/$DATASET`.
-
-### Prepare videos
-
-Please refer to the official website and/or the official script to prepare the videos.
-Note that the videos should be arranged in either
-
-- A two-level directory organized by `${CLASS_NAME}/${VIDEO_ID}`, which is recommended to be used for action recognition datasets (such as UCF101 and Kinetics)
-
-- A single-level directory, which is recommended to be used for action detection datasets or those with multiple annotations per video (such as THUMOS14).
-
-### Extract frames
-
-To extract both frames and optical flow, you can use the tool [denseflow](https://github.com/open-mmlab/denseflow) we wrote.
-Since different frame extraction tools produce different number of frames,
-it is beneficial to use the same tool to do both frame extraction and the flow computation, to avoid mismatching of frame counts.
-
-```shell
-python build_rawframes.py ${SRC_FOLDER} ${OUT_FOLDER} [--task ${TASK}] [--level ${LEVEL}] \
- [--num-worker ${NUM_WORKER}] [--flow-type ${FLOW_TYPE}] [--out-format ${OUT_FORMAT}] \
- [--ext ${EXT}] [--new-width ${NEW_WIDTH}] [--new-height ${NEW_HEIGHT}] [--new-short ${NEW_SHORT}] \
- [--resume] [--use-opencv] [--mixed-ext]
-```
-
-- `SRC_FOLDER`: Folder of the original video.
-- `OUT_FOLDER`: Root folder where the extracted frames and optical flow store.
-- `TASK`: Extraction task indicating which kind of frames to extract. Allowed choices are `rgb`, `flow`, `both`.
-- `LEVEL`: Directory level. 1 for the single-level directory or 2 for the two-level directory.
-- `NUM_WORKER`: Number of workers to build rawframes.
-- `FLOW_TYPE`: Flow type to extract, e.g., `None`, `tvl1`, `warp_tvl1`, `farn`, `brox`.
-- `OUT_FORMAT`: Output format for extracted frames, e.g., `jpg`, `h5`, `png`.
-- `EXT`: Video file extension, e.g., `avi`, `mp4`.
-- `NEW_WIDTH`: Resized image width of output.
-- `NEW_HEIGHT`: Resized image height of output.
-- `NEW_SHORT`: Resized image short side length keeping ratio.
-- `--resume`: Whether to resume optical flow extraction instead of overwriting.
-- `--use-opencv`: Whether to use OpenCV to extract rgb frames.
-- `--mixed-ext`: Indicate whether process video files with mixed extensions.
-
-The recommended practice is
-
-1. set `$OUT_FOLDER` to be a folder located in SSD.
-2. symlink the link `$OUT_FOLDER` to `$MMACTION2/data/$DATASET/rawframes`.
-3. set `new-short` instead of using `new-width` and `new-height`.
-
-```shell
-ln -s ${YOUR_FOLDER} $MMACTION2/data/$DATASET/rawframes
-```
-
-#### Alternative to denseflow
-
-In case your device doesn't fulfill the installation requirement of [denseflow](https://github.com/open-mmlab/denseflow)(like Nvidia driver version), or you just want to see some quick demos about flow extraction, we provide a python script `tools/misc/flow_extraction.py` as an alternative to denseflow. You can use it for rgb frames and optical flow extraction from one or several videos. Note that the speed of the script is much slower than denseflow, since it runs optical flow algorithms on CPU.
-
-```shell
-python tools/misc/flow_extraction.py --input ${INPUT} [--prefix ${PREFIX}] [--dest ${DEST}] [--rgb-tmpl ${RGB_TMPL}] \
- [--flow-tmpl ${FLOW_TMPL}] [--start-idx ${START_IDX}] [--method ${METHOD}] [--bound ${BOUND}] [--save-rgb]
-```
-
-- `INPUT`: Videos for frame extraction, can be single video or a video list, the video list should be a txt file and just consists of filenames without directories.
-- `PREFIX`: The prefix of input videos, used when input is a video list.
-- `DEST`: The destination to save extracted frames.
-- `RGB_TMPL`: The template filename of rgb frames.
-- `FLOW_TMPL`: The template filename of flow frames.
-- `START_IDX`: The start index of extracted frames.
-- `METHOD`: The method used to generate flow.
-- `BOUND`: The maximum of optical flow.
-- `SAVE_RGB`: Also save extracted rgb frames.
-
-### Generate file list
-
-We provide a convenient script to generate annotation file list. You can use the following command to generate file lists given extracted frames / downloaded videos.
-
-```shell
-cd $MMACTION2
-python tools/data/build_file_list.py ${DATASET} ${SRC_FOLDER} [--rgb-prefix ${RGB_PREFIX}] \
- [--flow-x-prefix ${FLOW_X_PREFIX}] [--flow-y-prefix ${FLOW_Y_PREFIX}] [--num-split ${NUM_SPLIT}] \
- [--subset ${SUBSET}] [--level ${LEVEL}] [--format ${FORMAT}] [--out-root-path ${OUT_ROOT_PATH}] \
- [--seed ${SEED}] [--shuffle]
-```
-
-- `DATASET`: Dataset to be prepared, e.g., `ucf101`, `kinetics400`, `thumos14`, `sthv1`, `sthv2`, etc.
-- `SRC_FOLDER`: Folder of the corresponding data format:
- - "$MMACTION2/data/$DATASET/rawframes" if `--format rawframes`.
- - "$MMACTION2/data/$DATASET/videos" if `--format videos`.
-- `RGB_PREFIX`: Name prefix of rgb frames.
-- `FLOW_X_PREFIX`: Name prefix of x flow frames.
-- `FLOW_Y_PREFIX`: Name prefix of y flow frames.
-- `NUM_SPLIT`: Number of split to file list.
-- `SUBSET`: Subset to generate file list. Allowed choice are `train`, `val`, `test`.
-- `LEVEL`: Directory level. 1 for the single-level directory or 2 for the two-level directory.
-- `FORMAT`: Source data format to generate file list. Allowed choices are `rawframes`, `videos`.
-- `OUT_ROOT_PATH`: Root path for output
-- `SEED`: Random seed.
-- `--shuffle`: Whether to shuffle the file list.
-
-### Prepare audio
-
-We also provide a simple script for audio waveform extraction and mel-spectrogram generation.
-
-```shell
-cd $MMACTION2
-python tools/data/extract_audio.py ${ROOT} ${DST_ROOT} [--ext ${EXT}] [--num-workers ${N_WORKERS}] \
- [--level ${LEVEL}]
-```
-
-- `ROOT`: The root directory of the videos.
-- `DST_ROOT`: The destination root directory of the audios.
-- `EXT`: Extension of the video files. e.g., `mp4`.
-- `N_WORKERS`: Number of processes to be used.
-
-After extracting audios, you are free to decode and generate the spectrogram on-the-fly such as [this](/configs/recognition_audio/resnet/tsn_r18_8xb320-64x1x1-100e_kinetics400-audio.py). As for the annotations, you can directly use those of the rawframes as long as you keep the relative position of audio files same as the rawframes directory. However, extracting spectrogram on-the-fly is slow and bad for prototype iteration. Therefore, we also provide a script (and many useful tools to play with) for you to generation spectrogram off-line.
-
-```shell
-cd $MMACTION2
-python tools/data/build_audio_features.py ${AUDIO_HOME_PATH} ${SPECTROGRAM_SAVE_PATH} [--level ${LEVEL}] \
- [--ext $EXT] [--num-workers $N_WORKERS] [--part $PART]
-```
-
-- `AUDIO_HOME_PATH`: The root directory of the audio files.
-- `SPECTROGRAM_SAVE_PATH`: The destination root directory of the audio features.
-- `EXT`: Extension of the audio files. e.g., `m4a`.
-- `N_WORKERS`: Number of processes to be used.
-- `PART`: Determines how many parts to be splited and which part to run. e.g., `2/5` means splitting all files into 5-fold and executing the 2nd part. This is useful if you have several machines.
-
-The annotations for audio spectrogram features are identical to those of rawframes. You can simply make a copy of `dataset_[train/val]_list_rawframes.txt` and rename it as `dataset_[train/val]_list_audio_feature.txt`
diff --git a/docs/en/user_guides/3_inference.md b/docs/en/user_guides/Inference.md
similarity index 95%
rename from docs/en/user_guides/3_inference.md
rename to docs/en/user_guides/Inference.md
index 11b07f0519..20e14b4ee0 100644
--- a/docs/en/user_guides/3_inference.md
+++ b/docs/en/user_guides/Inference.md
@@ -1,9 +1,9 @@
-# Tutorial 3: Inference with existing models
+# Inference with existing models
MMAction2 provides pre-trained models for video understanding in [Model Zoo](../modelzoo.md).
This note will show **how to use existing models to inference on given video**.
-As for how to test existing models on standard datasets, please see this [guide](./4_train_test.md#test)
+As for how to test existing models on standard datasets, please see this [guide](./train_test.md#test)
## Inference on a given video
diff --git a/docs/en/user_guides/1_config.md b/docs/en/user_guides/config.md
similarity index 98%
rename from docs/en/user_guides/1_config.md
rename to docs/en/user_guides/config.md
index 308ec70f17..d847ae9557 100644
--- a/docs/en/user_guides/1_config.md
+++ b/docs/en/user_guides/config.md
@@ -1,4 +1,4 @@
-# Tutorial 1: Learn about Configs
+# Learn about Configs
We use python files as configs, incorporate modular and inheritance design into our config system, which is convenient to conduct various experiments.
You can find all the provided configs under `$MMAction2/configs`. If you wish to inspect the config file,
@@ -6,12 +6,13 @@ you may run `python tools/analysis_tools/print_config.py /PATH/TO/CONFIG` to see
-- [Modify config through script arguments](#modify-config-through-script-arguments)
-- [Config File Structure](#config-file-structure)
-- [Config File Naming Convention](#config-file-naming-convention)
- - [Config System for Action Recognition](#config-system-for-action-recognition)
- - [Config System for Spatio-Temporal Action Detection](#config-system-for-spatio-temporal-action-detection)
- - [Config System for Action localization](#config-system-for-action-localization)
+- [Learn about Configs](#learn-about-configs)
+ - [Modify config through script arguments](#modify-config-through-script-arguments)
+ - [Config File Structure](#config-file-structure)
+ - [Config File Naming Convention](#config-file-naming-convention)
+ - [Config System for Action Recognition](#config-system-for-action-recognition)
+ - [Config System for Spatio-Temporal Action Detection](#config-system-for-spatio-temporal-action-detection)
+ - [Config System for Action localization](#config-system-for-action-localization)
diff --git a/docs/en/user_guides/prepare_dataset.md b/docs/en/user_guides/prepare_dataset.md
new file mode 100644
index 0000000000..cd4225aaa0
--- /dev/null
+++ b/docs/en/user_guides/prepare_dataset.md
@@ -0,0 +1,263 @@
+# Prepare Dataset
+
+MMAction2 supports many existing datasets. In this chapter, we will lead you to prepare datasets for MMAction2.
+
+- [Prepare Dataset](#prepare-dataset)
+ - [Notes on Video Data Format](#notes-on-video-data-format)
+ - [Use built-in datasets](#use-built-in-datasets)
+ - [Use a custom dataset](#use-a-custom-dataset)
+ - [Action Recognition](#action-recognition)
+ - [Skeleton-based Action Recognition](#skeleton-based-action-recognition)
+ - [Spatio-temporal Action Detection](#spatio-temporal-action-detection)
+ - [Temporal Action Localization](#temporal-action-localization)
+ - [Use mixed datasets for training](#use-mixed-datasets-for-training)
+ - [Repeat dataset](#repeat-dataset)
+ - [Browse dataset](#browse-dataset)
+
+## Notes on Video Data Format
+
+MMAction2 supports two types of data formats: raw frames and video. The former is widely used in previous projects such as [TSN](https://github.com/yjxiong/temporal-segment-networks).
+This is fast when SSD is available but fails to scale to the fast-growing datasets.
+(For example, the newest edition of [Kinetics](https://www.deepmind.com/open-source/kinetics) has 650K videos and the total frames will take up several TBs.)
+The latter saves much space but has to do the computation intensive video decoding at execution time.
+To make video decoding faster, we support several efficient video loading libraries, such as [decord](https://github.com/zhreshold/decord), [PyAV](https://github.com/PyAV-Org/PyAV), etc.
+
+## Use built-in datasets
+
+MMAction2 already supports many datasets. We provide shell scripts for data preparation under the path `$MMACTION2/tools/data/`; please refer to the [supported datasets](../supported_datasets.md) for details on preparing specific datasets.
+
+## Use a custom dataset
+
+The simplest way is to convert your dataset to existing dataset formats:
+
+- `RawFrameDataset` and `VideoDataset` for [Action Recognition](#action-recognition)
+- `PoseDataset` for [Skeleton-based Action Recognition](#skeleton-based-action-recognition)
+- `AVADataset` for [Spatio-temporal Action Detection](#spatio-temporal-action-detection)
+- `ActivityNetDataset` for [Temporal Action Localization](#temporal-action-localization)
+
+After the data pre-processing, the users need to further modify the config files to use the dataset.
+Here is an example of using a custom dataset in rawframe format.
+
+In `configs/task/method/my_custom_config.py`:
+
+```python
+...
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root = 'path/to/your/root'
+data_root_val = 'path/to/your/root_val'
+ann_file_train = 'data/custom/custom_train_list.txt'
+ann_file_val = 'data/custom/custom_val_list.txt'
+ann_file_test = 'data/custom/custom_val_list.txt'
+...
+data = dict(
+ videos_per_gpu=32,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ ann_file=ann_file_train,
+ ...),
+ val=dict(
+ type=dataset_type,
+ ann_file=ann_file_val,
+ ...),
+ test=dict(
+ type=dataset_type,
+ ann_file=ann_file_test,
+ ...))
+...
+```
+
+### Action Recognition
+
+There are two kinds of annotation files for action recognition.
+
+- rawframe annotation for `RawFrameDataset`
+
+ The annotation of a rawframe dataset is a text file with multiple lines,
+ and each line indicates `frame_directory` (relative path) of a video,
+ `total_frames` of a video and the `label` of a video, which are split by a whitespace.
+
+ Here is an example.
+
+ ```
+ some/directory-1 163 1
+ some/directory-2 122 1
+ some/directory-3 258 2
+ some/directory-4 234 2
+ some/directory-5 295 3
+ some/directory-6 121 3
+ ```
+
+- video annotation for `VideoDataset`
+
+  The annotation of a video dataset is a text file with multiple lines,
+  and each line indicates a sample video with the `filepath` (relative path) and `label`,
+  which are split by a whitespace (a minimal sketch for generating such a list follows the examples below).
+
+ Here is an example.
+
+ ```
+ some/path/000.mp4 1
+ some/path/001.mp4 1
+ some/path/002.mp4 2
+ some/path/003.mp4 2
+ some/path/004.mp4 3
+ some/path/005.mp4 3
+ ```
+
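+The sketch below shows one way to generate such a video annotation list from a two-level directory layout `${CLASS_NAME}/${VIDEO_ID}`. It is only an illustration with placeholder paths, not an official tool (MMAction2 also ships `tools/data/build_file_list.py` for the supported datasets).
+
+```python
+# A minimal sketch (not an official tool): write a `VideoDataset`-style
+# annotation list from a two-level directory layout `${CLASS_NAME}/${VIDEO_ID}`.
+import os
+
+
+def build_video_list(video_root: str, out_file: str) -> None:
+    # folder names become integer labels in alphabetical order
+    classes = sorted(os.listdir(video_root))
+    with open(out_file, 'w') as f:
+        for label, cls in enumerate(classes):
+            for name in sorted(os.listdir(os.path.join(video_root, cls))):
+                # each line: relative video path and integer label, split by a whitespace
+                f.write(f'{cls}/{name} {label}\n')
+
+
+build_video_list('data/custom/videos', 'data/custom/custom_train_list.txt')
+```
+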
+### Skeleton-based Action Recognition
+
+The task recognizes the action class based on the skeleton sequence (time sequence of keypoints). We provide some methods to build your custom skeleton dataset.
+
+- Build from RGB video data
+
+ You need to extract keypoints data from video and convert it to a supported format, we provide a [tutorial](/configs/skeleton/posec3d/custom_dataset_training.md) with detailed instructions.
+
+- Build from existing keypoint data
+
+  Assuming that you already have keypoint data in COCO format, you can gather it into a pickle file (a minimal sketch for building such a file is given after this list).
+
+ Each pickle file corresponds to an action recognition dataset. The content of a pickle file is a dictionary with two fields: `split` and `annotations`
+
+  1. Split: The value of the `split` field is a dictionary: the keys are the split names, while the values are lists of video identifiers that belong to the specific split.
+ 2. Annotations: The value of the `annotations` field is a list of skeleton annotations, each skeleton annotation is a dictionary, containing the following fields:
+ - `frame_dir` (str): The identifier of the corresponding video.
+ - `total_frames` (int): The number of frames in this video.
+ - `img_shape` (tuple\[int\]): The shape of a video frame, a tuple with two elements, in the format of `(height, width)`. Only required for 2D skeletons.
+ - `original_shape` (tuple\[int\]): Same as `img_shape`.
+ - `label` (int): The action label.
+ - `keypoint` (np.ndarray, with shape `[M x T x V x C]`): The keypoint annotation.
+ - M: number of persons;
+ - T: number of frames (same as `total_frames`);
+ - V: number of keypoints (25 for NTURGB+D 3D skeleton, 17 for CoCo, 18 for OpenPose, etc. );
+ - C: number of dimensions for keypoint coordinates (C=2 for 2D keypoint, C=3 for 3D keypoint).
+ - `keypoint_score` (np.ndarray, with shape `[M x T x V]`): The confidence score of keypoints. Only required for 2D skeletons.
+
+ Here is an example:
+
+ ```
+  {
+      "split":
+          {
+              'xsub_train':
+                  ['S001C001P001R001A001', ...],
+              'xsub_val':
+                  ['S001C001P003R001A001', ...],
+              ...
+          },
+
+      "annotations":
+          [
+              {
+                  'frame_dir': 'S001C001P001R001A001',
+                  'label': 0,
+                  'img_shape': (1080, 1920),
+                  'original_shape': (1080, 1920),
+                  'total_frames': 103,
+                  'keypoint': array([[[[1032. , 334.8], ...]]]),
+                  'keypoint_score': array([[[0.934 , 0.9766, ...]]])
+              },
+              {
+                  'frame_dir': 'S001C001P003R001A001',
+                  ...
+              },
+              ...
+          ]
+  }
+ ```
+
+  Supporting other keypoint formats requires further modification; please refer to [customize dataset](../advanced_guides/customize_dataset.md).
+
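+A minimal sketch of assembling such a pickle file is shown below. `iter_my_keypoints` is a hypothetical loader standing in for however you obtain the keypoints, and the output path is a placeholder; it assumes 2D keypoints of shape `[M, T, V, C]` and scores of shape `[M, T, V]` for each video.
+
+```python
+import pickle
+
+import numpy as np
+
+
+def iter_my_keypoints():
+    """Hypothetical loader yielding (video_id, label, keypoints, scores)."""
+    kpts = np.random.rand(1, 103, 17, 2).astype(np.float32)   # [M, T, V, C]
+    scores = np.random.rand(1, 103, 17).astype(np.float32)    # [M, T, V]
+    yield 'S001C001P001R001A001', 0, kpts, scores
+
+
+annotations = []
+for video_id, label, kpts, scores in iter_my_keypoints():
+    annotations.append(dict(
+        frame_dir=video_id,
+        label=label,
+        img_shape=(1080, 1920),
+        original_shape=(1080, 1920),
+        total_frames=kpts.shape[1],
+        keypoint=kpts,
+        keypoint_score=scores))
+
+data = dict(
+    # put every video id into a single 'train' split here; add a 'val' split as needed
+    split=dict(train=[anno['frame_dir'] for anno in annotations]),
+    annotations=annotations)
+
+with open('data/my_skeleton_dataset.pkl', 'wb') as f:  # hypothetical output path
+    pickle.dump(data, f)
+```
+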
+### Spatio-temporal Action Detection
+
+MMAction2 supports the task based on `AVADataset`. The annotation contains groundtruth bbox and proposal bbox.
+
+- groundtruth bbox
+  The groundtruth bbox is a CSV file with multiple lines, and each line is a detection sample of one frame, in the following format:
+
+  video_identifier, time_stamp, lt_x, lt_y, rb_x, rb_y, label, entity_id
+  each field means:
+  `video_identifier` : The identifier of the corresponding video
+  `time_stamp`: The time stamp of the current frame
+  `lt_x`: The normalized x-coordinate of the left top point of bounding box
+  `lt_y`: The normalized y-coordinate of the left top point of bounding box
+  `rb_x`: The normalized x-coordinate of the right bottom point of bounding box
+  `rb_y`: The normalized y-coordinate of the right bottom point of bounding box
+ `label`: The action label
+ `entity_id`: a unique integer allowing this box to be linked to other boxes depicting the same person in adjacent frames of this video
+
+ Here is an example.
+
+ ```
+ _-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,12,0
+ _-Z6wFjXtGQ,0902,0.063,0.049,0.524,0.996,74,0
+ ...
+ ```
+
+- proposal bbox
+  The proposal bbox is a pickle file generated by a person detector, and it usually needs to be fine-tuned on the target dataset. The pickle file contains a dict with the data structure below (see the sketch after this list for one way to build such a file):
+
+  `{'video_identifier,time_stamp': bbox_info}`
+
+  video_identifier (str): The identifier of the corresponding video
+  time_stamp (int): The time stamp of the current frame
+  bbox_info (np.ndarray, with shape `[n, 5]`): The detected bboxes in the format `[x1, y1, x2, y2, score]`, where x1, y1, x2, y2 are normalized with respect to the frame size and range between 0.0 and 1.0.
+
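+The sketch below writes a proposal pickle with the structure described above; the box values and the output path are illustrative placeholders.
+
+```python
+import pickle
+
+import numpy as np
+
+# key: 'video_identifier,time_stamp'; value: [n, 5] array of
+# [x1, y1, x2, y2, score] with coordinates normalized to the frame size
+proposals = {
+    '_-Z6wFjXtGQ,0902': np.array(
+        [[0.063, 0.049, 0.524, 0.996, 0.98]], dtype=np.float32),
+}
+
+with open('data/ava/my_proposals.pkl', 'wb') as f:  # hypothetical output path
+    pickle.dump(proposals, f)
+```
+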
+### Temporal Action Localization
+
+We support Temporal Action Localization based on `ActivityNetDataset`. The annotation of ActivityNet dataset is a json file. Each key is a video name and the corresponding value is the meta data and annotation for the video.
+
+Here is an example.
+
+```
+{
+ "video1": {
+ "duration_second": 211.53,
+ "duration_frame": 6337,
+ "annotations": [
+ {
+ "segment": [
+ 30.025882995319815,
+ 205.2318595943838
+ ],
+ "label": "Rock climbing"
+ }
+ ],
+ "feature_frame": 6336,
+ "fps": 30.0,
+ "rfps": 29.9579255898
+ },
+ "video2": {...
+ }
+ ...
+}
+```
+
+## Use mixed datasets for training
+
+MMAction2 also supports mixing datasets for training. Currently, it supports repeating a dataset.
+
+### Repeat dataset
+
+We use `RepeatDataset` as a wrapper to repeat the dataset. For example, suppose the original dataset is `Dataset_A`;
+to repeat it, the config looks like the following:
+
+```python
+dataset_A_train = dict(
+ type='RepeatDataset',
+ times=N,
+ dataset=dict( # This is the original config of Dataset_A
+ type='Dataset_A',
+ ...
+ pipeline=train_pipeline
+ )
+ )
+```
+
+## Browse dataset
+
+coming soon...
diff --git a/docs/en/user_guides/4_train_test.md b/docs/en/user_guides/train_test.md
similarity index 99%
rename from docs/en/user_guides/4_train_test.md
rename to docs/en/user_guides/train_test.md
index a67448fde3..653fccdc34 100644
--- a/docs/en/user_guides/4_train_test.md
+++ b/docs/en/user_guides/train_test.md
@@ -1,4 +1,4 @@
-# Tutorial 4: Training and Test
+# Training and Test
## Training
diff --git a/docs/en/user_guides/visualization.md b/docs/en/user_guides/visualization.md
deleted file mode 100644
index 2d4518bcdb..0000000000
--- a/docs/en/user_guides/visualization.md
+++ /dev/null
@@ -1,20 +0,0 @@
-# Visualization Tools
-
-## Visualize dataset
-
-You can use `tools/analysis_tools/browse_dataset.py` to visualize video datasets:
-
-```bash
-python tools/analysis_tools/browse_dataset.py ${CONFIG_FILE} [ARGS]
-```
-
-| ARGS | Description |
-| ------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `CONFIG_FILE` | The path to the config file. |
-| `--output-dir OUTPUT_DIR` | If there is no display interface, you can save the visualization results to `OUTPUT_DIR`. Defaults to None |
-| `--show-frames` | Display the frames of the video if you have the display interface. Defaults to False. |
-| `--phase PHASE` | Phase of the dataset to visualize, accept `train`, `test` and `val`. Defaults to `train`. |
-| `--show-number SHOW_NUMBER` | Number of images selected to visualize, must bigger than 0. Jf the number is bigger than length of dataset, show all the images in dataset. Defaults to "sys.maxsize", show all images in dataset |
-| `--show-interval SHOW_INTERVAL` | The interval of show (s). Defaults to 2. |
-| `--mode MODE` | Display mode: display original videos or transformed videos. `original` means show videos load from disk while `transformed` means to show videos after transformed. Defaults to `transformed`. |
-| `--cfg-options CFG_OPTIONS` | Override some settings in the used config, the key-value pair in xxx=yyy format will be merged into the config file. If the value to be overwritten is a list, it should be of the form of either `key="[a,b]"` or `key=a,b`. The argument also allows nested list/tuple values, e.g. `key="[(a,b),(c,d)]"`. Note that the quotation marks are necessary and that no white space is allowed. |
diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started.md
index b98358d166..51742edc72 100644
--- a/docs/zh_cn/get_started.md
+++ b/docs/zh_cn/get_started.md
@@ -46,7 +46,7 @@ conda install pytorch torchvision cpuonly -c pytorch
```shell
pip install -U openmim
-mim install mmengine 'mmcv>=2.0.0rc1'
+mim install mmengine 'mmcv>=2.0.0'
```
**第二步** 安装 MMAction2。
@@ -80,7 +80,7 @@ git checkout dev-1.x
直接使用 pip 安装即可。
```shell
-pip install "mmaction2>=1.0rc0"
+pip install "mmaction2>=1.0.0"
```
## 验证安装
@@ -158,7 +158,7 @@ MMCV 包含 C++ 和 CUDA 扩展,因此其对 PyTorch 的依赖比较复杂。
例如,以下命令安装为 PyTorch 1.10.x 和 CUDA 11.3 构建的 mmcv。
```shell
-pip install 'mmcv>=2.0.0rc1' -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html
+pip install 'mmcv>=2.0.0' -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html
```
### 在 CPU 环境中安装
diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst
index 59e3e49b53..2b69d6d2af 100644
--- a/docs/zh_cn/index.rst
+++ b/docs/zh_cn/index.rst
@@ -16,7 +16,7 @@ You can switch between Chinese and English documents in the lower-left corner of
user_guides/1_config.md
user_guides/2_data_prepare.md
user_guides/3_inference.md
- user_guides/4_train_test.md
+ user_guides/train_test.md
.. toctree::
:maxdepth: 1
diff --git a/docs/zh_cn/switch_language.md b/docs/zh_cn/switch_language.md
index 0009eafa9e..80cf0dc571 100644
--- a/docs/zh_cn/switch_language.md
+++ b/docs/zh_cn/switch_language.md
@@ -1,3 +1,3 @@
-## English
+## English
## 简体中文
diff --git a/docs/zh_cn/user_guides/3_inference.md b/docs/zh_cn/user_guides/3_inference.md
index 99433263df..14374ef432 100644
--- a/docs/zh_cn/user_guides/3_inference.md
+++ b/docs/zh_cn/user_guides/3_inference.md
@@ -3,7 +3,7 @@
MMAction2 在 [Model Zoo](../modelzoo.md) 中提供预训练的视频理解模型。
本教程将展示**如何使用现有模型对给定视频进行推理**。
-至于如何在标准数据集上测试现有模型,请参阅这该[指南](./4_train_test.md#test)
+至于如何在标准数据集上测试现有模型,请参阅该[指南](./train_test.md#test)
## 给定视频的推理
diff --git a/mmaction/apis/inference.py b/mmaction/apis/inference.py
index ac014d0350..d0a4c01501 100644
--- a/mmaction/apis/inference.py
+++ b/mmaction/apis/inference.py
@@ -104,7 +104,8 @@ def detection_inference(det_config: Union[str, Path, mmengine.Config],
frame_paths: List[str],
det_score_thr: float = 0.9,
det_cat_id: int = 0,
- device: Union[str, torch.device] = 'cuda:0') -> tuple:
+ device: Union[str, torch.device] = 'cuda:0',
+ with_score: bool = False) -> tuple:
"""Detect human boxes given frame paths.
Args:
@@ -117,6 +118,8 @@ def detection_inference(det_config: Union[str, Path, mmengine.Config],
det_cat_id (int): The category id for human detection. Defaults to 0.
device (Union[str, torch.device]): The desired device of returned
tensor. Defaults to ``'cuda:0'``.
+ with_score (bool): Whether to append detection score after box.
+ Defaults to False.
Returns:
List[np.ndarray]: List of detected human boxes.
@@ -141,10 +144,16 @@ def detection_inference(det_config: Union[str, Path, mmengine.Config],
det_data_sample: DetDataSample = inference_detector(model, frame_path)
pred_instance = det_data_sample.pred_instances.cpu().numpy()
bboxes = pred_instance.bboxes
+ scores = pred_instance.scores
# We only keep human detection bboxs with score larger
# than `det_score_thr` and category id equal to `det_cat_id`.
- bboxes = bboxes[np.logical_and(pred_instance.labels == det_cat_id,
- pred_instance.scores > det_score_thr)]
+ valid_idx = np.logical_and(pred_instance.labels == det_cat_id,
+ pred_instance.scores > det_score_thr)
+ bboxes = bboxes[valid_idx]
+ scores = scores[valid_idx]
+
+ if with_score:
+ bboxes = np.concatenate((bboxes, scores[:, None]), axis=-1)
results.append(bboxes)
data_samples.append(det_data_sample)
@@ -187,7 +196,7 @@ def pose_inference(pose_config: Union[str, Path, mmengine.Config],
print('Performing Human Pose Estimation for each frame')
for f, d in track_iter_progress(list(zip(frame_paths, det_results))):
pose_data_samples: List[PoseDataSample] \
- = inference_topdown(model, f, d, bbox_format='xyxy')
+ = inference_topdown(model, f, d[..., :4], bbox_format='xyxy')
pose_data_sample = merge_data_samples(pose_data_samples)
pose_data_sample.dataset_meta = model.dataset_meta
poses = pose_data_sample.pred_instances.to_dict()
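An illustrative sketch of what the new `with_score` flag changes for downstream consumers: with it enabled, each kept detection becomes a 5-element row `(x1, y1, x2, y2, score)`, which is why `pose_inference` now slices boxes back to `d[..., :4]`. The detector output below is fabricated for the example.

```python
import numpy as np

# Illustrative only: fake detector output standing in for `pred_instances`.
bboxes = np.array([[10., 20., 110., 220.], [5., 5., 50., 60.]])
scores = np.array([0.95, 0.40])
labels = np.array([0, 0])

det_score_thr, det_cat_id, with_score = 0.9, 0, True

# Keep human boxes whose score passes the threshold.
valid_idx = np.logical_and(labels == det_cat_id, scores > det_score_thr)
kept_boxes = bboxes[valid_idx]
kept_scores = scores[valid_idx]

if with_score:
    # Each row becomes (x1, y1, x2, y2, score); consumers that expect plain
    # xyxy boxes (e.g. pose_inference) slice them back with `[..., :4]`.
    kept_boxes = np.concatenate((kept_boxes, kept_scores[:, None]), axis=-1)

print(kept_boxes.shape)  # (1, 5)
```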
diff --git a/mmaction/datasets/pose_dataset.py b/mmaction/datasets/pose_dataset.py
index 52c2c0b668..a06a7f7c0d 100644
--- a/mmaction/datasets/pose_dataset.py
+++ b/mmaction/datasets/pose_dataset.py
@@ -1,10 +1,10 @@
# Copyright (c) OpenMMLab. All rights reserved.
-from typing import Callable, List, Optional, Union
+import os.path as osp
+from typing import Callable, Dict, List, Optional, Union
-from mmengine.fileio import exists, load
+import mmengine
from mmaction.registry import DATASETS
-from mmaction.utils import ConfigType
from .base import BaseActionDataset
@@ -21,38 +21,44 @@ class PoseDataset(BaseActionDataset):
Args:
ann_file (str): Path to the annotation file.
- pipeline (list): A sequence of data transforms.
- split (str, optional): The dataset split used. Only applicable to
- ``UCF`` or ``HMDB``. Allowed choices are ``train1``, ``test1``,
- ``train2``, ``test2``, ``train3``, ``test3``. Defaults to None.
- start_index (int): Specify a start index for frames in consideration of
- different filename format. Defaults to 0.
+ pipeline (list[dict | callable]): A sequence of data transforms.
+ split (str, optional): The dataset split used. For UCF101 and
+ HMDB51, allowed choices are 'train1', 'test1', 'train2',
+ 'test2', 'train3', 'test3'. For NTURGB+D, allowed choices
+ are 'xsub_train', 'xsub_val', 'xview_train', 'xview_val'.
+ For NTURGB+D 120, allowed choices are 'xsub_train',
+ 'xsub_val', 'xset_train', 'xset_val'. For FineGYM,
+ allowed choices are 'train', 'val'. Defaults to None.
"""
def __init__(self,
ann_file: str,
- pipeline: List[Union[ConfigType, Callable]],
+ pipeline: List[Union[Dict, Callable]],
split: Optional[str] = None,
- start_index: int = 0,
**kwargs) -> None:
- # split, applicable to ``ucf101`` or ``hmdb51``
self.split = split
super().__init__(
- ann_file,
- pipeline=pipeline,
- start_index=start_index,
- modality='Pose',
- **kwargs)
+ ann_file, pipeline=pipeline, modality='Pose', **kwargs)
- def load_data_list(self) -> List[dict]:
+ def load_data_list(self) -> List[Dict]:
"""Load annotation file to get skeleton information."""
assert self.ann_file.endswith('.pkl')
- exists(self.ann_file)
- data_list = load(self.ann_file)
+ mmengine.exists(self.ann_file)
+ data_list = mmengine.load(self.ann_file)
if self.split is not None:
- split, data = data_list['split'], data_list['annotations']
- identifier = 'filename' if 'filename' in data[0] else 'frame_dir'
- data_list = [x for x in data if x[identifier] in split[self.split]]
+ split, annos = data_list['split'], data_list['annotations']
+ identifier = 'filename' if 'filename' in annos[0] else 'frame_dir'
+ split = set(split[self.split])
+ data_list = [x for x in annos if x[identifier] in split]
+ # Sometimes we may need to load video from the file
+ if 'video' in self.data_prefix:
+ for item in data_list:
+ if 'filename' in item:
+ item['filename'] = osp.join(self.data_prefix['video'],
+ item['filename'])
+ if 'frame_dir' in item:
+ item['frame_dir'] = osp.join(self.data_prefix['video'],
+ item['frame_dir'])
return data_list
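A hedged sketch of the annotation layout the reworked `load_data_list` assumes: a pickle with `split` and `annotations` keys, plus an optional `video` entry in `data_prefix` used to prefix `frame_dir`/`filename`. The split name and paths below are illustrative, not taken from a real dataset.

```python
import os.path as osp

# Fabricated annotation dict standing in for the loaded .pkl file.
ann = {
    'split': {'xsub_train': ['S001C001P001R001A001']},
    'annotations': [
        {'frame_dir': 'S001C001P001R001A001', 'label': 0},
        {'frame_dir': 'S001C001P001R001A002', 'label': 1},
    ],
}

# Keep only samples listed in the requested split.
split = set(ann['split']['xsub_train'])
data_list = [x for x in ann['annotations'] if x['frame_dir'] in split]

# With `data_prefix=dict(video=...)`, frame_dir/filename entries are joined
# with the video root so RGB frames can be decoded alongside the skeletons.
data_prefix = {'video': 'data/nturgbd_videos'}
for item in data_list:
    item['frame_dir'] = osp.join(data_prefix['video'], item['frame_dir'])

print(data_list)  # one entry, frame_dir prefixed with the video root
```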
diff --git a/mmaction/datasets/transforms/__init__.py b/mmaction/datasets/transforms/__init__.py
index 198bd8c781..2b83c415f5 100644
--- a/mmaction/datasets/transforms/__init__.py
+++ b/mmaction/datasets/transforms/__init__.py
@@ -11,7 +11,8 @@
PyAVInit, RawFrameDecode, SampleAVAFrames, SampleFrames,
UniformSample, UntrimmedSampleFrames)
from .pose_transforms import (GeneratePoseTarget, GenSkeFeat, JointToBone,
- LoadKineticsPose, MergeSkeFeat, PadTo,
+ LoadKineticsPose, MergeSkeFeat, MMCompact,
+ MMDecode, MMUniformSampleFrames, PadTo,
PoseCompact, PoseDecode, PreNormalize2D,
PreNormalize3D, ToMotion, UniformSampleFrames)
from .processing import (AudioAmplify, CenterCrop, ColorJitter, Flip, Fuse,
@@ -36,5 +37,6 @@
'RandomCrop', 'RandomRescale', 'RandomResizedCrop', 'RawFrameDecode',
'Resize', 'SampleAVAFrames', 'SampleFrames', 'TenCrop', 'ThreeCrop',
'ToMotion', 'TorchVisionWrapper', 'Transpose', 'UniformSample',
- 'UniformSampleFrames', 'UntrimmedSampleFrames'
+ 'UniformSampleFrames', 'UntrimmedSampleFrames', 'MMUniformSampleFrames',
+ 'MMDecode', 'MMCompact'
]
diff --git a/mmaction/datasets/transforms/formatting.py b/mmaction/datasets/transforms/formatting.py
index 7477f51080..bdcc75ffb5 100644
--- a/mmaction/datasets/transforms/formatting.py
+++ b/mmaction/datasets/transforms/formatting.py
@@ -1,5 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict, Sequence
+from typing import Dict, Optional, Sequence, Tuple
import numpy as np
import torch
@@ -38,9 +38,11 @@ class PackActionInputs(BaseTransform):
def __init__(
self,
+ collect_keys: Optional[Tuple[str]] = None,
meta_keys: Sequence[str] = ('img_shape', 'img_key', 'video_id',
'timestamp')
) -> None:
+ self.collect_keys = collect_keys
self.meta_keys = meta_keys
def transform(self, results: Dict) -> Dict:
@@ -53,19 +55,27 @@ def transform(self, results: Dict) -> Dict:
dict: The result dict.
"""
packed_results = dict()
- if 'imgs' in results:
- imgs = results['imgs']
- packed_results['inputs'] = to_tensor(imgs)
- elif 'keypoint' in results:
- keypoint = results['keypoint']
- packed_results['inputs'] = to_tensor(keypoint)
- elif 'audios' in results:
- audios = results['audios']
- packed_results['inputs'] = to_tensor(audios)
+ if self.collect_keys is not None:
+ packed_results['inputs'] = dict()
+ for key in self.collect_keys:
+ packed_results['inputs'][key] = to_tensor(results[key])
else:
- raise ValueError(
- 'Cannot get `imgs`, `keypoint` or `audios` in the input dict '
- 'of `PackActionInputs`.')
+ if 'imgs' in results:
+ imgs = results['imgs']
+ packed_results['inputs'] = to_tensor(imgs)
+ elif 'heatmap_imgs' in results:
+ heatmap_imgs = results['heatmap_imgs']
+ packed_results['inputs'] = to_tensor(heatmap_imgs)
+ elif 'keypoint' in results:
+ keypoint = results['keypoint']
+ packed_results['inputs'] = to_tensor(keypoint)
+ elif 'audios' in results:
+ audios = results['audios']
+ packed_results['inputs'] = to_tensor(audios)
+ else:
+ raise ValueError(
+ 'Cannot get `imgs`, `keypoint`, `heatmap_imgs` '
+ 'or `audios` in the input dict of `PackActionInputs`.')
data_sample = ActionDataSample()
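For reference, a minimal sketch of what the new `collect_keys` option does: instead of picking a single array, it packs every listed modality into a dict of tensors. The shapes and the `torch.from_numpy` call below stand in for the internal `to_tensor` conversion and are illustrative only.

```python
import numpy as np
import torch

# Illustrative stand-in for a multi-modal sample produced by the pipeline.
results = {
    'imgs': np.zeros((1, 3, 8, 64, 64), dtype=np.float32),
    'heatmap_imgs': np.zeros((1, 17, 32, 56, 56), dtype=np.float32),
}

collect_keys = ('imgs', 'heatmap_imgs')
packed = {'inputs': {k: torch.from_numpy(results[k]) for k in collect_keys}}
# Without `collect_keys`, `inputs` would instead be a single tensor chosen
# from `imgs` / `heatmap_imgs` / `keypoint` / `audios`, in that priority.
print({k: v.shape for k, v in packed['inputs'].items()})
```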
@@ -91,7 +101,8 @@ def transform(self, results: Dict) -> Dict:
def __repr__(self) -> str:
repr_str = self.__class__.__name__
- repr_str += f'(meta_keys={self.meta_keys})'
+ repr_str += f'(collect_keys={self.collect_keys}, '
+ repr_str += f'meta_keys={self.meta_keys})'
return repr_str
@@ -178,16 +189,20 @@ class FormatShape(BaseTransform):
"""Format final imgs shape to the given input_format.
Required keys:
- - imgs
+ - imgs (optional)
+ - heatmap_imgs (optional)
- num_clips
- clip_len
Modified Keys:
- - img
- - input_shape
+ - imgs (optional)
+ - input_shape (optional)
+
+ Added Keys:
+ - heatmap_input_shape (optional)
Args:
- input_format (str): Define the final imgs format.
+ input_format (str): Define the final data format.
collapse (bool): To collapse input_format N... to ... (NCTHW to CTHW,
etc.) if N is 1. Should be set as True when training and testing
detectors. Defaults to False.
@@ -196,11 +211,13 @@ class FormatShape(BaseTransform):
def __init__(self, input_format: str, collapse: bool = False) -> None:
self.input_format = input_format
self.collapse = collapse
- if self.input_format not in ['NCTHW', 'NCHW', 'NCHW_Flow', 'NPTCHW']:
+ if self.input_format not in [
+ 'NCTHW', 'NCHW', 'NCHW_Flow', 'NCTHW_Heatmap', 'NPTCHW'
+ ]:
raise ValueError(
f'The input format {self.input_format} is invalid.')
- def transform(self, results: dict) -> dict:
+ def transform(self, results: Dict) -> Dict:
"""Performs the FormatShape formatting.
Args:
@@ -209,26 +226,69 @@ def transform(self, results: dict) -> dict:
"""
if not isinstance(results['imgs'], np.ndarray):
results['imgs'] = np.array(results['imgs'])
- imgs = results['imgs']
+
# [M x H x W x C]
# M = 1 * N_crops * N_clips * T
if self.collapse:
assert results['num_clips'] == 1
if self.input_format == 'NCTHW':
+ if 'imgs' in results:
+ imgs = results['imgs']
+ num_clips = results['num_clips']
+ clip_len = results['clip_len']
+ if isinstance(clip_len, dict):
+ clip_len = clip_len['RGB']
+
+ imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:])
+ # N_crops x N_clips x T x H x W x C
+ imgs = np.transpose(imgs, (0, 1, 5, 2, 3, 4))
+ # N_crops x N_clips x C x T x H x W
+ imgs = imgs.reshape((-1, ) + imgs.shape[2:])
+ # M' x C x T x H x W
+ # M' = N_crops x N_clips
+ results['imgs'] = imgs
+ results['input_shape'] = imgs.shape
+
+ if 'heatmap_imgs' in results:
+ imgs = results['heatmap_imgs']
+ num_clips = results['num_clips']
+ clip_len = results['clip_len']
+ # clip_len must be a dict
+ clip_len = clip_len['Pose']
+
+ imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:])
+ # N_crops x N_clips x T x C x H x W
+ imgs = np.transpose(imgs, (0, 1, 3, 2, 4, 5))
+ # N_crops x N_clips x C x T x H x W
+ imgs = imgs.reshape((-1, ) + imgs.shape[2:])
+ # M' x C x T x H x W
+ # M' = N_crops x N_clips
+ results['heatmap_imgs'] = imgs
+ results['heatmap_input_shape'] = imgs.shape
+
+ elif self.input_format == 'NCTHW_Heatmap':
num_clips = results['num_clips']
clip_len = results['clip_len']
+ imgs = results['imgs']
imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:])
- # N_crops x N_clips x T x H x W x C
- imgs = np.transpose(imgs, (0, 1, 5, 2, 3, 4))
+ # N_crops x N_clips x T x C x H x W
+ imgs = np.transpose(imgs, (0, 1, 3, 2, 4, 5))
# N_crops x N_clips x C x T x H x W
imgs = imgs.reshape((-1, ) + imgs.shape[2:])
# M' x C x T x H x W
# M' = N_crops x N_clips
+ results['imgs'] = imgs
+ results['input_shape'] = imgs.shape
+
elif self.input_format == 'NCHW':
+ imgs = results['imgs']
imgs = np.transpose(imgs, (0, 3, 1, 2))
# M x C x H x W
+ results['imgs'] = imgs
+ results['input_shape'] = imgs.shape
+
elif self.input_format == 'NCHW_Flow':
num_imgs = len(results['imgs'])
assert num_imgs % 2 == 0
@@ -252,26 +312,31 @@ def transform(self, results: dict) -> dict:
# M' x C' x H x W
# M' = N_crops x N_clips
# C' = T x C
+ results['imgs'] = imgs
+ results['input_shape'] = imgs.shape
+
elif self.input_format == 'NPTCHW':
num_proposals = results['num_proposals']
num_clips = results['num_clips']
clip_len = results['clip_len']
+ imgs = results['imgs']
imgs = imgs.reshape((num_proposals, num_clips * clip_len) +
imgs.shape[1:])
# P x M x H x W x C
# M = N_clips x T
imgs = np.transpose(imgs, (0, 1, 4, 2, 3))
# P x M x C x H x W
+ results['imgs'] = imgs
+ results['input_shape'] = imgs.shape
if self.collapse:
- assert imgs.shape[0] == 1
- imgs = imgs.squeeze(0)
+ assert results['imgs'].shape[0] == 1
+ results['imgs'] = results['imgs'].squeeze(0)
+ results['input_shape'] = results['imgs'].shape
- results['imgs'] = imgs
- results['input_shape'] = imgs.shape
return results
- def __repr__(self):
+ def __repr__(self) -> str:
repr_str = self.__class__.__name__
repr_str += f"(input_format='{self.input_format}')"
return repr_str
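A small, self-contained sketch of the `NCTHW` reshape path above, using a fabricated clip so the intermediate shapes are easy to follow (when `clip_len` is a dict, the `'RGB'` entry is used for `imgs` and the `'Pose'` entry for `heatmap_imgs`):

```python
import numpy as np

# Dummy pipeline output: M x H x W x C with M = N_crops * N_clips * T.
num_clips, clip_len, n_crops = 2, 4, 1
imgs = np.random.rand(n_crops * num_clips * clip_len, 8, 8, 3)

imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:])
# N_crops x N_clips x T x H x W x C
imgs = np.transpose(imgs, (0, 1, 5, 2, 3, 4))
# N_crops x N_clips x C x T x H x W
imgs = imgs.reshape((-1, ) + imgs.shape[2:])
# M' x C x T x H x W with M' = N_crops x N_clips
print(imgs.shape)  # (2, 3, 4, 8, 8)
```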
diff --git a/mmaction/datasets/transforms/loading.py b/mmaction/datasets/transforms/loading.py
index 8305a490b8..10309b2516 100644
--- a/mmaction/datasets/transforms/loading.py
+++ b/mmaction/datasets/transforms/loading.py
@@ -4,7 +4,7 @@
import os
import os.path as osp
import shutil
-from typing import Optional, Union
+from typing import Dict, List, Optional, Union
import mmcv
import numpy as np
@@ -356,7 +356,7 @@ def transform(self, results: dict) -> dict:
total_frames = results['total_frames']
# if can't get fps, same value of `fps` and `target_fps`
# will perform nothing
- fps = results.get('fps')
+ fps = results.get('avg_fps')
if self.target_fps is None or not fps:
fps_scale_ratio = 1.0
else:
@@ -1077,29 +1077,35 @@ class DecordInit(BaseTransform):
Decord: https://github.com/dmlc/decord
- Required keys are "filename",
- added or modified keys are "video_reader" and "total_frames".
+ Required Keys:
+
+ - filename
+
+ Added Keys:
+
+ - video_reader
+ - total_frames
+ - avg_fps
Args:
io_backend (str): io backend where frames are store.
- Default: 'disk'.
- num_threads (int): Number of thread to decode the video. Default: 1.
+ Defaults to ``'disk'``.
+ num_threads (int): Number of threads to decode the video. Defaults to 1.
kwargs (dict): Args for file client.
"""
- def __init__(self, io_backend='disk', num_threads=1, **kwargs):
+ def __init__(self,
+ io_backend: str = 'disk',
+ num_threads: int = 1,
+ **kwargs) -> None:
self.io_backend = io_backend
self.num_threads = num_threads
self.kwargs = kwargs
self.file_client = None
- def transform(self, results):
- """Perform the Decord initialization.
-
- Args:
- results (dict): The resulting dict to be modified and passed
- to the next transform in pipeline.
- """
+ def _get_video_reader(self, filename: str) -> object:
+ if osp.splitext(filename)[0] == filename:
+ filename = filename + '.mp4'
try:
import decord
except ImportError:
@@ -1108,15 +1114,27 @@ def transform(self, results):
if self.file_client is None:
self.file_client = FileClient(self.io_backend, **self.kwargs)
-
- file_obj = io.BytesIO(self.file_client.get(results['filename']))
+ file_obj = io.BytesIO(self.file_client.get(filename))
container = decord.VideoReader(file_obj, num_threads=self.num_threads)
- results['fps'] = container.get_avg_fps()
- results['video_reader'] = container
+ return container
+
+ def transform(self, results: Dict) -> Dict:
+ """Perform the Decord initialization.
+
+ Args:
+ results (dict): The result dict.
+
+ Returns:
+ dict: The result dict.
+ """
+ container = self._get_video_reader(results['filename'])
results['total_frames'] = len(container)
+
+ results['video_reader'] = container
+ results['avg_fps'] = container.get_avg_fps()
return results
- def __repr__(self):
+ def __repr__(self) -> str:
repr_str = (f'{self.__class__.__name__}('
f'io_backend={self.io_backend}, '
f'num_threads={self.num_threads})')
@@ -1129,35 +1147,32 @@ class DecordDecode(BaseTransform):
Decord: https://github.com/dmlc/decord
- Required keys are "video_reader", "filename" and "frame_inds",
- added or modified keys are "imgs" and "original_shape".
+ Required Keys:
+
+ - video_reader
+ - frame_inds
+
+ Added Keys:
+
+ - imgs
+ - original_shape
+ - img_shape
Args:
mode (str): Decoding mode. Options are 'accurate' and 'efficient'.
If set to 'accurate', it will decode videos into accurate frames.
If set to 'efficient', it will adopt fast seeking but only return
key frames, which may be duplicated and inaccurate, and more
- suitable for large scene-based video datasets. Default: 'accurate'.
+ suitable for large scene-based video datasets.
+ Defaults to ``'accurate'``.
"""
- def __init__(self, mode='accurate'):
+ def __init__(self, mode: str = 'accurate') -> None:
self.mode = mode
assert mode in ['accurate', 'efficient']
- def transform(self, results):
- """Perform the Decord decoding.
-
- Args:
- results (dict): The resulting dict to be modified and passed
- to the next transform in pipeline.
- """
- container = results['video_reader']
-
- if results['frame_inds'].ndim != 1:
- results['frame_inds'] = np.squeeze(results['frame_inds'])
-
- frame_inds = results['frame_inds']
-
+ def _decord_load_frames(self, container: object,
+ frame_inds: np.ndarray) -> List[np.ndarray]:
if self.mode == 'accurate':
imgs = container.get_batch(frame_inds).asnumpy()
imgs = list(imgs)
@@ -1169,6 +1184,24 @@ def transform(self, results):
container.seek(idx)
frame = container.next()
imgs.append(frame.asnumpy())
+ return imgs
+
+ def transform(self, results: Dict) -> Dict:
+ """Perform the Decord decoding.
+
+ Args:
+ results (dict): The result dict.
+
+ Returns:
+ dict: The result dict.
+ """
+ container = results['video_reader']
+
+ if results['frame_inds'].ndim != 1:
+ results['frame_inds'] = np.squeeze(results['frame_inds'])
+
+ frame_inds = results['frame_inds']
+ imgs = self._decord_load_frames(container, frame_inds)
results['video_reader'] = None
del container
@@ -1179,7 +1212,7 @@ def transform(self, results):
return results
- def __repr__(self):
+ def __repr__(self) -> str:
repr_str = f'{self.__class__.__name__}(mode={self.mode})'
return repr_str
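A quick sketch of why the key was renamed: `DecordInit` now writes `avg_fps`, and `SampleFrames` reads the same key to compute the fps scale ratio when `target_fps` is set. The numbers below are made up.

```python
# If the source clip runs at 60 fps and `target_fps` is 30, sampled frame
# intervals are stretched by 2x; with no `avg_fps` or no `target_fps`, the
# ratio stays 1.0 and sampling is unchanged.
results = {'avg_fps': 60.0, 'total_frames': 300}
target_fps = 30

fps = results.get('avg_fps')
if target_fps is None or not fps:
    fps_scale_ratio = 1.0
else:
    fps_scale_ratio = fps / target_fps

print(fps_scale_ratio)  # 2.0
```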
diff --git a/mmaction/datasets/transforms/pose_transforms.py b/mmaction/datasets/transforms/pose_transforms.py
index 1740a18575..0abb987551 100644
--- a/mmaction/datasets/transforms/pose_transforms.py
+++ b/mmaction/datasets/transforms/pose_transforms.py
@@ -1,7 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
import copy as cp
import pickle
-from typing import Dict, List, Tuple
+from typing import Dict, List, Optional, Tuple, Union
import numpy as np
from mmcv.transforms import BaseTransform, KeyMapper
@@ -11,7 +11,8 @@
from torch.nn.modules.utils import _pair
from mmaction.registry import TRANSFORMS
-from .processing import Flip, _combine_quadruple
+from .loading import DecordDecode, DecordInit
+from .processing import _combine_quadruple
@TRANSFORMS.register_module()
@@ -172,42 +173,65 @@ def __repr__(self):
class GeneratePoseTarget(BaseTransform):
"""Generate pseudo heatmaps based on joint coordinates and confidence.
- Required keys are "keypoint", "img_shape", "keypoint_score" (optional),
- added or modified keys are "imgs".
+ Required Keys:
+
+ - keypoint
+ - keypoint_score (optional)
+ - img_shape
+
+ Added Keys:
+
+ - imgs (optional)
+ - heatmap_imgs (optional)
Args:
- sigma (float): The sigma of the generated gaussian map. Default: 0.6.
+ sigma (float): The sigma of the generated gaussian map.
+ Defaults to 0.6.
use_score (bool): Use the confidence score of keypoints as the maximum
- of the gaussian maps. Default: True.
- with_kp (bool): Generate pseudo heatmaps for keypoints. Default: True.
+ of the gaussian maps. Defaults to True.
+ with_kp (bool): Generate pseudo heatmaps for keypoints.
+ Defaults to True.
with_limb (bool): Generate pseudo heatmaps for limbs. At least one of
- 'with_kp' and 'with_limb' should be True. Default: False.
+ 'with_kp' and 'with_limb' should be True. Defaults to False.
skeletons (tuple[tuple]): The definition of human skeletons.
- Default: ((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7), (7, 9),
- (0, 6), (6, 8), (8, 10), (5, 11), (11, 13), (13, 15),
- (6, 12), (12, 14), (14, 16), (11, 12)),
+ Defaults to ``((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7),
+ (7, 9), (0, 6), (6, 8), (8, 10), (5, 11), (11, 13),
+ (13, 15), (6, 12), (12, 14), (14, 16), (11, 12))``,
which is the definition of COCO-17p skeletons.
double (bool): Output both original heatmaps and flipped heatmaps.
- Default: False.
+ Defaults to False.
left_kp (tuple[int]): Indexes of left keypoints, which is used when
- flipping heatmaps. Default: (1, 3, 5, 7, 9, 11, 13, 15),
+ flipping heatmaps. Defaults to (1, 3, 5, 7, 9, 11, 13, 15),
which is left keypoints in COCO-17p.
right_kp (tuple[int]): Indexes of right keypoints, which is used when
- flipping heatmaps. Default: (2, 4, 6, 8, 10, 12, 14, 16),
+ flipping heatmaps. Defaults to (2, 4, 6, 8, 10, 12, 14, 16),
which is right keypoints in COCO-17p.
+ left_limb (tuple[int]): Indexes of left limbs, which is used when
+ flipping heatmaps. Defaults to (0, 2, 4, 5, 6, 10, 11, 12),
+ which are the left limbs of the skeletons defined for COCO-17p.
+ right_limb (tuple[int]): Indexes of right limbs, which is used when
+ flipping heatmaps. Defaults to (1, 3, 7, 8, 9, 13, 14, 15),
+ which are the right limbs of the skeletons defined for COCO-17p.
+ scaling (float): The ratio to scale the heatmaps. Defaults to 1.
"""
def __init__(self,
- sigma=0.6,
- use_score=True,
- with_kp=True,
- with_limb=False,
- skeletons=((0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (5, 7),
- (7, 9), (0, 6), (6, 8), (8, 10), (5, 11), (11, 13),
- (13, 15), (6, 12), (12, 14), (14, 16), (11, 12)),
- double=False,
- left_kp=(1, 3, 5, 7, 9, 11, 13, 15),
- right_kp=(2, 4, 6, 8, 10, 12, 14, 16)):
+ sigma: float = 0.6,
+ use_score: bool = True,
+ with_kp: bool = True,
+ with_limb: bool = False,
+ skeletons: Tuple[Tuple[int]] = ((0, 1), (0, 2), (1, 3),
+ (2, 4), (0, 5), (5, 7),
+ (7, 9), (0, 6), (6, 8),
+ (8, 10), (5, 11), (11, 13),
+ (13, 15), (6, 12), (12, 14),
+ (14, 16), (11, 12)),
+ double: bool = False,
+ left_kp: Tuple[int] = (1, 3, 5, 7, 9, 11, 13, 15),
+ right_kp: Tuple[int] = (2, 4, 6, 8, 10, 12, 14, 16),
+ left_limb: Tuple[int] = (0, 2, 4, 5, 6, 10, 11, 12),
+ right_limb: Tuple[int] = (1, 3, 7, 8, 9, 13, 14, 15),
+ scaling: float = 1.) -> None:
self.sigma = sigma
self.use_score = use_score
@@ -224,29 +248,30 @@ def __init__(self,
self.left_kp = left_kp
self.right_kp = right_kp
self.skeletons = skeletons
+ self.left_limb = left_limb
+ self.right_limb = right_limb
+ self.scaling = scaling
- def generate_a_heatmap(self, img_h, img_w, centers, sigma, max_values):
+ def generate_a_heatmap(self, arr: np.ndarray, centers: np.ndarray,
+ max_values: np.ndarray) -> None:
"""Generate pseudo heatmap for one keypoint in one frame.
Args:
- img_h (int): The height of the heatmap.
- img_w (int): The width of the heatmap.
+ arr (np.ndarray): The array to store the generated heatmaps.
+ Shape: img_h * img_w.
centers (np.ndarray): The coordinates of corresponding keypoints
- (of multiple persons).
- sigma (float): The sigma of generated gaussian.
- max_values (np.ndarray): The max values of each keypoint.
-
- Returns:
- np.ndarray: The generated pseudo heatmap.
+ (of multiple persons). Shape: M * 2.
+ max_values (np.ndarray): The max values of each keypoint. Shape: M.
"""
- heatmap = np.zeros([img_h, img_w], dtype=np.float32)
+ sigma = self.sigma
+ img_h, img_w = arr.shape
for center, max_value in zip(centers, max_values):
- mu_x, mu_y = center[0], center[1]
if max_value < self.eps:
continue
+ mu_x, mu_y = center[0], center[1]
st_x = max(int(mu_x - 3 * sigma), 0)
ed_x = min(int(mu_x + 3 * sigma) + 1, img_w)
st_y = max(int(mu_y - 3 * sigma), 0)
@@ -261,34 +286,29 @@ def generate_a_heatmap(self, img_h, img_w, centers, sigma, max_values):
patch = np.exp(-((x - mu_x)**2 + (y - mu_y)**2) / 2 / sigma**2)
patch = patch * max_value
- heatmap[st_y:ed_y,
- st_x:ed_x] = np.maximum(heatmap[st_y:ed_y, st_x:ed_x],
- patch)
-
- return heatmap
+ arr[st_y:ed_y, st_x:ed_x] = \
+ np.maximum(arr[st_y:ed_y, st_x:ed_x], patch)
- def generate_a_limb_heatmap(self, img_h, img_w, starts, ends, sigma,
- start_values, end_values):
+ def generate_a_limb_heatmap(self, arr: np.ndarray, starts: np.ndarray,
+ ends: np.ndarray, start_values: np.ndarray,
+ end_values: np.ndarray) -> None:
"""Generate pseudo heatmap for one limb in one frame.
Args:
- img_h (int): The height of the heatmap.
- img_w (int): The width of the heatmap.
+ arr (np.ndarray): The array to store the generated heatmaps.
+ Shape: img_h * img_w.
starts (np.ndarray): The coordinates of one keypoint in the
- corresponding limbs (of multiple persons).
+ corresponding limbs. Shape: M * 2.
ends (np.ndarray): The coordinates of the other keypoint in the
- corresponding limbs (of multiple persons).
- sigma (float): The sigma of generated gaussian.
+ corresponding limbs. Shape: M * 2.
start_values (np.ndarray): The max values of one keypoint in the
- corresponding limbs.
- end_values (np.ndarray): The max values of the other keypoint in
- the corresponding limbs.
-
- Returns:
- np.ndarray: The generated pseudo heatmap.
+ corresponding limbs. Shape: M.
+ end_values (np.ndarray): The max values of the other keypoint
+ in the corresponding limbs. Shape: M.
"""
- heatmap = np.zeros([img_h, img_w], dtype=np.float32)
+ sigma = self.sigma
+ img_h, img_w = arr.shape
for start, end, start_value, end_value in zip(starts, ends,
start_values,
@@ -325,9 +345,7 @@ def generate_a_limb_heatmap(self, img_h, img_w, starts, ends, sigma,
d2_ab = ((start[0] - end[0])**2 + (start[1] - end[1])**2)
if d2_ab < 1:
- full_map = self.generate_a_heatmap(img_h, img_w, [start],
- sigma, [start_value])
- heatmap = np.maximum(heatmap, full_map)
+ self.generate_a_heatmap(arr, start[None], start_value[None])
continue
coeff = (d2_start - d2_end + d2_ab) / 2. / d2_ab
@@ -348,61 +366,50 @@ def generate_a_limb_heatmap(self, img_h, img_w, starts, ends, sigma,
patch = np.exp(-d2_seg / 2. / sigma**2)
patch = patch * value_coeff
- heatmap[min_y:max_y, min_x:max_x] = np.maximum(
- heatmap[min_y:max_y, min_x:max_x], patch)
-
- return heatmap
+ arr[min_y:max_y, min_x:max_x] = \
+ np.maximum(arr[min_y:max_y, min_x:max_x], patch)
- def generate_heatmap(self, img_h, img_w, kps, sigma, max_values):
+ def generate_heatmap(self, arr: np.ndarray, kps: np.ndarray,
+ max_values: np.ndarray) -> None:
"""Generate pseudo heatmap for all keypoints and limbs in one frame (if
needed).
Args:
- img_h (int): The height of the heatmap.
- img_w (int): The width of the heatmap.
+ arr (np.ndarray): The array to store the generated heatmaps.
+ Shape: V * img_h * img_w.
kps (np.ndarray): The coordinates of keypoints in this frame.
- sigma (float): The sigma of generated gaussian.
+ Shape: M * V * 2.
max_values (np.ndarray): The confidence score of each keypoint.
-
- Returns:
- np.ndarray: The generated pseudo heatmap.
+ Shape: M * V.
"""
- heatmaps = []
if self.with_kp:
num_kp = kps.shape[1]
for i in range(num_kp):
- heatmap = self.generate_a_heatmap(img_h, img_w, kps[:, i],
- sigma, max_values[:, i])
- heatmaps.append(heatmap)
+ self.generate_a_heatmap(arr[i], kps[:, i], max_values[:, i])
if self.with_limb:
- for limb in self.skeletons:
+ for i, limb in enumerate(self.skeletons):
start_idx, end_idx = limb
starts = kps[:, start_idx]
ends = kps[:, end_idx]
start_values = max_values[:, start_idx]
end_values = max_values[:, end_idx]
- heatmap = self.generate_a_limb_heatmap(img_h, img_w, starts,
- ends, sigma,
- start_values,
- end_values)
- heatmaps.append(heatmap)
-
- return np.stack(heatmaps, axis=-1)
+ self.generate_a_limb_heatmap(arr[i], starts, ends,
+ start_values, end_values)
- def gen_an_aug(self, results):
+ def gen_an_aug(self, results: Dict) -> np.ndarray:
"""Generate pseudo heatmaps for all frames.
Args:
results (dict): The dictionary that contains all info of a sample.
Returns:
- list[np.ndarray]: The generated pseudo heatmaps.
+ np.ndarray: The generated pseudo heatmaps.
"""
- all_kps = results['keypoint']
+ all_kps = results['keypoint'].astype(np.float32)
kp_shape = all_kps.shape
if 'keypoint_score' in results:
@@ -411,43 +418,54 @@ def gen_an_aug(self, results):
all_kpscores = np.ones(kp_shape[:-1], dtype=np.float32)
img_h, img_w = results['img_shape']
+
+ # scale img_h, img_w and kps
+ img_h = int(img_h * self.scaling + 0.5)
+ img_w = int(img_w * self.scaling + 0.5)
+ all_kps[..., :2] *= self.scaling
+
num_frame = kp_shape[1]
+ num_c = 0
+ if self.with_kp:
+ num_c += all_kps.shape[2]
+ if self.with_limb:
+ num_c += len(self.skeletons)
+
+ ret = np.zeros([num_frame, num_c, img_h, img_w], dtype=np.float32)
- imgs = []
for i in range(num_frame):
- sigma = self.sigma
+ # M, V, C
kps = all_kps[:, i]
- kpscores = all_kpscores[:, i]
-
- max_values = np.ones(kpscores.shape, dtype=np.float32)
- if self.use_score:
- max_values = kpscores
-
- hmap = self.generate_heatmap(img_h, img_w, kps, sigma, max_values)
- imgs.append(hmap)
+ # M, V
+ kpscores = all_kpscores[:, i] if self.use_score else \
+ np.ones_like(all_kpscores[:, i])
- return imgs
+ self.generate_heatmap(ret[i], kps, kpscores)
+ return ret
- def transform(self, results):
+ def transform(self, results: Dict) -> Dict:
"""Generate pseudo heatmaps based on joint coordinates and confidence.
Args:
results (dict): The resulting dict to be modified and passed
to the next transform in pipeline.
"""
- if not self.double:
- results['imgs'] = np.stack(self.gen_an_aug(results))
- else:
- results_ = cp.deepcopy(results)
- flip = Flip(
- flip_ratio=1, left_kp=self.left_kp, right_kp=self.right_kp)
- results_ = flip(results_)
- results['imgs'] = np.concatenate(
- [self.gen_an_aug(results),
- self.gen_an_aug(results_)])
+ heatmap = self.gen_an_aug(results)
+ key = 'heatmap_imgs' if 'imgs' in results else 'imgs'
+
+ if self.double:
+ indices = np.arange(heatmap.shape[1], dtype=np.int64)
+ left, right = (self.left_kp, self.right_kp) if self.with_kp else (
+ self.left_limb, self.right_limb)
+ for l, r in zip(left, right): # noqa: E741
+ indices[l] = r
+ indices[r] = l
+ heatmap_flip = heatmap[..., ::-1][:, indices]
+ heatmap = np.concatenate([heatmap, heatmap_flip])
+ results[key] = heatmap
return results
- def __repr__(self):
+ def __repr__(self) -> str:
repr_str = (f'{self.__class__.__name__}('
f'sigma={self.sigma}, '
f'use_score={self.use_score}, '
@@ -456,7 +474,10 @@ def __repr__(self):
f'skeletons={self.skeletons}, '
f'double={self.double}, '
f'left_kp={self.left_kp}, '
- f'right_kp={self.right_kp})')
+ f'right_kp={self.right_kp}, '
+ f'left_limb={self.left_limb}, '
+ f'right_limb={self.right_limb}, '
+ f'scaling={self.scaling})')
return repr_str
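For readers following the refactor, a hedged numpy sketch of the truncated gaussian that `generate_a_heatmap` now stamps into a pre-allocated channel with an in-place maximum (the confidence-threshold early exit is omitted for brevity; all values are illustrative):

```python
import numpy as np

sigma = 0.6
arr = np.zeros((8, 8), dtype=np.float32)          # one keypoint channel
centers = np.array([[3.0, 4.0]])                  # M x 2, (x, y) per person
max_values = np.array([0.9], dtype=np.float32)    # M confidences

img_h, img_w = arr.shape
for (mu_x, mu_y), max_value in zip(centers, max_values):
    # Only evaluate the gaussian on a 3-sigma window around the keypoint.
    st_x, ed_x = max(int(mu_x - 3 * sigma), 0), min(int(mu_x + 3 * sigma) + 1, img_w)
    st_y, ed_y = max(int(mu_y - 3 * sigma), 0), min(int(mu_y + 3 * sigma) + 1, img_h)
    x = np.arange(st_x, ed_x, 1, np.float32)
    y = np.arange(st_y, ed_y, 1, np.float32)[:, None]
    patch = np.exp(-((x - mu_x)**2 + (y - mu_y)**2) / 2 / sigma**2) * max_value
    # Combine overlapping persons by taking the element-wise maximum.
    arr[st_y:ed_y, st_x:ed_x] = np.maximum(arr[st_y:ed_y, st_x:ed_x], patch)

print(arr.max())  # 0.9 at (y=4, x=3)
```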
@@ -468,30 +489,38 @@ class PoseCompact(BaseTransform):
example, if 'padding == 0.25', then the expanded box has unchanged center,
and 1.25x width and height.
- Required keys in results are "img_shape", "keypoint", add or modified keys
- are "img_shape", "keypoint", "crop_quadruple".
+ Required Keys:
+
+ - keypoint
+ - img_shape
+
+ Modified Keys:
+
+ - img_shape
+ - keypoint
+
+ Added Keys:
+
+ - crop_quadruple
Args:
- padding (float): The padding size. Default: 0.25.
+ padding (float): The padding size. Defaults to 0.25.
threshold (int): The threshold for the tight bounding box. If the width
or height of the tight bounding box is smaller than the threshold,
- we do not perform the compact operation. Default: 10.
+ we do not perform the compact operation. Defaults to 10.
hw_ratio (float | tuple[float] | None): The hw_ratio of the expanded
box. Float indicates the specific ratio and tuple indicates a
ratio range. If set as None, it means there is no requirement on
- hw_ratio. Default: None.
+ hw_ratio. Defaults to None.
allow_imgpad (bool): Whether to allow expanding the box outside the
- image to meet the hw_ratio requirement. Default: True.
-
- Returns:
- type: Description of returned object.
+ image to meet the hw_ratio requirement. Defaults to True.
"""
def __init__(self,
- padding=0.25,
- threshold=10,
- hw_ratio=None,
- allow_imgpad=True):
+ padding: float = 0.25,
+ threshold: int = 10,
+ hw_ratio: Optional[Union[float, Tuple[float]]] = None,
+ allow_imgpad: bool = True) -> None:
self.padding = padding
self.threshold = threshold
@@ -503,7 +532,7 @@ def __init__(self,
self.allow_imgpad = allow_imgpad
assert self.padding >= 0
- def transform(self, results):
+ def transform(self, results: Dict) -> Dict:
"""Convert the coordinates of keypoints to make it more compact.
Args:
@@ -561,7 +590,7 @@ def transform(self, results):
results['crop_quadruple'] = crop_quadruple
return results
- def __repr__(self):
+ def __repr__(self) -> str:
repr_str = (f'{self.__class__.__name__}(padding={self.padding}, '
f'threshold={self.threshold}, '
f'hw_ratio={self.hw_ratio}, '
@@ -1157,7 +1186,7 @@ def transform(self, results: Dict) -> Dict:
transitional[i] = transitional[i - 1] = True
if num_persons[i] != num_persons[i + 1]:
transitional[i] = transitional[i + 1] = True
- inds_int = inds.astype(np.int)
+ inds_int = inds.astype(np.int64)
coeff = np.array([transitional[i] for i in inds_int])
inds = (coeff * inds_int + (1 - coeff) * inds).astype(np.float32)
@@ -1167,7 +1196,7 @@ def transform(self, results: Dict) -> Dict:
results['num_clips'] = self.num_clips
return results
- def __repr__(self):
+ def __repr__(self) -> str:
repr_str = (f'{self.__class__.__name__}('
f'clip_len={self.clip_len}, '
f'num_clips={self.num_clips}, '
@@ -1253,6 +1282,17 @@ class PoseDecode(BaseTransform):
- keypoint_score (optional)
"""
+ @staticmethod
+ def _load_kp(kp: np.ndarray, frame_inds: np.ndarray) -> np.ndarray:
+ """Load keypoints according to sampled indexes."""
+ return kp[:, frame_inds].astype(np.float32)
+
+ @staticmethod
+ def _load_kpscore(kpscore: np.ndarray,
+ frame_inds: np.ndarray) -> np.ndarray:
+ """Load keypoint scores according to sampled indexes."""
+ return kpscore[:, frame_inds].astype(np.float32)
+
def transform(self, results: Dict) -> Dict:
"""The transform function of :class:`PoseDecode`.
@@ -1274,16 +1314,256 @@ def transform(self, results: Dict) -> Dict:
offset = results.get('offset', 0)
frame_inds = results['frame_inds'] + offset
- results['keypoint'] = results['keypoint'][:, frame_inds].astype(
- np.float32)
-
if 'keypoint_score' in results:
- kpscore = results['keypoint_score']
- results['keypoint_score'] = kpscore[:,
- frame_inds].astype(np.float32)
+ results['keypoint_score'] = self._load_kpscore(
+ results['keypoint_score'], frame_inds)
+
+ results['keypoint'] = self._load_kp(results['keypoint'], frame_inds)
return results
def __repr__(self) -> str:
repr_str = f'{self.__class__.__name__}()'
return repr_str
+
+
+@TRANSFORMS.register_module()
+class MMUniformSampleFrames(UniformSampleFrames):
+ """Uniformly sample frames from the multi-modal data."""
+
+ def transform(self, results: Dict) -> Dict:
+ """The transform function of :class:`MMUniformSampleFrames`.
+
+ Args:
+ results (dict): The result dict.
+
+ Returns:
+ dict: The result dict.
+ """
+ num_frames = results['total_frames']
+ modalities = []
+ for modality, clip_len in self.clip_len.items():
+ if self.test_mode:
+ inds = self._get_test_clips(num_frames, clip_len)
+ else:
+ inds = self._get_train_clips(num_frames, clip_len)
+ inds = np.mod(inds, num_frames)
+ results[f'{modality}_inds'] = inds.astype(np.int32)
+ modalities.append(modality)
+ results['clip_len'] = self.clip_len
+ results['frame_interval'] = None
+ results['num_clips'] = self.num_clips
+ if not isinstance(results['modality'], list):
+ # replace the single-modality string with the list of sampled modalities
+ results['modality'] = modalities
+ return results
+
+
+@TRANSFORMS.register_module()
+class MMDecode(DecordInit, DecordDecode, PoseDecode):
+ """Decode RGB videos and skeletons."""
+
+ def __init__(self, io_backend: str = 'disk', **kwargs) -> None:
+ DecordInit.__init__(self, io_backend=io_backend, **kwargs)
+ DecordDecode.__init__(self)
+ self.io_backend = io_backend
+ self.kwargs = kwargs
+ self.file_client = None
+
+ def transform(self, results: Dict) -> Dict:
+ """The transform function of :class:`MMDecode`.
+
+ Args:
+ results (dict): The result dict.
+
+ Returns:
+ dict: The result dict.
+ """
+ for mod in results['modality']:
+ if results[f'{mod}_inds'].ndim != 1:
+ results[f'{mod}_inds'] = np.squeeze(results[f'{mod}_inds'])
+ frame_inds = results[f'{mod}_inds']
+ if mod == 'RGB':
+ if 'filename' not in results:
+ results['filename'] = results['frame_dir'] + '.mp4'
+ video_reader = self._get_video_reader(results['filename'])
+ imgs = self._decord_load_frames(video_reader, frame_inds)
+ del video_reader
+ results['imgs'] = imgs
+ elif mod == 'Pose':
+ assert 'keypoint' in results
+ if 'keypoint_score' not in results:
+ keypoint_score = [
+ np.ones(keypoint.shape[:-1], dtype=np.float32)
+ for keypoint in results['keypoint']
+ ]
+ results['keypoint_score'] = np.stack(keypoint_score)
+ results['keypoint'] = self._load_kp(results['keypoint'],
+ frame_inds)
+ results['keypoint_score'] = self._load_kpscore(
+ results['keypoint_score'], frame_inds)
+ else:
+ raise NotImplementedError(
+ f'MMDecode: Modality {mod} not supported')
+
+ # We need to scale human keypoints to the new image size
+ if 'imgs' in results and 'keypoint' in results:
+ real_img_shape = results['imgs'][0].shape[:2]
+ if real_img_shape != results['img_shape']:
+ oh, ow = results['img_shape']
+ nh, nw = real_img_shape
+
+ assert results['keypoint'].shape[-1] in [2, 3]
+ results['keypoint'][..., 0] *= (nw / ow)
+ results['keypoint'][..., 1] *= (nh / oh)
+ results['img_shape'] = real_img_shape
+ results['original_shape'] = real_img_shape
+
+ return results
+
+ def __repr__(self) -> str:
+ repr_str = (f'{self.__class__.__name__}('
+ f'io_backend={self.io_backend})')
+ return repr_str
+
+
+@TRANSFORMS.register_module()
+class MMCompact(BaseTransform):
+ """Convert the coordinates of keypoints and crop the images to make them
+ more compact.
+
+ Required Keys:
+
+ - imgs
+ - keypoint
+ - img_shape
+
+ Modified Keys:
+
+ - imgs
+ - keypoint
+ - img_shape
+
+ Args:
+ padding (float): The padding size. Defaults to 0.25.
+ threshold (int): The threshold for the tight bounding box. If the width
+ or height of the tight bounding box is smaller than the threshold,
+ we do not perform the compact operation. Defaults to 10.
+ hw_ratio (float | tuple[float]): The hw_ratio of the expanded
+ box. Float indicates the specific ratio and tuple indicates a
+ ratio range. If set as None, it means there is no requirement on
+ hw_ratio. Defaults to 1.
+ allow_imgpad (bool): Whether to allow expanding the box outside the
+ image to meet the hw_ratio requirement. Defaults to True.
+ """
+
+ def __init__(self,
+ padding: float = 0.25,
+ threshold: int = 10,
+ hw_ratio: Union[float, Tuple[float]] = 1,
+ allow_imgpad: bool = True) -> None:
+
+ self.padding = padding
+ self.threshold = threshold
+ if hw_ratio is not None:
+ hw_ratio = _pair(hw_ratio)
+ self.hw_ratio = hw_ratio
+ self.allow_imgpad = allow_imgpad
+ assert self.padding >= 0
+
+ def _get_box(self, keypoint: np.ndarray, img_shape: Tuple[int]) -> Tuple:
+ """Calculate the bounding box surrounding all joints in the frames."""
+ h, w = img_shape
+
+ kp_x = keypoint[..., 0]
+ kp_y = keypoint[..., 1]
+
+ min_x = np.min(kp_x[kp_x != 0], initial=np.Inf)
+ min_y = np.min(kp_y[kp_y != 0], initial=np.Inf)
+ max_x = np.max(kp_x[kp_x != 0], initial=-np.Inf)
+ max_y = np.max(kp_y[kp_y != 0], initial=-np.Inf)
+
+ # The compact area is too small
+ if max_x - min_x < self.threshold or max_y - min_y < self.threshold:
+ return 0, 0, w, h
+
+ center = ((max_x + min_x) / 2, (max_y + min_y) / 2)
+ half_width = (max_x - min_x) / 2 * (1 + self.padding)
+ half_height = (max_y - min_y) / 2 * (1 + self.padding)
+
+ if self.hw_ratio is not None:
+ half_height = max(self.hw_ratio[0] * half_width, half_height)
+ half_width = max(1 / self.hw_ratio[1] * half_height, half_width)
+
+ min_x, max_x = center[0] - half_width, center[0] + half_width
+ min_y, max_y = center[1] - half_height, center[1] + half_height
+
+ # hot update
+ if not self.allow_imgpad:
+ min_x, min_y = int(max(0, min_x)), int(max(0, min_y))
+ max_x, max_y = int(min(w, max_x)), int(min(h, max_y))
+ else:
+ min_x, min_y = int(min_x), int(min_y)
+ max_x, max_y = int(max_x), int(max_y)
+ return min_x, min_y, max_x, max_y
+
+ def _compact_images(self, imgs: List[np.ndarray], img_shape: Tuple[int],
+ box: Tuple[int]) -> List:
+ """Crop the images acoordding the bounding box."""
+ h, w = img_shape
+ min_x, min_y, max_x, max_y = box
+ pad_l, pad_u, pad_r, pad_d = 0, 0, 0, 0
+ if min_x < 0:
+ pad_l = -min_x
+ min_x, max_x = 0, max_x + pad_l
+ w += pad_l
+ if min_y < 0:
+ pad_u = -min_y
+ min_y, max_y = 0, max_y + pad_u
+ h += pad_u
+ if max_x > w:
+ pad_r = max_x - w
+ w = max_x
+ if max_y > h:
+ pad_d = max_y - h
+ h = max_y
+
+ if pad_l > 0 or pad_r > 0 or pad_u > 0 or pad_d > 0:
+ imgs = [
+ np.pad(img, ((pad_u, pad_d), (pad_l, pad_r), (0, 0)))
+ for img in imgs
+ ]
+ imgs = [img[min_y:max_y, min_x:max_x] for img in imgs]
+ return imgs
+
+ def transform(self, results: Dict) -> Dict:
+ """The transform function of :class:`MMCompact`.
+
+ Args:
+ results (dict): The result dict.
+
+ Returns:
+ dict: The result dict.
+ """
+ img_shape = results['img_shape']
+ kp = results['keypoint']
+ # Make NaN zero
+ kp[np.isnan(kp)] = 0.
+ min_x, min_y, max_x, max_y = self._get_box(kp, img_shape)
+
+ kp_x, kp_y = kp[..., 0], kp[..., 1]
+ kp_x[kp_x != 0] -= min_x
+ kp_y[kp_y != 0] -= min_y
+
+ new_shape = (max_y - min_y, max_x - min_x)
+ results['img_shape'] = new_shape
+ results['imgs'] = self._compact_images(results['imgs'], img_shape,
+ (min_x, min_y, max_x, max_y))
+ return results
+
+ def __repr__(self) -> str:
+ repr_str = (f'{self.__class__.__name__}(padding={self.padding}, '
+ f'threshold={self.threshold}, '
+ f'hw_ratio={self.hw_ratio}, '
+ f'allow_imgpad={self.allow_imgpad})')
+ return repr_str
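A simplified sketch of the cropping idea behind `MMCompact`: compute the joint-tight box, expand it by `padding`, crop the frames, and shift the keypoints into the new frame. The threshold, `hw_ratio`, and zero-keypoint handling of the real transform are left out, and the frame/keypoint values are fabricated:

```python
import numpy as np

# One 64x64 frame and one skeleton whose joints occupy a small region.
imgs = [np.zeros((64, 64, 3), dtype=np.uint8)]
keypoint = np.array([[[[20., 30.], [40., 50.], [25., 45.]]]])  # M x T x V x 2

kp_x, kp_y = keypoint[..., 0], keypoint[..., 1]
min_x, max_x = kp_x[kp_x != 0].min(), kp_x[kp_x != 0].max()
min_y, max_y = kp_y[kp_y != 0].min(), kp_y[kp_y != 0].max()

# Expand the tight box by 25% around its center.
padding = 0.25
center = ((max_x + min_x) / 2, (max_y + min_y) / 2)
half_w = (max_x - min_x) / 2 * (1 + padding)
half_h = (max_y - min_y) / 2 * (1 + padding)
box = (int(center[0] - half_w), int(center[1] - half_h),
       int(center[0] + half_w), int(center[1] + half_h))

cropped = [img[box[1]:box[3], box[0]:box[2]] for img in imgs]
keypoint[..., 0] -= box[0]  # keypoints are shifted into the cropped frame
keypoint[..., 1] -= box[1]
print(cropped[0].shape, box)  # (25, 25, 3) (17, 27, 42, 52)
```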
diff --git a/mmaction/evaluation/functional/accuracy.py b/mmaction/evaluation/functional/accuracy.py
index 4b7f6dd52a..aa28bd486b 100644
--- a/mmaction/evaluation/functional/accuracy.py
+++ b/mmaction/evaluation/functional/accuracy.py
@@ -166,7 +166,7 @@ def mmit_mean_average_precision(scores, labels):
sample.
Returns:
- np.float: The MMIT style mean average precision.
+ np.float64: The MMIT style mean average precision.
"""
results = []
for score, label in zip(scores, labels):
@@ -186,7 +186,7 @@ def mean_average_precision(scores, labels):
sample.
Returns:
- np.float: The mean average precision.
+ np.float64: The mean average precision.
"""
results = []
scores = np.stack(scores).T
diff --git a/mmaction/evaluation/functional/ava_evaluation/metrics.py b/mmaction/evaluation/functional/ava_evaluation/metrics.py
index 4d566accb5..ffbe589454 100644
--- a/mmaction/evaluation/functional/ava_evaluation/metrics.py
+++ b/mmaction/evaluation/functional/ava_evaluation/metrics.py
@@ -35,7 +35,7 @@ def compute_precision_recall(scores, labels, num_gt):
instances. This value is None if no ground truth labels are
present.
"""
- if (not isinstance(labels, np.ndarray) or labels.dtype != np.bool
+ if (not isinstance(labels, np.ndarray) or labels.dtype != bool
or len(labels.shape) != 1):
raise ValueError('labels must be single dimension bool numpy array')
@@ -90,7 +90,7 @@ def compute_average_precision(precision, recall):
if not isinstance(precision, np.ndarray) or not isinstance(
recall, np.ndarray):
raise ValueError('precision and recall must be numpy array')
- if precision.dtype != np.float or recall.dtype != np.float:
+ if precision.dtype != np.float64 or recall.dtype != np.float64:
raise ValueError('input must be float numpy array.')
if len(precision) != len(recall):
raise ValueError('precision and recall must be of the same size.')
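Since `compute_average_precision` now insists on float64 inputs, here is the conventional precision-envelope integration over a PR curve, shown only as background for that check; the curve values are invented and this is not a copy of the (unchanged) implementation body:

```python
import numpy as np

precision = np.array([1.0, 1.0, 0.67, 0.75], dtype=np.float64)
recall = np.array([0.25, 0.5, 0.5, 0.75], dtype=np.float64)

# Prepend/append sentinels, enforce a monotonically decreasing precision
# envelope, then sum precision over each recall increment.
p = np.concatenate(([0.], precision, [0.]))
r = np.concatenate(([0.], recall, [1.]))
for i in range(len(p) - 2, -1, -1):
    p[i] = max(p[i], p[i + 1])
idx = np.where(r[1:] != r[:-1])[0]
average_precision = np.sum((r[idx + 1] - r[idx]) * p[idx + 1])
print(average_precision)  # 0.6875
```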
diff --git a/mmaction/evaluation/functional/ava_evaluation/object_detection_evaluation.py b/mmaction/evaluation/functional/ava_evaluation/object_detection_evaluation.py
deleted file mode 100644
index 1886521485..0000000000
--- a/mmaction/evaluation/functional/ava_evaluation/object_detection_evaluation.py
+++ /dev/null
@@ -1,574 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-"""object_detection_evaluation module.
-
-ObjectDetectionEvaluation is a class which manages ground truth information of
-a object detection dataset, and computes frequently used detection metrics such
-as Precision, Recall, CorLoc of the provided detection results.
-It supports the following operations:
-1) Add ground truth information of images sequentially.
-2) Add detection result of images sequentially.
-3) Evaluate detection metrics on already inserted detection results.
-4) Write evaluation result into a pickle file for future processing or
- visualization.
-
-Note: This module operates on numpy boxes and box lists.
-"""
-
-import collections
-import logging
-import warnings
-from abc import ABCMeta, abstractmethod
-from collections import defaultdict
-
-import numpy as np
-
-from . import metrics, per_image_evaluation, standard_fields
-
-
-class DetectionEvaluator:
- """Interface for object detection evaluation classes.
-
- Example usage of the Evaluator:
- ------------------------------
- evaluator = DetectionEvaluator(categories)
-
- # Detections and groundtruth for image 1.
- evaluator.add_single_groundtruth_image_info(...)
- evaluator.add_single_detected_image_info(...)
-
- # Detections and groundtruth for image 2.
- evaluator.add_single_groundtruth_image_info(...)
- evaluator.add_single_detected_image_info(...)
-
- metrics_dict = evaluator.evaluate()
- """
-
- __metaclass__ = ABCMeta
-
- def __init__(self, categories):
- """Constructor.
-
- Args:
- categories: A list of dicts, each of which has the following keys -
- 'id': (required) an integer id uniquely identifying this
- category.
- 'name': (required) string representing category name e.g.,
- 'cat', 'dog'.
- """
- self._categories = categories
-
- @abstractmethod
- def add_single_ground_truth_image_info(self, image_id, groundtruth_dict):
- """Adds groundtruth for a single image to be used for evaluation.
-
- Args:
- image_id: A unique string/integer identifier for the image.
- groundtruth_dict: A dictionary of groundtruth numpy arrays required
- for evaluations.
- """
-
- @abstractmethod
- def add_single_detected_image_info(self, image_id, detections_dict):
- """Adds detections for a single image to be used for evaluation.
-
- Args:
- image_id: A unique string/integer identifier for the image.
- detections_dict: A dictionary of detection numpy arrays required
- for evaluation.
- """
-
- @abstractmethod
- def evaluate(self):
- """Evaluates detections and returns a dictionary of metrics."""
-
- @abstractmethod
- def clear(self):
- """Clears the state to prepare for a fresh evaluation."""
-
-
-class ObjectDetectionEvaluator(DetectionEvaluator):
- """A class to evaluate detections."""
-
- def __init__(self,
- categories,
- matching_iou_threshold=0.5,
- evaluate_corlocs=False,
- metric_prefix=None,
- use_weighted_mean_ap=False,
- evaluate_masks=False):
- """Constructor.
-
- Args:
- categories: A list of dicts, each of which has the following keys -
- 'id': (required) an integer id uniquely identifying this
- category.
- 'name': (required) string representing category name e.g.,
- 'cat', 'dog'.
- matching_iou_threshold: IOU threshold to use for matching
- groundtruth boxes to detection boxes.
- evaluate_corlocs: (optional) boolean which determines if corloc
- scores are to be returned or not.
- metric_prefix: (optional) string prefix for metric name; if None,
- no prefix is used.
- use_weighted_mean_ap: (optional) boolean which determines if the
- mean average precision is computed directly from the scores and
- tp_fp_labels of all classes.
- evaluate_masks: If False, evaluation will be performed based on
- boxes. If True, mask evaluation will be performed instead.
-
- Raises:
- ValueError: If the category ids are not 1-indexed.
- """
- super(ObjectDetectionEvaluator, self).__init__(categories)
- self._num_classes = max([cat['id'] for cat in categories])
- if min(cat['id'] for cat in categories) < 1:
- raise ValueError('Classes should be 1-indexed.')
- self._matching_iou_threshold = matching_iou_threshold
- self._use_weighted_mean_ap = use_weighted_mean_ap
- self._label_id_offset = 1
- self._evaluate_masks = evaluate_masks
- self._evaluation = ObjectDetectionEvaluation(
- num_groundtruth_classes=self._num_classes,
- matching_iou_threshold=self._matching_iou_threshold,
- use_weighted_mean_ap=self._use_weighted_mean_ap,
- label_id_offset=self._label_id_offset,
- )
- self._image_ids = set([])
- self._evaluate_corlocs = evaluate_corlocs
- self._metric_prefix = (metric_prefix + '_') if metric_prefix else ''
-
- def add_single_ground_truth_image_info(self, image_id, groundtruth_dict):
- """Adds groundtruth for a single image to be used for evaluation.
-
- Args:
- image_id: A unique string/integer identifier for the image.
- groundtruth_dict: A dictionary containing -
- standard_fields.InputDataFields.groundtruth_boxes: float32
- numpy array of shape [num_boxes, 4] containing `num_boxes`
- groundtruth boxes of the format [ymin, xmin, ymax, xmax] in
- absolute image coordinates.
- standard_fields.InputDataFields.groundtruth_classes: integer
- numpy array of shape [num_boxes] containing 1-indexed
- groundtruth classes for the boxes.
- standard_fields.InputDataFields.groundtruth_instance_masks:
- Optional numpy array of shape [num_boxes, height, width]
- with values in {0, 1}.
-
- Raises:
- ValueError: On adding groundtruth for an image more than once. Will
- also raise error if instance masks are not in groundtruth
- dictionary.
- """
- if image_id in self._image_ids:
- raise ValueError(
- 'Image with id {} already added.'.format(image_id))
-
- groundtruth_classes = (
- groundtruth_dict[
- standard_fields.InputDataFields.groundtruth_classes] -
- self._label_id_offset)
-
- groundtruth_masks = None
- if self._evaluate_masks:
- if (standard_fields.InputDataFields.groundtruth_instance_masks
- not in groundtruth_dict):
- raise ValueError(
- 'Instance masks not in groundtruth dictionary.')
- groundtruth_masks = groundtruth_dict[
- standard_fields.InputDataFields.groundtruth_instance_masks]
- self._evaluation.add_single_ground_truth_image_info(
- image_key=image_id,
- groundtruth_boxes=groundtruth_dict[
- standard_fields.InputDataFields.groundtruth_boxes],
- groundtruth_class_labels=groundtruth_classes,
- groundtruth_masks=groundtruth_masks,
- )
- self._image_ids.update([image_id])
-
- def add_single_detected_image_info(self, image_id, detections_dict):
- """Adds detections for a single image to be used for evaluation.
-
- Args:
- image_id: A unique string/integer identifier for the image.
- detections_dict: A dictionary containing -
- standard_fields.DetectionResultFields.detection_boxes: float32
- numpy array of shape [num_boxes, 4] containing `num_boxes`
- detection boxes of the format [ymin, xmin, ymax, xmax] in
- absolute image coordinates.
- standard_fields.DetectionResultFields.detection_scores: float32
- numpy array of shape [num_boxes] containing detection
- scores for the boxes.
- standard_fields.DetectionResultFields.detection_classes:
- integer numpy array of shape [num_boxes] containing
- 1-indexed detection classes for the boxes.
- standard_fields.DetectionResultFields.detection_masks: uint8
- numpy array of shape [num_boxes, height, width] containing
- `num_boxes` masks of values ranging between 0 and 1.
-
- Raises:
- ValueError: If detection masks are not in detections dictionary.
- """
- detection_classes = (
- detections_dict[
- standard_fields.DetectionResultFields.detection_classes] -
- self._label_id_offset)
- detection_masks = None
- if self._evaluate_masks:
- if (standard_fields.DetectionResultFields.detection_masks
- not in detections_dict):
- raise ValueError(
- 'Detection masks not in detections dictionary.')
- detection_masks = detections_dict[
- standard_fields.DetectionResultFields.detection_masks]
- self._evaluation.add_single_detected_image_info(
- image_key=image_id,
- detected_boxes=detections_dict[
- standard_fields.DetectionResultFields.detection_boxes],
- detected_scores=detections_dict[
- standard_fields.DetectionResultFields.detection_scores],
- detected_class_labels=detection_classes,
- detected_masks=detection_masks,
- )
-
- @staticmethod
- def create_category_index(categories):
- """Creates dictionary of COCO compatible categories keyed by category
- id.
-
- Args:
- categories: a list of dicts, each of which has the following keys:
- 'id': (required) an integer id uniquely identifying this
- category.
- 'name': (required) string representing category name
- e.g., 'cat', 'dog', 'pizza'.
-
- Returns:
- category_index: a dict containing the same entries as categories,
- but keyed by the 'id' field of each category.
- """
- category_index = {}
- for cat in categories:
- category_index[cat['id']] = cat
- return category_index
-
- def evaluate(self):
- """Compute evaluation result.
-
- Returns:
- A dictionary of metrics with the following fields -
-
- 1. summary_metrics:
- 'Precision/mAP@IOU': mean average
- precision at the specified IOU threshold
-
- 2. per_category_ap: category specific results with keys of the form
- 'PerformanceByCategory/mAP@IOU/category'
- """
- (per_class_ap, mean_ap, _, _, per_class_corloc,
- mean_corloc) = self._evaluation.evaluate()
-
- metric = f'mAP@{self._matching_iou_threshold}IOU'
- pascal_metrics = {self._metric_prefix + metric: mean_ap}
- if self._evaluate_corlocs:
- pascal_metrics[self._metric_prefix +
- 'Precision/meanCorLoc@{}IOU'.format(
- self._matching_iou_threshold)] = mean_corloc
- category_index = self.create_category_index(self._categories)
- for idx in range(per_class_ap.size):
- if idx + self._label_id_offset in category_index:
- display_name = (
- self._metric_prefix +
- 'PerformanceByCategory/AP@{}IOU/{}'.format(
- self._matching_iou_threshold,
- category_index[idx + self._label_id_offset]['name'],
- ))
- pascal_metrics[display_name] = per_class_ap[idx]
-
- # Optionally add CorLoc metrics.classes
- if self._evaluate_corlocs:
- display_name = (
- self._metric_prefix +
- 'PerformanceByCategory/CorLoc@{}IOU/{}'.format(
- self._matching_iou_threshold,
- category_index[idx +
- self._label_id_offset]['name'],
- ))
- pascal_metrics[display_name] = per_class_corloc[idx]
-
- return pascal_metrics
-
- def clear(self):
- """Clears the state to prepare for a fresh evaluation."""
- self._evaluation = ObjectDetectionEvaluation(
- num_groundtruth_classes=self._num_classes,
- matching_iou_threshold=self._matching_iou_threshold,
- use_weighted_mean_ap=self._use_weighted_mean_ap,
- label_id_offset=self._label_id_offset,
- )
- self._image_ids.clear()
-
-
-class PascalDetectionEvaluator(ObjectDetectionEvaluator):
- """A class to evaluate detections using PASCAL metrics."""
-
- def __init__(self, categories, matching_iou_threshold=0.5):
- super(PascalDetectionEvaluator, self).__init__(
- categories,
- matching_iou_threshold=matching_iou_threshold,
- evaluate_corlocs=False,
- use_weighted_mean_ap=False,
- )
-
-
-ObjectDetectionEvalMetrics = collections.namedtuple(
- 'ObjectDetectionEvalMetrics',
- [
- 'average_precisions',
- 'mean_ap',
- 'precisions',
- 'recalls',
- 'corlocs',
- 'mean_corloc',
- ],
-)
-
-
-class ObjectDetectionEvaluation:
- """Internal implementation of Pascal object detection metrics."""
-
- def __init__(self,
- num_groundtruth_classes,
- matching_iou_threshold=0.5,
- nms_iou_threshold=1.0,
- nms_max_output_boxes=10000,
- use_weighted_mean_ap=False,
- label_id_offset=0):
- if num_groundtruth_classes < 1:
- raise ValueError(
- 'Need at least 1 groundtruth class for evaluation.')
-
- self.per_image_eval = per_image_evaluation.PerImageEvaluation(
- num_groundtruth_classes=num_groundtruth_classes,
- matching_iou_threshold=matching_iou_threshold,
- )
- self.num_class = num_groundtruth_classes
- self.use_weighted_mean_ap = use_weighted_mean_ap
- self.label_id_offset = label_id_offset
-
- self.groundtruth_boxes = {}
- self.groundtruth_class_labels = {}
- self.groundtruth_masks = {}
- self.num_gt_instances_per_class = np.zeros(self.num_class, dtype=int)
- self.num_gt_imgs_per_class = np.zeros(self.num_class, dtype=int)
-
- self._initialize_detections()
-
- def _initialize_detections(self):
- self.detection_keys = set()
- self.scores_per_class = [[] for _ in range(self.num_class)]
- self.tp_fp_labels_per_class = [[] for _ in range(self.num_class)]
- self.num_images_correctly_detected_per_class = np.zeros(self.num_class)
- self.average_precision_per_class = np.empty(
- self.num_class, dtype=float)
- self.average_precision_per_class.fill(np.nan)
- self.precisions_per_class = []
- self.recalls_per_class = []
- self.corloc_per_class = np.ones(self.num_class, dtype=float)
-
- def clear_detections(self):
- self._initialize_detections()
-
- def add_single_ground_truth_image_info(self,
- image_key,
- groundtruth_boxes,
- groundtruth_class_labels,
- groundtruth_masks=None):
- """Adds groundtruth for a single image to be used for evaluation.
-
- Args:
- image_key: A unique string/integer identifier for the image.
- groundtruth_boxes: float32 numpy array of shape [num_boxes, 4]
- containing `num_boxes` groundtruth boxes of the format
- [ymin, xmin, ymax, xmax] in absolute image coordinates.
- groundtruth_class_labels: integer numpy array of shape [num_boxes]
- containing 0-indexed groundtruth classes for the boxes.
- groundtruth_masks: uint8 numpy array of shape
- [num_boxes, height, width] containing `num_boxes` groundtruth
- masks. The mask values range from 0 to 1.
- """
- if image_key in self.groundtruth_boxes:
- warnings.warn(('image %s has already been added to the ground '
- 'truth database.'), image_key)
- return
-
- self.groundtruth_boxes[image_key] = groundtruth_boxes
- self.groundtruth_class_labels[image_key] = groundtruth_class_labels
- self.groundtruth_masks[image_key] = groundtruth_masks
-
- self._update_ground_truth_statistics(groundtruth_class_labels)
-
- def add_single_detected_image_info(self,
- image_key,
- detected_boxes,
- detected_scores,
- detected_class_labels,
- detected_masks=None):
- """Adds detections for a single image to be used for evaluation.
-
- Args:
- image_key: A unique string/integer identifier for the image.
- detected_boxes: float32 numpy array of shape [num_boxes, 4]
- containing `num_boxes` detection boxes of the format
- [ymin, xmin, ymax, xmax] in absolute image coordinates.
- detected_scores: float32 numpy array of shape [num_boxes]
- containing detection scores for the boxes.
- detected_class_labels: integer numpy array of shape [num_boxes]
- containing 0-indexed detection classes for the boxes.
- detected_masks: np.uint8 numpy array of shape
- [num_boxes, height, width] containing `num_boxes` detection
- masks with values ranging between 0 and 1.
-
- Raises:
- ValueError: if the number of boxes, scores and class labels differ
- in length.
- """
- if len(detected_boxes) != len(detected_scores) or len(
- detected_boxes) != len(detected_class_labels):
- raise ValueError(
- 'detected_boxes, detected_scores and '
- 'detected_class_labels should all have same lengths. Got'
- '[%d, %d, %d]' % len(detected_boxes),
- len(detected_scores),
- len(detected_class_labels),
- )
-
- if image_key in self.detection_keys:
- warnings.warn(('image %s has already been added to the ground '
- 'truth database.'), image_key)
- return
-
- self.detection_keys.add(image_key)
- if image_key in self.groundtruth_boxes:
- groundtruth_boxes = self.groundtruth_boxes[image_key]
- groundtruth_class_labels = self.groundtruth_class_labels[image_key]
- # Masks are popped instead of look up. The reason is that we do not
- # want to keep all masks in memory which can cause memory overflow.
- groundtruth_masks = self.groundtruth_masks.pop(image_key)
- else:
- groundtruth_boxes = np.empty(shape=[0, 4], dtype=float)
- groundtruth_class_labels = np.array([], dtype=int)
- if detected_masks is None:
- groundtruth_masks = None
- else:
- groundtruth_masks = np.empty(shape=[0, 1, 1], dtype=float)
- (
- scores,
- tp_fp_labels,
- ) = self.per_image_eval.compute_object_detection_metrics(
- detected_boxes=detected_boxes,
- detected_scores=detected_scores,
- detected_class_labels=detected_class_labels,
- groundtruth_boxes=groundtruth_boxes,
- groundtruth_class_labels=groundtruth_class_labels,
- detected_masks=detected_masks,
- groundtruth_masks=groundtruth_masks,
- )
-
- for i in range(self.num_class):
- if scores[i].shape[0] > 0:
- self.scores_per_class[i].append(scores[i])
- self.tp_fp_labels_per_class[i].append(tp_fp_labels[i])
-
- def _update_ground_truth_statistics(self, groundtruth_class_labels):
- """Update grouth truth statitistics.
-
- Args:
- groundtruth_class_labels: An integer numpy array of length M,
- representing M class labels of object instances in ground truth
- """
- count = defaultdict(lambda: 0)
- for label in groundtruth_class_labels:
- count[label] += 1
- for k in count:
- self.num_gt_instances_per_class[k] += count[k]
- self.num_gt_imgs_per_class[k] += 1
-
- def evaluate(self):
- """Compute evaluation result.
-
- Returns:
- A named tuple with the following fields -
- average_precision: float numpy array of average precision for
- each class.
- mean_ap: mean average precision of all classes, float scalar
- precisions: List of precisions, each precision is a float numpy
- array
- recalls: List of recalls, each recall is a float numpy array
- corloc: numpy float array
- mean_corloc: Mean CorLoc score for each class, float scalar
- """
- if (self.num_gt_instances_per_class == 0).any():
- logging.info(
- 'The following classes have no ground truth examples: %s',
- np.squeeze(np.argwhere(self.num_gt_instances_per_class == 0)) +
- self.label_id_offset)
-
- if self.use_weighted_mean_ap:
- all_scores = np.array([], dtype=float)
- all_tp_fp_labels = np.array([], dtype=bool)
-
- for class_index in range(self.num_class):
- if self.num_gt_instances_per_class[class_index] == 0:
- continue
- if not self.scores_per_class[class_index]:
- scores = np.array([], dtype=float)
- tp_fp_labels = np.array([], dtype=bool)
- else:
- scores = np.concatenate(self.scores_per_class[class_index])
- tp_fp_labels = np.concatenate(
- self.tp_fp_labels_per_class[class_index])
- if self.use_weighted_mean_ap:
- all_scores = np.append(all_scores, scores)
- all_tp_fp_labels = np.append(all_tp_fp_labels, tp_fp_labels)
- precision, recall = metrics.compute_precision_recall(
- scores, tp_fp_labels,
- self.num_gt_instances_per_class[class_index])
- self.precisions_per_class.append(precision)
- self.recalls_per_class.append(recall)
- average_precision = metrics.compute_average_precision(
- precision, recall)
- self.average_precision_per_class[class_index] = average_precision
-
- self.corloc_per_class = metrics.compute_cor_loc(
- self.num_gt_imgs_per_class,
- self.num_images_correctly_detected_per_class)
-
- if self.use_weighted_mean_ap:
- num_gt_instances = np.sum(self.num_gt_instances_per_class)
- precision, recall = metrics.compute_precision_recall(
- all_scores, all_tp_fp_labels, num_gt_instances)
- mean_ap = metrics.compute_average_precision(precision, recall)
- else:
- mean_ap = np.nanmean(self.average_precision_per_class)
- mean_corloc = np.nanmean(self.corloc_per_class)
- return ObjectDetectionEvalMetrics(
- self.average_precision_per_class,
- mean_ap,
- self.precisions_per_class,
- self.recalls_per_class,
- self.corloc_per_class,
- mean_corloc,
- )
diff --git a/mmaction/evaluation/functional/ava_evaluation/per_image_evaluation.py b/mmaction/evaluation/functional/ava_evaluation/per_image_evaluation.py
deleted file mode 100644
index 9a6e0d9e40..0000000000
--- a/mmaction/evaluation/functional/ava_evaluation/per_image_evaluation.py
+++ /dev/null
@@ -1,358 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-"""Evaluate Object Detection result on a single image.
-
-Annotate each detected result as true positives or false positive according to
-a predefined IOU ratio. Non Maximum Suppression is used by default. Multi class
-detection is supported by default. Based on the settings, per image evaluation
-is either performed on boxes or on object masks.
-"""
-
-import numpy as np
-
-from . import np_box_list, np_box_ops
-
-
-class PerImageEvaluation:
- """Evaluate detection result of a single image."""
-
- def __init__(self, num_groundtruth_classes, matching_iou_threshold=0.5):
- """Initialized PerImageEvaluation by evaluation parameters.
-
- Args:
- num_groundtruth_classes: Number of ground truth object classes
- matching_iou_threshold: A ratio of area intersection to union,
- which is the threshold to consider whether a detection is true
- positive or not
- """
- self.matching_iou_threshold = matching_iou_threshold
- self.num_groundtruth_classes = num_groundtruth_classes
-
- def compute_object_detection_metrics(self,
- detected_boxes,
- detected_scores,
- detected_class_labels,
- groundtruth_boxes,
- groundtruth_class_labels,
- detected_masks=None,
- groundtruth_masks=None):
- """Evaluates detections as being tp, fp or ignored from a single image.
-
- The evaluation is done in two stages:
- 1. All detections are matched to non group-of boxes.
-
- Args:
- detected_boxes: A float numpy array of shape [N, 4], representing N
- regions of detected object regions.
- Each row is of the format [y_min, x_min, y_max, x_max]
- detected_scores: A float numpy array of shape [N, 1], representing
- the confidence scores of the detected N object instances.
- detected_class_labels: A integer numpy array of shape [N, 1],
- repreneting the class labels of the detected N object
- instances.
- groundtruth_boxes: A float numpy array of shape [M, 4],
- representing M regions of object instances in ground truth
- groundtruth_class_labels: An integer numpy array of shape [M, 1],
- representing M class labels of object instances in ground truth
- detected_masks: (optional) A uint8 numpy array of shape
- [N, height, width]. If not None, the metrics will be computed
- based on masks.
- groundtruth_masks: (optional) A uint8 numpy array of shape
- [M, height, width].
-
- Returns:
- scores: A list of C float numpy arrays. Each numpy array is of
- shape [K, 1], representing K scores detected with object class
- label c
- tp_fp_labels: A list of C boolean numpy arrays. Each numpy array
- is of shape [K, 1], representing K True/False positive label of
- object instances detected with class label c
- """
- (
- detected_boxes,
- detected_scores,
- detected_class_labels,
- detected_masks,
- ) = self._remove_invalid_boxes(
- detected_boxes,
- detected_scores,
- detected_class_labels,
- detected_masks,
- )
- scores, tp_fp_labels = self._compute_tp_fp(
- detected_boxes=detected_boxes,
- detected_scores=detected_scores,
- detected_class_labels=detected_class_labels,
- groundtruth_boxes=groundtruth_boxes,
- groundtruth_class_labels=groundtruth_class_labels,
- detected_masks=detected_masks,
- groundtruth_masks=groundtruth_masks,
- )
-
- return scores, tp_fp_labels
-
- def _compute_tp_fp(self,
- detected_boxes,
- detected_scores,
- detected_class_labels,
- groundtruth_boxes,
- groundtruth_class_labels,
- detected_masks=None,
- groundtruth_masks=None):
- """Labels true/false positives of detections of an image across all
- classes.
-
- Args:
- detected_boxes: A float numpy array of shape [N, 4], representing N
- regions of detected object regions.
- Each row is of the format [y_min, x_min, y_max, x_max]
- detected_scores: A float numpy array of shape [N, 1], representing
- the confidence scores of the detected N object instances.
- detected_class_labels: A integer numpy array of shape [N, 1],
- repreneting the class labels of the detected N object
- instances.
- groundtruth_boxes: A float numpy array of shape [M, 4],
- representing M regions of object instances in ground truth
- groundtruth_class_labels: An integer numpy array of shape [M, 1],
- representing M class labels of object instances in ground truth
- detected_masks: (optional) A np.uint8 numpy array of shape
- [N, height, width]. If not None, the scores will be computed
- based on masks.
- groundtruth_masks: (optional) A np.uint8 numpy array of shape
- [M, height, width].
-
- Returns:
- result_scores: A list of float numpy arrays. Each numpy array is of
- shape [K, 1], representing K scores detected with object class
- label c
- result_tp_fp_labels: A list of boolean numpy array. Each numpy
- array is of shape [K, 1], representing K True/False positive
- label of object instances detected with class label c
-
- Raises:
- ValueError: If detected masks is not None but groundtruth masks are
- None, or the other way around.
- """
- if detected_masks is not None and groundtruth_masks is None:
- raise ValueError(
- 'Detected masks is available but groundtruth masks is not.')
- if detected_masks is None and groundtruth_masks is not None:
- raise ValueError(
- 'Groundtruth masks is available but detected masks is not.')
-
- result_scores = []
- result_tp_fp_labels = []
- for i in range(self.num_groundtruth_classes):
- (gt_boxes_at_ith_class, gt_masks_at_ith_class,
- detected_boxes_at_ith_class, detected_scores_at_ith_class,
- detected_masks_at_ith_class) = self._get_ith_class_arrays(
- detected_boxes, detected_scores, detected_masks,
- detected_class_labels, groundtruth_boxes, groundtruth_masks,
- groundtruth_class_labels, i)
- scores, tp_fp_labels = self._compute_tp_fp_for_single_class(
- detected_boxes=detected_boxes_at_ith_class,
- detected_scores=detected_scores_at_ith_class,
- groundtruth_boxes=gt_boxes_at_ith_class,
- detected_masks=detected_masks_at_ith_class,
- groundtruth_masks=gt_masks_at_ith_class,
- )
- result_scores.append(scores)
- result_tp_fp_labels.append(tp_fp_labels)
- return result_scores, result_tp_fp_labels
-
- @staticmethod
- def _get_overlaps_and_scores_box_mode(detected_boxes, detected_scores,
- groundtruth_boxes):
- """Computes overlaps and scores between detected and groudntruth boxes.
-
- Args:
- detected_boxes: A numpy array of shape [N, 4] representing detected
- box coordinates
- detected_scores: A 1-d numpy array of length N representing
- classification score
- groundtruth_boxes: A numpy array of shape [M, 4] representing
- ground truth box coordinates
-
- Returns:
- iou: A float numpy array of size [num_detected_boxes,
- num_gt_boxes]. If gt_non_group_of_boxlist.num_boxes() == 0 it
- will be None.
- ioa: A float numpy array of size [num_detected_boxes,
- num_gt_boxes]. If gt_group_of_boxlist.num_boxes() == 0 it will
- be None.
- scores: The score of the detected boxlist.
- num_boxes: Number of non-maximum suppressed detected boxes.
- """
- detected_boxlist = np_box_list.BoxList(detected_boxes)
- detected_boxlist.add_field('scores', detected_scores)
- gt_non_group_of_boxlist = np_box_list.BoxList(groundtruth_boxes)
-
- iou = np_box_ops.iou(detected_boxlist.get(),
- gt_non_group_of_boxlist.get())
- scores = detected_boxlist.get_field('scores')
- num_boxes = detected_boxlist.num_boxes()
- return iou, None, scores, num_boxes
-
- def _compute_tp_fp_for_single_class(self,
- detected_boxes,
- detected_scores,
- groundtruth_boxes,
- detected_masks=None,
- groundtruth_masks=None):
- """Labels boxes detected with the same class from the same image as
- tp/fp.
-
- Args:
- detected_boxes: A numpy array of shape [N, 4] representing detected
- box coordinates
- detected_scores: A 1-d numpy array of length N representing
- classification score
- groundtruth_boxes: A numpy array of shape [M, 4] representing
- groundtruth box coordinates
- detected_masks: (optional) A uint8 numpy array of shape
- [N, height, width]. If not None, the scores will be computed
- based on masks.
- groundtruth_masks: (optional) A uint8 numpy array of shape
- [M, height, width].
-
- Returns:
- Two arrays of the same size, containing all boxes that were
- evaluated as being true positives or false positives.
-
- scores: A numpy array representing the detection scores.
- tp_fp_labels: a boolean numpy array indicating whether a detection
- is a true positive.
- """
- if detected_boxes.size == 0:
- return np.array([], dtype=float), np.array([], dtype=bool)
-
- (iou, _, scores,
- num_detected_boxes) = self._get_overlaps_and_scores_box_mode(
- detected_boxes=detected_boxes,
- detected_scores=detected_scores,
- groundtruth_boxes=groundtruth_boxes)
-
- if groundtruth_boxes.size == 0:
- return scores, np.zeros(num_detected_boxes, dtype=bool)
-
- tp_fp_labels = np.zeros(num_detected_boxes, dtype=bool)
-
- # The evaluation is done in two stages:
- # 1. All detections are matched to non group-of boxes.
- # 2. Detections that are determined as false positives are matched
- # against group-of boxes and ignored if matched.
-
- # Tp-fp evaluation for non-group of boxes (if any).
- if iou.shape[1] > 0:
- max_overlap_gt_ids = np.argmax(iou, axis=1)
- is_gt_box_detected = np.zeros(iou.shape[1], dtype=bool)
- for i in range(num_detected_boxes):
- gt_id = max_overlap_gt_ids[i]
- if iou[i, gt_id] >= self.matching_iou_threshold:
- if not is_gt_box_detected[gt_id]:
- tp_fp_labels[i] = True
- is_gt_box_detected[gt_id] = True
-
- return scores, tp_fp_labels
-
- @staticmethod
- def _get_ith_class_arrays(detected_boxes, detected_scores, detected_masks,
- detected_class_labels, groundtruth_boxes,
- groundtruth_masks, groundtruth_class_labels,
- class_index):
- """Returns numpy arrays belonging to class with index `class_index`.
-
- Args:
- detected_boxes: A numpy array containing detected boxes.
- detected_scores: A numpy array containing detected scores.
- detected_masks: A numpy array containing detected masks.
- detected_class_labels: A numpy array containing detected class
- labels.
- groundtruth_boxes: A numpy array containing groundtruth boxes.
- groundtruth_masks: A numpy array containing groundtruth masks.
- groundtruth_class_labels: A numpy array containing groundtruth
- class labels.
- class_index: An integer index.
-
- Returns:
- gt_boxes_at_ith_class: A numpy array containing groundtruth boxes
- labeled as ith class.
- gt_masks_at_ith_class: A numpy array containing groundtruth masks
- labeled as ith class.
- detected_boxes_at_ith_class: A numpy array containing detected
- boxes corresponding to the ith class.
- detected_scores_at_ith_class: A numpy array containing detected
- scores corresponding to the ith class.
- detected_masks_at_ith_class: A numpy array containing detected
- masks corresponding to the ith class.
- """
- selected_groundtruth = groundtruth_class_labels == class_index
- gt_boxes_at_ith_class = groundtruth_boxes[selected_groundtruth]
- if groundtruth_masks is not None:
- gt_masks_at_ith_class = groundtruth_masks[selected_groundtruth]
- else:
- gt_masks_at_ith_class = None
- selected_detections = detected_class_labels == class_index
- detected_boxes_at_ith_class = detected_boxes[selected_detections]
- detected_scores_at_ith_class = detected_scores[selected_detections]
- if detected_masks is not None:
- detected_masks_at_ith_class = detected_masks[selected_detections]
- else:
- detected_masks_at_ith_class = None
- return (gt_boxes_at_ith_class, gt_masks_at_ith_class,
- detected_boxes_at_ith_class, detected_scores_at_ith_class,
- detected_masks_at_ith_class)
-
- @staticmethod
- def _remove_invalid_boxes(detected_boxes,
- detected_scores,
- detected_class_labels,
- detected_masks=None):
- """Removes entries with invalid boxes.
-
- A box is invalid if either its xmax is smaller than its xmin, or its
- ymax is smaller than its ymin.
-
- Args:
- detected_boxes: A float numpy array of size [num_boxes, 4]
- containing box coordinates in [ymin, xmin, ymax, xmax] format.
- detected_scores: A float numpy array of size [num_boxes].
- detected_class_labels: A int32 numpy array of size [num_boxes].
- detected_masks: A uint8 numpy array of size
- [num_boxes, height, width].
-
- Returns:
- valid_detected_boxes: A float numpy array of size
- [num_valid_boxes, 4] containing box coordinates in
- [ymin, xmin, ymax, xmax] format.
- valid_detected_scores: A float numpy array of size
- [num_valid_boxes].
- valid_detected_class_labels: A int32 numpy array of size
- [num_valid_boxes].
- valid_detected_masks: A uint8 numpy array of size
- [num_valid_boxes, height, width].
- """
- valid_indices = np.logical_and(
- detected_boxes[:, 0] < detected_boxes[:, 2],
- detected_boxes[:, 1] < detected_boxes[:, 3])
- detected_boxes = detected_boxes[valid_indices]
- detected_scores = detected_scores[valid_indices]
- detected_class_labels = detected_class_labels[valid_indices]
- if detected_masks is not None:
- detected_masks = detected_masks[valid_indices]
- return [
- detected_boxes, detected_scores, detected_class_labels,
- detected_masks
- ]
diff --git a/mmaction/evaluation/functional/ava_evaluation/standard_fields.py b/mmaction/evaluation/functional/ava_evaluation/standard_fields.py
deleted file mode 100644
index 8edf46d081..0000000000
--- a/mmaction/evaluation/functional/ava_evaluation/standard_fields.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-"""Contains classes specifying naming conventions used for object detection.
-
-Specifies:
- InputDataFields: standard fields used by reader/preprocessor/batcher.
- DetectionResultFields: standard fields returned by object detector.
-"""
-
-
-class InputDataFields:
- """Names for the input tensors.
-
- Holds the standard data field names to use for identifying input tensors.
- This should be used by the decoder to identify keys for the returned
- tensor_dict containing input tensors. And it should be used by the model to
- identify the tensors it needs.
-
- Attributes:
- image: image.
- original_image: image in the original input size.
- key: unique key corresponding to image.
- source_id: source of the original image.
- filename: original filename of the dataset (without common path).
- groundtruth_image_classes: image-level class labels.
- groundtruth_boxes: coordinates of the ground truth boxes in the image.
- groundtruth_classes: box-level class labels.
- groundtruth_label_types: box-level label types (e.g. explicit
- negative).
- groundtruth_is_crowd: [DEPRECATED, use groundtruth_group_of instead]
- is the groundtruth a single object or a crowd.
- groundtruth_area: area of a groundtruth segment.
- groundtruth_difficult: is a `difficult` object
- groundtruth_group_of: is a `group_of` objects, e.g. multiple objects of
- the same class, forming a connected group, where instances are
- heavily occluding each other.
- proposal_boxes: coordinates of object proposal boxes.
- proposal_objectness: objectness score of each proposal.
- groundtruth_instance_masks: ground truth instance masks.
- groundtruth_instance_boundaries: ground truth instance boundaries.
- groundtruth_instance_classes: instance mask-level class labels.
- groundtruth_keypoints: ground truth keypoints.
- groundtruth_keypoint_visibilities: ground truth keypoint visibilities.
- groundtruth_label_scores: groundtruth label scores.
- groundtruth_weights: groundtruth weight factor for bounding boxes.
- num_groundtruth_boxes: number of groundtruth boxes.
- true_image_shapes: true shapes of images in the resized images, as
- resized images can be padded with zeros.
- """
-
- image = 'image'
- original_image = 'original_image'
- key = 'key'
- source_id = 'source_id'
- filename = 'filename'
- groundtruth_image_classes = 'groundtruth_image_classes'
- groundtruth_boxes = 'groundtruth_boxes'
- groundtruth_classes = 'groundtruth_classes'
- groundtruth_label_types = 'groundtruth_label_types'
- groundtruth_is_crowd = 'groundtruth_is_crowd'
- groundtruth_area = 'groundtruth_area'
- groundtruth_difficult = 'groundtruth_difficult'
- groundtruth_group_of = 'groundtruth_group_of'
- proposal_boxes = 'proposal_boxes'
- proposal_objectness = 'proposal_objectness'
- groundtruth_instance_masks = 'groundtruth_instance_masks'
- groundtruth_instance_boundaries = 'groundtruth_instance_boundaries'
- groundtruth_instance_classes = 'groundtruth_instance_classes'
- groundtruth_keypoints = 'groundtruth_keypoints'
- groundtruth_keypoint_visibilities = 'groundtruth_keypoint_visibilities'
- groundtruth_label_scores = 'groundtruth_label_scores'
- groundtruth_weights = 'groundtruth_weights'
- num_groundtruth_boxes = 'num_groundtruth_boxes'
- true_image_shape = 'true_image_shape'
-
-
-class DetectionResultFields:
- """Naming conventions for storing the output of the detector.
-
- Attributes:
- source_id: source of the original image.
- key: unique key corresponding to image.
- detection_boxes: coordinates of the detection boxes in the image.
- detection_scores: detection scores for the detection boxes in the
- image.
- detection_classes: detection-level class labels.
- detection_masks: contains a segmentation mask for each detection box.
- detection_boundaries: contains an object boundary for each detection
- box.
- detection_keypoints: contains detection keypoints for each detection
- box.
- num_detections: number of detections in the batch.
- """
-
- source_id = 'source_id'
- key = 'key'
- detection_boxes = 'detection_boxes'
- detection_scores = 'detection_scores'
- detection_classes = 'detection_classes'
- detection_masks = 'detection_masks'
- detection_boundaries = 'detection_boundaries'
- detection_keypoints = 'detection_keypoints'
- num_detections = 'num_detections'
diff --git a/mmaction/evaluation/functional/ava_utils.py b/mmaction/evaluation/functional/ava_utils.py
index cb739a4a9b..c15737632c 100644
--- a/mmaction/evaluation/functional/ava_utils.py
+++ b/mmaction/evaluation/functional/ava_utils.py
@@ -3,14 +3,13 @@
# https://github.com/activitynet/ActivityNet/blob/master/
# Evaluation/get_ava_performance.py. Some unused codes are removed.
import csv
-import logging
+import multiprocessing
import time
from collections import defaultdict
import numpy as np
-from .ava_evaluation import object_detection_evaluation as det_eval
-from .ava_evaluation import standard_fields
+from .ava_evaluation import metrics, np_box_list, np_box_ops
def det2csv(results, custom_classes):
@@ -42,7 +41,7 @@ def results2csv(results, out_file, custom_classes=None):
# save space for float
def to_str(item):
if isinstance(item, float):
- return f'{item:.3f}'
+ return f'{item:.4f}'
return str(item)
with open(out_file, 'w') as f:
@@ -80,7 +79,6 @@ def read_csv(csv_file, class_whitelist=None):
of score values labels, matching the corresponding label in `labels`.
If scores are not provided in the csv, then they will default to 1.0.
"""
- start = time.time()
entries = defaultdict(list)
boxes = defaultdict(list)
labels = defaultdict(list)
@@ -107,7 +105,6 @@ def read_csv(csv_file, class_whitelist=None):
labels[image_key] = [x[1] for x in entry]
scores[image_key] = [x[0] for x in entry]
- print_time('read file ' + csv_file.name, start)
return boxes, labels, scores
@@ -157,6 +154,51 @@ def read_labelmap(labelmap_file):
return labelmap, class_ids
+def get_overlaps_and_scores_box_mode(detected_boxes, detected_scores,
+ groundtruth_boxes):
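+ """Compute IoU between detected and groundtruth boxes; returns (iou, scores, num_boxes)."""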
+
+ detected_boxlist = np_box_list.BoxList(detected_boxes)
+ detected_boxlist.add_field('scores', detected_scores)
+ gt_non_group_of_boxlist = np_box_list.BoxList(groundtruth_boxes)
+
+ iou = np_box_ops.iou(detected_boxlist.get(), gt_non_group_of_boxlist.get())
+ scores = detected_boxlist.get_field('scores')
+ num_boxes = detected_boxlist.num_boxes()
+ return iou, scores, num_boxes
+
+
+def tpfp_single(tup, threshold=0.5):
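+ """Compute per-class detection scores and TP/FP labels for one frame at the given IoU threshold."""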
+ gt_bboxes, gt_labels, bboxes, labels, scores = tup
+ ret_scores, ret_tp_fp_labels = dict(), dict()
+ all_labels = list(set(labels))
+ for label in all_labels:
+ gt_bbox = np.array(
+ [x for x, y in zip(gt_bboxes, gt_labels) if y == label],
+ dtype=np.float32).reshape(-1, 4)
+ bbox = np.array([x for x, y in zip(bboxes, labels) if y == label],
+ dtype=np.float32).reshape(-1, 4)
+ score = np.array([x for x, y in zip(scores, labels) if y == label],
+ dtype=np.float32).reshape(-1)
+ iou, score, num_boxes = get_overlaps_and_scores_box_mode(
+ bbox, score, gt_bbox)
+ if gt_bbox.size == 0:
+ ret_scores[label] = score
+ ret_tp_fp_labels[label] = np.zeros(num_boxes, dtype=bool)
+ continue
+ tp_fp_labels = np.zeros(num_boxes, dtype=bool)
+ if iou.shape[1] > 0:
+ max_overlap_gt_ids = np.argmax(iou, axis=1)
+ is_gt_box_detected = np.zeros(iou.shape[1], dtype=bool)
+ for i in range(num_boxes):
+ gt_id = max_overlap_gt_ids[i]
+ if iou[i, gt_id] >= threshold:
+ if not is_gt_box_detected[gt_id]:
+ tp_fp_labels[i] = True
+ is_gt_box_detected[gt_id] = True
+ ret_scores[label], ret_tp_fp_labels[label] = score, tp_fp_labels
+ return ret_scores, ret_tp_fp_labels
+
+
# Seems there is at most 100 detections for each image
def ava_eval(result_file,
result_type,
@@ -164,10 +206,11 @@ def ava_eval(result_file,
ann_file,
exclude_file,
verbose=True,
+ ignore_empty_frames=True,
custom_classes=None):
"""Perform ava evaluation."""
- assert result_type in ['mAP']
+ assert result_type in ['mAP']
start = time.time()
categories, class_whitelist = read_labelmap(open(label_file))
if custom_classes is not None:
@@ -177,9 +220,9 @@ def ava_eval(result_file,
categories = [cat for cat in categories if cat['id'] in custom_classes]
# loading gt, do not need gt score
- gt_boxes, gt_labels, _ = read_csv(open(ann_file), class_whitelist)
+ gt_bboxes, gt_labels, _ = read_csv(open(ann_file), class_whitelist)
if verbose:
- print_time('Reading detection results', start)
+ print_time('Reading GT results', start)
if exclude_file is not None:
excluded_keys = read_exclusions(open(exclude_file))
@@ -189,54 +232,69 @@ def ava_eval(result_file,
start = time.time()
boxes, labels, scores = read_csv(open(result_file), class_whitelist)
if verbose:
- print_time('Reading detection results', start)
-
- # Evaluation for mAP
- pascal_evaluator = det_eval.PascalDetectionEvaluator(categories)
+ print_time('Reading Detection results', start)
start = time.time()
- for image_key in gt_boxes:
- if verbose and image_key in excluded_keys:
- logging.info(
- 'Found excluded timestamp in detections: %s.'
- 'It will be ignored.', image_key)
- continue
- pascal_evaluator.add_single_ground_truth_image_info(
- image_key, {
- standard_fields.InputDataFields.groundtruth_boxes:
- np.array(gt_boxes[image_key], dtype=float),
- standard_fields.InputDataFields.groundtruth_classes:
- np.array(gt_labels[image_key], dtype=int)
- })
+ all_gt_labels = np.concatenate(list(gt_labels.values()))
+ gt_count = {k: np.sum(all_gt_labels == k) for k in class_whitelist}
+
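+ # Compute per-frame TP/FP labels in parallel with a pool of 32 worker processes.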
+ pool = multiprocessing.Pool(32)
+ if ignore_empty_frames:
+ tups = [(gt_bboxes[k], gt_labels[k], boxes[k], labels[k], scores[k])
+ for k in gt_bboxes if k not in excluded_keys]
+ else:
+ tups = [(gt_bboxes.get(k, np.zeros((0, 4), dtype=np.float32)),
+ gt_labels.get(k, []), boxes[k], labels[k], scores[k])
+ for k in boxes if k not in excluded_keys]
+ rets = pool.map(tpfp_single, tups)
+
if verbose:
- print_time('Convert groundtruth', start)
+ print_time('Calculating TP/FP', start)
start = time.time()
- for image_key in boxes:
- if verbose and image_key in excluded_keys:
- logging.info(
- 'Found excluded timestamp in detections: %s.'
- 'It will be ignored.', image_key)
- continue
- pascal_evaluator.add_single_detected_image_info(
- image_key, {
- standard_fields.DetectionResultFields.detection_boxes:
- np.array(boxes[image_key], dtype=float),
- standard_fields.DetectionResultFields.detection_classes:
- np.array(labels[image_key], dtype=int),
- standard_fields.DetectionResultFields.detection_scores:
- np.array(scores[image_key], dtype=float)
- })
+ scores, tpfps = defaultdict(list), defaultdict(list)
+ for score, tpfp in rets:
+ for k in score:
+ scores[k].append(score[k])
+ tpfps[k].append(tpfp[k])
+
+ cls_AP = []
+ for k in scores:
+ scores[k] = np.concatenate(scores[k])
+ tpfps[k] = np.concatenate(tpfps[k])
+ precision, recall = metrics.compute_precision_recall(
+ scores[k], tpfps[k], gt_count[k])
+ ap = metrics.compute_average_precision(precision, recall)
+ class_name = [x['name'] for x in categories if x['id'] == k]
+ assert len(class_name) == 1
+ class_name = class_name[0]
+ cls_AP.append((k, class_name, ap))
if verbose:
- print_time('convert detections', start)
+ print_time('Run Evaluator', start)
+
+ print('Per-class results: ', flush=True)
+ for k, class_name, ap in cls_AP:
+ print(f'Index: {k}, Action: {class_name}: AP: {ap:.4f};', flush=True)
+
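+ # Aggregate per-class APs into the AVA category groups encoded by the class ids:
+ # person movement (ids <= 14), object manipulation (15-63), person interaction (>= 64).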
+ overall = np.nanmean([x[2] for x in cls_AP])
+ person_movement = np.nanmean([x[2] for x in cls_AP if x[0] <= 14])
+ object_manipulation = np.nanmean([x[2] for x in cls_AP if 14 < x[0] < 64])
+ person_interaction = np.nanmean([x[2] for x in cls_AP if 64 <= x[0]])
+
+ print('Overall Results: ', flush=True)
+ print(f'Overall mAP: {overall:.4f}', flush=True)
+ print(f'Person Movement mAP: {person_movement:.4f}', flush=True)
+ print(f'Object Manipulation mAP: {object_manipulation:.4f}', flush=True)
+ print(f'Person Interaction mAP: {person_interaction:.4f}', flush=True)
+
+ results = {}
+ results['overall'] = overall
+ results['person_movement'] = person_movement
+ results['object_manipulation'] = object_manipulation
+ results['person_interaction'] = person_interaction
- start = time.time()
- metrics = pascal_evaluator.evaluate()
if verbose:
- print_time('run_evaluator', start)
- for display_name in metrics:
- print(f'{display_name}=\t{metrics[display_name]}')
- return {
- display_name: metrics[display_name]
- for display_name in metrics if 'ByCategory' not in display_name
- }
+ for k, class_name, ap in cls_AP:
+ print(f'Class {class_name} AP: {ap:.4f}', flush=True)
+
+ return results
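For reference, a minimal NumPy sketch of the precision/recall and VOC-style average-precision computation that `metrics.compute_precision_recall` and `metrics.compute_average_precision` are assumed to perform on the concatenated per-class scores and TP/FP flags above (the actual helpers in `ava_evaluation/metrics.py` may differ in detail):

    import numpy as np

    def precision_recall(scores, tp_fp, num_gt):
        # Sort detections by descending confidence, then accumulate TP/FP counts.
        order = np.argsort(-scores)
        tp = np.cumsum(tp_fp[order].astype(float))
        fp = np.cumsum((~tp_fp[order]).astype(float))
        precision = tp / np.maximum(tp + fp, np.finfo(float).eps)
        recall = tp / max(num_gt, 1)
        return precision, recall

    def average_precision(precision, recall):
        # Area under the precision-recall curve, using a monotonically
        # non-increasing precision envelope (VOC-style integration).
        mrec = np.concatenate(([0.0], recall, [1.0]))
        mpre = np.concatenate(([0.0], precision, [0.0]))
        for i in range(len(mpre) - 2, -1, -1):
            mpre[i] = max(mpre[i], mpre[i + 1])
        changed = np.where(mrec[1:] != mrec[:-1])[0]
        return float(np.sum((mrec[changed + 1] - mrec[changed]) * mpre[changed + 1]))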
diff --git a/mmaction/evaluation/functional/eval_detection.py b/mmaction/evaluation/functional/eval_detection.py
index 2af3ada0db..b081d52b9b 100644
--- a/mmaction/evaluation/functional/eval_detection.py
+++ b/mmaction/evaluation/functional/eval_detection.py
@@ -220,8 +220,8 @@ def compute_average_precision_detection(ground_truth,
if fp[t_idx, idx] == 0 and tp[t_idx, idx] == 0:
fp[t_idx, idx] = 1
- tp_cumsum = np.cumsum(tp, axis=1).astype(np.float)
- fp_cumsum = np.cumsum(fp, axis=1).astype(np.float)
+ tp_cumsum = np.cumsum(tp, axis=1).astype(np.float64)
+ fp_cumsum = np.cumsum(fp, axis=1).astype(np.float64)
recall_cumsum = tp_cumsum / num_positive
precision_cumsum = tp_cumsum / (tp_cumsum + fp_cumsum)
diff --git a/mmaction/evaluation/metrics/__init__.py b/mmaction/evaluation/metrics/__init__.py
index 46988d39c1..0493dae036 100644
--- a/mmaction/evaluation/metrics/__init__.py
+++ b/mmaction/evaluation/metrics/__init__.py
@@ -1,6 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.
-from .acc_metric import AccMetric
+from .acc_metric import AccMetric, ConfusionMatrix
from .anet_metric import ANetMetric
from .ava_metric import AVAMetric
-__all__ = ['AccMetric', 'AVAMetric', 'ANetMetric']
+__all__ = ['AccMetric', 'AVAMetric', 'ANetMetric', 'ConfusionMatrix']
diff --git a/mmaction/evaluation/metrics/acc_metric.py b/mmaction/evaluation/metrics/acc_metric.py
index 488e28aa14..512b089327 100644
--- a/mmaction/evaluation/metrics/acc_metric.py
+++ b/mmaction/evaluation/metrics/acc_metric.py
@@ -1,16 +1,31 @@
# Copyright (c) OpenMMLab. All rights reserved.
import copy
from collections import OrderedDict
-from typing import Any, Optional, Sequence, Tuple, Union
+from itertools import product
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+import mmengine
import numpy as np
+import torch
from mmengine.evaluator import BaseMetric
-from mmaction.evaluation import (mean_average_precision, mean_class_accuracy,
+from mmaction.evaluation import (get_weighted_score, mean_average_precision,
+ mean_class_accuracy,
mmit_mean_average_precision, top_k_accuracy)
from mmaction.registry import METRICS
+def to_tensor(value):
+ """Convert value to torch.Tensor."""
+ if isinstance(value, np.ndarray):
+ value = torch.from_numpy(value)
+ elif isinstance(value, Sequence) and not mmengine.is_str(value):
+ value = torch.tensor(value)
+ elif not isinstance(value, torch.Tensor):
+ raise TypeError(f'{type(value)} is not an available argument.')
+ return value
+
+
@METRICS.register_module()
class AccMetric(BaseMetric):
"""Accuracy evaluation metric."""
@@ -22,7 +37,7 @@ def __init__(
Tuple[str]]] = ('top_k_accuracy',
'mean_class_accuracy'),
collect_device: str = 'cpu',
- metric_options: Optional[dict] = dict(
+ metric_options: Optional[Dict] = dict(
top_k_accuracy=dict(topk=(1, 5))),
prefix: Optional[str] = None,
num_classes: Optional[int] = None):
@@ -56,38 +71,84 @@ def __init__(
self.metric_options = metric_options
self.num_classes = num_classes
- def process(self, data_batch: Sequence[Tuple[Any, dict]],
- data_samples: Sequence[dict]) -> None:
+ def process(self, data_batch: Sequence[Tuple[Any, Dict]],
+ data_samples: Sequence[Dict]) -> None:
"""Process one batch of data samples and data_samples. The processed
results should be stored in ``self.results``, which will be used to
compute the metrics when all batches have been processed.
Args:
- data_batch (Sequence[Tuple[Any, dict]]): A batch of data
- from the dataloader.
- data_samples (Sequence[dict]): A batch of outputs from
- the model.
+ data_batch (Sequence[dict]): A batch of data from the dataloader.
+ data_samples (Sequence[dict]): A batch of outputs from the model.
"""
+ data_samples = copy.deepcopy(data_samples)
for data_sample in data_samples:
result = dict()
pred = data_sample['pred_scores']
label = data_sample['gt_labels']
- result['pred'] = pred['item'].cpu().numpy()
+ for item_name, score in pred.items():
+ pred[item_name] = score.cpu().numpy()
+ result['pred'] = pred
result['label'] = label['item'].item()
self.results.append(result)
- def compute_metrics(self, results: list) -> dict:
+ def compute_metrics(self, results: List) -> Dict:
"""Compute the metrics from processed results.
Args:
results (list): The processed results of each batch.
+
Returns:
dict: The computed metrics. The keys are the names of the metrics,
and the values are corresponding results.
"""
- preds = [x['pred'] for x in results]
labels = [x['label'] for x in results]
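+ # With a single prediction head, evaluate it directly; otherwise evaluate each head below and prefix its metrics with the head name.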
+ if len(results[0]['pred']) == 1:
+ preds = [x['pred']['item'] for x in results]
+ return self.calculate(preds, labels)
+
+ eval_results = dict()
+ for item_name in results[0]['pred'].keys():
+ preds = [x['pred'][item_name] for x in results]
+ eval_result = self.calculate(preds, labels)
+ eval_results.update(
+ {f'{item_name}_{k}': v
+ for k, v in eval_result.items()})
+
+ # Ad-hoc for RGBPoseConv3D
+ if len(results[0]['pred']) == 2 and \
+ 'rgb' in results[0]['pred'] and \
+ 'pose' in results[0]['pred']:
+
+ rgb = [x['pred']['rgb'] for x in results]
+ pose = [x['pred']['pose'] for x in results]
+
+ preds = {
+ '1:1': get_weighted_score([rgb, pose], [1, 1]),
+ '2:1': get_weighted_score([rgb, pose], [2, 1]),
+ '1:2': get_weighted_score([rgb, pose], [1, 2])
+ }
+ for k in preds:
+ eval_result = self.calculate(preds[k], labels)
+ eval_results.update({
+ f'RGBPose_{k}_{key}': v
+ for key, v in eval_result.items()
+ })
+
+ return eval_results
+
+ def calculate(self, preds: List[np.ndarray], labels: List[int]) -> Dict:
+ """Compute the metrics from processed results.
+
+ Args:
+ preds (list[np.ndarray]): List of the prediction scores.
+ labels (list[int]): List of the labels.
+
+ Returns:
+ dict: The computed metrics. The keys are the names of the metrics,
+ and the values are corresponding results.
+ """
eval_results = OrderedDict()
metric_options = copy.deepcopy(self.metric_options)
for metric in self.metrics:
@@ -136,3 +197,198 @@ def label2array(num, label):
arr = np.zeros(num, dtype=np.float32)
arr[label] = 1.
return arr
+
+
+@METRICS.register_module()
+class ConfusionMatrix(BaseMetric):
+ r"""A metric to calculate confusion matrix for single-label tasks.
+
+ Args:
+ num_classes (int, optional): The number of classes. Defaults to None.
+ collect_device (str): Device name used for collecting results from
+ different ranks during distributed training. Must be 'cpu' or
+ 'gpu'. Defaults to 'cpu'.
+ prefix (str, optional): The prefix that will be added in the metric
+ names to disambiguate homonymous metrics of different evaluators.
+ If prefix is not provided in the argument, self.default_prefix
+ will be used instead. Defaults to None.
+
+ Examples:
+
+ 1. The basic usage.
+
+ >>> import torch
+ >>> from mmaction.evaluation import ConfusionMatrix
+ >>> y_pred = [0, 1, 1, 3]
+ >>> y_true = [0, 2, 1, 3]
+ >>> ConfusionMatrix.calculate(y_pred, y_true, num_classes=4)
+ tensor([[1, 0, 0, 0],
+ [0, 1, 0, 0],
+ [0, 1, 0, 0],
+ [0, 0, 0, 1]])
+ >>> # plot the confusion matrix
+ >>> import matplotlib.pyplot as plt
+ >>> y_score = torch.rand((1000, 10))
+ >>> y_true = torch.randint(10, (1000, ))
+ >>> matrix = ConfusionMatrix.calculate(y_score, y_true)
+ >>> ConfusionMatrix().plot(matrix)
+ >>> plt.show()
+
+ 2. In the config file
+
+ .. code:: python
+
+ val_evaluator = dict(type='ConfusionMatrix')
+ test_evaluator = dict(type='ConfusionMatrix')
+ """ # noqa: E501
+ default_prefix = 'confusion_matrix'
+
+ def __init__(self,
+ num_classes: Optional[int] = None,
+ collect_device: str = 'cpu',
+ prefix: Optional[str] = None) -> None:
+ super().__init__(collect_device, prefix)
+
+ self.num_classes = num_classes
+
+ def process(self, data_batch, data_samples: Sequence[dict]) -> None:
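+ # Take the hard prediction from the score vector (argmax) when scores are available; otherwise fall back to the stored pred_labels.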
+ for data_sample in data_samples:
+ pred_scores = data_sample.get('pred_scores')
+ gt_label = data_sample['gt_labels']['item']
+ if pred_scores is not None:
+ pred_label = pred_scores['item'].argmax(dim=0, keepdim=True)
+ self.num_classes = pred_scores['item'].size(0)
+ else:
+ pred_label = data_sample['pred_labels']['item']
+
+ self.results.append({
+ 'pred_label': pred_label,
+ 'gt_label': gt_label
+ })
+
+ def compute_metrics(self, results: list) -> dict:
+ pred_labels = []
+ gt_labels = []
+ for result in results:
+ pred_labels.append(result['pred_label'])
+ gt_labels.append(result['gt_label'])
+ confusion_matrix = ConfusionMatrix.calculate(
+ torch.cat(pred_labels),
+ torch.cat(gt_labels),
+ num_classes=self.num_classes)
+ return {'result': confusion_matrix}
+
+ @staticmethod
+ def calculate(pred, target, num_classes=None) -> dict:
+ """Calculate the confusion matrix for single-label task.
+
+ Args:
+ pred (torch.Tensor | np.ndarray | Sequence): The prediction
+ results. It can be labels (N, ), or scores of every
+ class (N, C).
+ target (torch.Tensor | np.ndarray | Sequence): The target of
+ each prediction with shape (N, ).
+ num_classes (Optional, int): The number of classes. If the ``pred``
+ is label instead of scores, this argument is required.
+ Defaults to None.
+
+ Returns:
+ torch.Tensor: The confusion matrix.
+ """
+ pred = to_tensor(pred)
+ target_label = to_tensor(target).int()
+
+ assert pred.size(0) == target_label.size(0), \
+ f"The size of pred ({pred.size(0)}) doesn't match "\
+ f'the target ({target_label.size(0)}).'
+ assert target_label.ndim == 1
+
+ if pred.ndim == 1:
+ assert num_classes is not None, \
+ 'Please specify the `num_classes` if the `pred` is labels ' \
+ 'instead of scores.'
+ pred_label = pred
+ else:
+ num_classes = num_classes or pred.size(1)
+ pred_label = torch.argmax(pred, dim=1).flatten()
+
+ with torch.no_grad():
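+ # Each (target, pred) pair maps to a unique index in [0, num_classes**2); bincount over these indices gives the flattened confusion matrix.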
+ indices = num_classes * target_label + pred_label
+ matrix = torch.bincount(indices, minlength=num_classes**2)
+ matrix = matrix.reshape(num_classes, num_classes)
+
+ return matrix
+
+ @staticmethod
+ def plot(confusion_matrix: torch.Tensor,
+ include_values: bool = False,
+ cmap: str = 'viridis',
+ classes: Optional[List[str]] = None,
+ colorbar: bool = True,
+ show: bool = True):
+ """Draw a confusion matrix by matplotlib.
+
+ Modified from `Scikit-Learn
+ `_
+
+ Args:
+ confusion_matrix (torch.Tensor): The confusion matrix to draw.
+ include_values (bool): Whether to draw the values in the figure.
+ Defaults to False.
+ cmap (str): The color map to use. Defaults to use "viridis".
+ classes (list[str], optional): The names of categories.
+ Defaults to None, which means to use index number.
+ colorbar (bool): Whether to show the colorbar. Defaults to True.
+ show (bool): Whether to show the figure immediately.
+ Defaults to True.
+ """ # noqa: E501
+ import matplotlib.pyplot as plt
+
+ fig, ax = plt.subplots(figsize=(10, 10))
+
+ num_classes = confusion_matrix.size(0)
+
+ im_ = ax.imshow(confusion_matrix, interpolation='nearest', cmap=cmap)
+ text_ = None
+ cmap_min, cmap_max = im_.cmap(0), im_.cmap(1.0)
+
+ if include_values:
+ text_ = np.empty_like(confusion_matrix, dtype=object)
+
+ # print text with appropriate color depending on background
+ thresh = (confusion_matrix.max() + confusion_matrix.min()) / 2.0
+
+ for i, j in product(range(num_classes), range(num_classes)):
+ color = cmap_max if confusion_matrix[i,
+ j] < thresh else cmap_min
+
+ text_cm = format(confusion_matrix[i, j], '.2g')
+ text_d = format(confusion_matrix[i, j], 'd')
+ if len(text_d) < len(text_cm):
+ text_cm = text_d
+
+ text_[i, j] = ax.text(
+ j, i, text_cm, ha='center', va='center', color=color)
+
+ display_labels = classes or np.arange(num_classes)
+
+ if colorbar:
+ fig.colorbar(im_, ax=ax)
+ ax.set(
+ xticks=np.arange(num_classes),
+ yticks=np.arange(num_classes),
+ xticklabels=display_labels,
+ yticklabels=display_labels,
+ ylabel='True label',
+ xlabel='Predicted label',
+ )
+ ax.invert_yaxis()
+ ax.xaxis.tick_top()
+
+ ax.set_ylim((num_classes - 0.5, -0.5))
+ # Automatically rotate the x labels.
+ fig.autofmt_xdate(ha='center')
+
+ if show:
+ plt.show()
+ return fig
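As a note on the RGBPose fusion in `compute_metrics` above: `get_weighted_score` is assumed to compute an element-wise weighted sum of the per-sample score arrays from the two heads. A hypothetical stand-in (`fuse_scores` is not part of mmaction) illustrating that behaviour:

    import numpy as np

    def fuse_scores(score_lists, coeffs):
        # Weighted sum of per-sample class-score arrays from several heads,
        # e.g. RGB and Pose scores combined at ratios 1:1, 2:1 or 1:2.
        fused = []
        for per_sample in zip(*score_lists):  # one tuple of arrays per sample
            fused.append(sum(c * s for c, s in zip(coeffs, per_sample)))
        return fused

    # Example with random scores for 4 samples and 60 classes.
    rgb = [np.random.rand(60) for _ in range(4)]
    pose = [np.random.rand(60) for _ in range(4)]
    fused_2_1 = fuse_scores([rgb, pose], [2, 1])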
diff --git a/mmaction/evaluation/metrics/ava_metric.py b/mmaction/evaluation/metrics/ava_metric.py
index 66e8fdcc4a..76cc83e6c5 100644
--- a/mmaction/evaluation/metrics/ava_metric.py
+++ b/mmaction/evaluation/metrics/ava_metric.py
@@ -81,6 +81,7 @@ def compute_metrics(self, results: list) -> dict:
self.label_file,
self.ann_file,
self.exclude_file,
+ ignore_empty_frames=True,
custom_classes=self.custom_classes)
os.remove(temp_file)
diff --git a/mmaction/models/backbones/__init__.py b/mmaction/models/backbones/__init__.py
index 066ba18535..2f4eb4a7e3 100644
--- a/mmaction/models/backbones/__init__.py
+++ b/mmaction/models/backbones/__init__.py
@@ -15,6 +15,7 @@
from .resnet_omni import OmniResNet
from .resnet_tin import ResNetTIN
from .resnet_tsm import ResNetTSM
+from .rgbposeconv3d import RGBPoseConv3D
from .stgcn import STGCN
from .swin import SwinTransformer3D
from .tanet import TANet
@@ -29,5 +30,6 @@
'OmniResNet', 'ResNet', 'ResNet2Plus1d', 'ResNet3d', 'ResNet3dCSN',
'ResNet3dLayer', 'ResNet3dSlowFast', 'ResNet3dSlowOnly', 'ResNetAudio',
'ResNetTIN', 'ResNetTSM', 'STGCN', 'SwinTransformer3D', 'TANet',
- 'TimeSformer', 'UniFormer', 'UniFormerV2', 'VisionTransformer', 'X3D'
+ 'TimeSformer', 'UniFormer', 'UniFormerV2', 'VisionTransformer', 'X3D',
+ 'RGBPoseConv3D'
]
diff --git a/mmaction/models/backbones/mobilenet_v2_tsm.py b/mmaction/models/backbones/mobilenet_v2_tsm.py
index db2999a8b3..2df95ab47c 100644
--- a/mmaction/models/backbones/mobilenet_v2_tsm.py
+++ b/mmaction/models/backbones/mobilenet_v2_tsm.py
@@ -21,6 +21,8 @@ def __init__(self, num_segments=8, is_shift=True, shift_div=8, **kwargs):
self.num_segments = num_segments
self.is_shift = is_shift
self.shift_div = shift_div
+ super().init_weights()
+ self.init_structure()
def make_temporal_shift(self):
"""Make temporal shift for some layers."""
@@ -33,9 +35,11 @@ def make_temporal_shift(self):
shift_div=self.shift_div,
)
- def init_weights(self):
+ def init_structure(self):
"""Initiate the parameters either from existing checkpoint or from
scratch."""
- super().init_weights()
if self.is_shift:
self.make_temporal_shift()
+
+ def init_weights(self):
+ pass
diff --git a/mmaction/models/backbones/resnet.py b/mmaction/models/backbones/resnet.py
index 0ebf6d61b0..c599bcc311 100644
--- a/mmaction/models/backbones/resnet.py
+++ b/mmaction/models/backbones/resnet.py
@@ -497,7 +497,8 @@ def _load_bn_params(bn: nn.Module, state_dict_tv: OrderedDict,
def _load_torchvision_checkpoint(self,
logger: mmengine.MMLogger = None) -> None:
"""Initiate the parameters from torchvision pretrained checkpoint."""
- state_dict_torchvision = _load_checkpoint(self.pretrained)
+ state_dict_torchvision = _load_checkpoint(
+ self.pretrained, map_location='cpu')
if 'state_dict' in state_dict_torchvision:
state_dict_torchvision = state_dict_torchvision['state_dict']
diff --git a/mmaction/models/backbones/resnet3d.py b/mmaction/models/backbones/resnet3d.py
index 50435c3064..63b32fc8cd 100644
--- a/mmaction/models/backbones/resnet3d.py
+++ b/mmaction/models/backbones/resnet3d.py
@@ -1,22 +1,23 @@
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from collections import OrderedDict
-from typing import List, Optional, Sequence, Tuple, Union
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+import torch
+import torch.nn as nn
import torch.utils.checkpoint as cp
from mmcv.cnn import ConvModule, NonLocal3d, build_activation_layer
from mmengine.logging import MMLogger
+from mmengine.model import BaseModule, Sequential
from mmengine.model.weight_init import constant_init, kaiming_init
from mmengine.runner.checkpoint import _load_checkpoint, load_checkpoint
from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm
-from torch import Tensor, nn
from torch.nn.modules.utils import _ntuple, _triple
from mmaction.registry import MODELS
-from mmaction.utils import ConfigType, OptConfigType
-class BasicBlock3d(nn.Module):
+class BasicBlock3d(BaseModule):
"""BasicBlock 3d block for ResNet3D.
Args:
@@ -28,22 +29,24 @@ class BasicBlock3d(nn.Module):
Defaults to 1.
dilation (int): Spacing between kernel elements. Defaults to 1.
downsample (nn.Module or None): Downsample layer. Defaults to None.
- style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, the
+ style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the
stride-two layer is the 3x3 conv layer, otherwise the stride-two
- layer is the first 1x1 conv layer. Defaults to ``pytorch``.
+ layer is the first 1x1 conv layer. Defaults to ``'pytorch'``.
inflate (bool): Whether to inflate kernel. Defaults to True.
non_local (bool): Determine whether to apply non-local module in this
block. Defaults to False.
- non_local_cfg (dict or ConfigDict): Config for non-local module.
+ non_local_cfg (dict): Config for non-local module.
Defaults to ``dict()``.
- conv_cfg (dict or ConfigDict): Config dict for convolution layer.
+ conv_cfg (dict): Config dict for convolution layer.
Defaults to ``dict(type='Conv3d')``.
- norm_cfg (dict or ConfigDict): Config for norm layers.
+ norm_cfg (dict): Config for norm layers.
Required keys are ``type``. Defaults to ``dict(type='BN3d')``.
- act_cfg (dict or ConfigDict): Config dict for activation layer.
+ act_cfg (dict): Config dict for activation layer.
Defaults to ``dict(type='ReLU')``.
with_cp (bool): Use checkpoint or not. Using checkpoint will save some
memory while slowing down the training speed. Defaults to False.
+ init_cfg (dict or list[dict], optional): Initialization config dict.
+ Defaults to None.
"""
expansion = 1
@@ -57,13 +60,14 @@ def __init__(self,
style: str = 'pytorch',
inflate: bool = True,
non_local: bool = False,
- non_local_cfg: ConfigType = dict(),
- conv_cfg: ConfigType = dict(type='Conv3d'),
- norm_cfg: ConfigType = dict(type='BN3d'),
- act_cfg: ConfigType = dict(type='ReLU'),
+ non_local_cfg: Dict = dict(),
+ conv_cfg: Dict = dict(type='Conv3d'),
+ norm_cfg: Dict = dict(type='BN3d'),
+ act_cfg: Dict = dict(type='ReLU'),
with_cp: bool = False,
+ init_cfg: Optional[Union[Dict, List[Dict]]] = None,
**kwargs) -> None:
- super().__init__()
+ super().__init__(init_cfg=init_cfg)
assert style in ['pytorch', 'caffe']
# make sure that only ``inflate_style`` is passed into kwargs
assert set(kwargs).issubset(['inflate_style'])
@@ -130,7 +134,7 @@ def __init__(self,
self.non_local_block = NonLocal3d(self.conv2.norm.num_features,
**self.non_local_cfg)
- def forward(self, x: Tensor) -> Tensor:
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Defines the computation performed at every call."""
def _inner_forward(x):
@@ -158,7 +162,7 @@ def _inner_forward(x):
return out
-class Bottleneck3d(nn.Module):
+class Bottleneck3d(BaseModule):
"""Bottleneck 3d block for ResNet3D.
Args:
@@ -170,25 +174,27 @@ class Bottleneck3d(nn.Module):
Defaults to 1.
dilation (int): Spacing between kernel elements. Defaults to 1.
downsample (nn.Module, optional): Downsample layer. Defaults to None.
- style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, the
+ style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the
stride-two layer is the 3x3 conv layer, otherwise the stride-two
- layer is the first 1x1 conv layer. Defaults to ``pytorch``.
+ layer is the first 1x1 conv layer. Defaults to ``'pytorch'``.
inflate (bool): Whether to inflate kernel. Defaults to True.
- inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the
+ inflate_style (str): '3x1x1' or '3x3x3', which determines the
kernel sizes and padding strides for conv1 and conv2 in each block.
- Defaults to ``3x1x1``.
+ Defaults to ``'3x1x1'``.
non_local (bool): Determine whether to apply non-local module in this
block. Defaults to False.
- non_local_cfg (dict or ConfigDict): Config for non-local module.
+ non_local_cfg (dict): Config for non-local module.
Defaults to ``dict()``.
- conv_cfg (dict or ConfigDict): Config dict for convolution layer.
+ conv_cfg (dict): Config dict for convolution layer.
Defaults to ``dict(type='Conv3d')``.
- norm_cfg (dict or ConfigDict): Config for norm layers. required
+ norm_cfg (dict): Config for norm layers. required
keys are ``type``. Defaults to ``dict(type='BN3d')``.
- act_cfg (dict or ConfigDict): Config dict for activation layer.
+ act_cfg (dict): Config dict for activation layer.
Defaults to ``dict(type='ReLU')``.
with_cp (bool): Use checkpoint or not. Using checkpoint will save some
memory while slowing down the training speed. Defaults to False.
+ init_cfg (dict or list[dict], optional): Initialization config dict.
+ Defaults to None.
"""
expansion = 4
@@ -203,12 +209,13 @@ def __init__(self,
inflate: bool = True,
inflate_style: str = '3x1x1',
non_local: bool = False,
- non_local_cfg: ConfigType = dict(),
- conv_cfg: ConfigType = dict(type='Conv3d'),
- norm_cfg: ConfigType = dict(type='BN3d'),
- act_cfg: ConfigType = dict(type='ReLU'),
- with_cp: bool = False) -> None:
- super().__init__()
+ non_local_cfg: Dict = dict(),
+ conv_cfg: Dict = dict(type='Conv3d'),
+ norm_cfg: Dict = dict(type='BN3d'),
+ act_cfg: Dict = dict(type='ReLU'),
+ with_cp: bool = False,
+ init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None:
+ super().__init__(init_cfg=init_cfg)
assert style in ['pytorch', 'caffe']
assert inflate_style in ['3x1x1', '3x3x3']
@@ -297,7 +304,7 @@ def __init__(self,
self.non_local_block = NonLocal3d(self.conv3.norm.num_features,
**self.non_local_cfg)
- def forward(self, x: Tensor) -> Tensor:
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Defines the computation performed at every call."""
def _inner_forward(x):
@@ -327,23 +334,23 @@ def _inner_forward(x):
@MODELS.register_module()
-class ResNet3d(nn.Module):
+class ResNet3d(BaseModule):
"""ResNet 3d backbone.
Args:
- depth (int): Depth of resnet, from
- {``18``, ``34``, ``50``, ``101``, ``152``}.
+ depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
+ Defaults to 50.
pretrained (str, optional): Name of pretrained model. Defaults to None.
stage_blocks (tuple, optional): Set number of stages for each res
layer. Defaults to None.
pretrained2d (bool): Whether to load pretrained 2D model.
Defaults to True.
in_channels (int): Channel num of input features. Defaults to 3.
+ num_stages (int): Resnet stages. Defaults to 4.
base_channels (int): Channel num of stem output features.
Defaults to 64.
out_indices (Sequence[int]): Indices of output feature.
- Defaults to ```(3, )``.
- num_stages (int): Resnet stages. Defaults to 4.
+ Defaults to ``(3, )``.
spatial_strides (Sequence[int]):
Spatial strides of residual blocks of each stage.
Defaults to ``(1, 2, 2, 2)``.
@@ -363,9 +370,9 @@ class ResNet3d(nn.Module):
pool1_stride_t (int): Temporal stride of the first pooling layer.
Defaults to 1.
with_pool2 (bool): Whether to use pool2. Defaults to True.
- style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, the
+ style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the
stride-two layer is the 3x3 conv layer, otherwise the stride-two
- layer is the first 1x1 conv layer. Defaults to ``pytorch``.
+ layer is the first 1x1 conv layer. Defaults to ``'pytorch'``.
frozen_stages (int): Stages to be frozen (all param fixed). -1 means
not freezing any parameters. Defaults to -1.
inflate (Sequence[int]): Inflate Dims of each block.
@@ -373,12 +380,12 @@ class ResNet3d(nn.Module):
inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the
kernel sizes and padding strides for conv1 and conv2 in each block.
Defaults to ``3x1x1``.
- conv_cfg (dict or ConfigDict): Config for conv layers.
+ conv_cfg (dict): Config for conv layers.
Required keys are ``type``. Defaults to ``dict(type='Conv3d')``.
- norm_cfg (dict or ConfigDict): Config for norm layers.
+ norm_cfg (dict): Config for norm layers.
Required keys are ``type`` and ``requires_grad``.
Defaults to ``dict(type='BN3d', requires_grad=True)``.
- act_cfg (dict or ConfigDict): Config dict for activation layer.
+ act_cfg (dict): Config dict for activation layer.
Defaults to ``dict(type='ReLU', inplace=True)``.
norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze
running stats (``mean`` and ``var``). Defaults to False.
@@ -387,11 +394,13 @@ class ResNet3d(nn.Module):
non_local (Sequence[int]): Determine whether to apply non-local module
in the corresponding block of each stages.
Defaults to ``(0, 0, 0, 0)``.
- non_local_cfg (dict or ConfigDict): Config for non-local module.
+ non_local_cfg (dict): Config for non-local module.
Defaults to ``dict()``.
zero_init_residual (bool):
Whether to use zero initialization for residual block,
Defaults to True.
+ init_cfg (dict or list[dict], optional): Initialization config dict.
+ Defaults to None.
"""
arch_settings = {
@@ -403,7 +412,7 @@ class ResNet3d(nn.Module):
}
def __init__(self,
- depth: int,
+ depth: int = 50,
pretrained: Optional[str] = None,
stage_blocks: Optional[Tuple] = None,
pretrained2d: bool = True,
@@ -425,16 +434,17 @@ def __init__(self,
frozen_stages: int = -1,
inflate: Sequence[int] = (1, 1, 1, 1),
inflate_style: str = '3x1x1',
- conv_cfg: ConfigType = dict(type='Conv3d'),
- norm_cfg: ConfigType = dict(type='BN3d', requires_grad=True),
- act_cfg: ConfigType = dict(type='ReLU', inplace=True),
+ conv_cfg: Dict = dict(type='Conv3d'),
+ norm_cfg: Dict = dict(type='BN3d', requires_grad=True),
+ act_cfg: Dict = dict(type='ReLU', inplace=True),
norm_eval: bool = False,
with_cp: bool = False,
non_local: Sequence[int] = (0, 0, 0, 0),
- non_local_cfg: ConfigType = dict(),
+ non_local_cfg: Dict = dict(),
zero_init_residual: bool = True,
+ init_cfg: Optional[Union[Dict, List[Dict]]] = None,
**kwargs) -> None:
- super().__init__()
+ super().__init__(init_cfg=init_cfg)
if depth not in self.arch_settings:
raise KeyError(f'invalid depth {depth} for resnet')
self.depth = depth
@@ -486,6 +496,8 @@ def __init__(self,
self._make_stem_layer()
self.res_layers = []
+ lateral_inplanes = getattr(self, 'lateral_inplanes', [0, 0, 0, 0])
+
for i, num_blocks in enumerate(self.stage_blocks):
spatial_stride = spatial_strides[i]
temporal_stride = temporal_strides[i]
@@ -493,7 +505,7 @@ def __init__(self,
planes = self.base_channels * 2**i
res_layer = self.make_res_layer(
self.block,
- self.inplanes,
+ self.inplanes + lateral_inplanes[i],
planes,
num_blocks,
spatial_stride=spatial_stride,
@@ -514,8 +526,8 @@ def __init__(self,
self.add_module(layer_name, res_layer)
self.res_layers.append(layer_name)
- self.feat_dim = self.block.expansion * self.base_channels * 2**(
- len(self.stage_blocks) - 1)
+ self.feat_dim = self.block.expansion * \
+ self.base_channels * 2 ** (len(self.stage_blocks) - 1)
@staticmethod
def make_res_layer(block: nn.Module,
@@ -529,11 +541,11 @@ def make_res_layer(block: nn.Module,
inflate: Union[int, Sequence[int]] = 1,
inflate_style: str = '3x1x1',
non_local: Union[int, Sequence[int]] = 0,
- non_local_cfg: ConfigType = dict(),
- norm_cfg: OptConfigType = None,
- act_cfg: OptConfigType = None,
- conv_cfg: OptConfigType = None,
- with_cp: Optional[bool] = False,
+ non_local_cfg: Dict = dict(),
+ norm_cfg: Optional[Dict] = None,
+ act_cfg: Optional[Dict] = None,
+ conv_cfg: Optional[Dict] = None,
+ with_cp: bool = False,
**kwargs) -> nn.Module:
"""Build residual layer for ResNet3D.
@@ -549,25 +561,25 @@ def make_res_layer(block: nn.Module,
temporal_stride (int | Sequence[int]): Temporal strides in
residual and conv layers. Defaults to 1.
dilation (int): Spacing between kernel elements. Defaults to 1.
- style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``,
- the stride-two layer is the 3x3 conv layer, otherwise
- the stride-two layer is the first 1x1 conv layer.
- Default: ``pytorch``.
+ style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the
+ stride-two layer is the 3x3 conv layer, otherwise the
+ stride-two layer is the first 1x1 conv layer.
+ Defaults to ``'pytorch'``.
inflate (int | Sequence[int]): Determine whether to inflate
for each block. Defaults to 1.
inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines
the kernel sizes and padding strides for conv1 and conv2
- in each block. Default: ``3x1x1``.
+ in each block. Defaults to ``'3x1x1'``.
non_local (int | Sequence[int]): Determine whether to apply
non-local module in the corresponding block of each stages.
Defaults to 0.
non_local_cfg (dict): Config for non-local module.
Defaults to ``dict()``.
- conv_cfg (dict or ConfigDict, optional): Config for conv layers.
+ conv_cfg (dict, optional): Config for conv layers.
Defaults to None.
- norm_cfg (dict or ConfigDict, optional): Config for norm layers.
+ norm_cfg (dict, optional): Config for norm layers.
Defaults to None.
- act_cfg (dict or ConfigDict, optional): Config for activate layers.
+ act_cfg (dict, optional): Config for activate layers.
Defaults to None.
with_cp (bool, optional): Use checkpoint or not. Using checkpoint
will save some memory while slowing down the training speed.
@@ -576,10 +588,10 @@ def make_res_layer(block: nn.Module,
Returns:
nn.Module: A residual layer for the given config.
"""
- inflate = inflate if not isinstance(inflate,
- int) else (inflate, ) * blocks
- non_local = non_local if not isinstance(
- non_local, int) else (non_local, ) * blocks
+ inflate = inflate if not isinstance(inflate, int) \
+ else (inflate,) * blocks
+ non_local = non_local if not isinstance(non_local, int) \
+ else (non_local,) * blocks
assert len(inflate) == blocks and len(non_local) == blocks
downsample = None
if spatial_stride != 1 or inplanes != planes * block.expansion:
@@ -632,7 +644,7 @@ def make_res_layer(block: nn.Module,
with_cp=with_cp,
**kwargs))
- return nn.Sequential(*layers)
+ return Sequential(*layers)
@staticmethod
def _inflate_conv_params(conv3d: nn.Module, state_dict_2d: OrderedDict,
@@ -645,7 +657,7 @@ def _inflate_conv_params(conv3d: nn.Module, state_dict_2d: OrderedDict,
state_dict_2d (OrderedDict): The state dict of pretrained 2d model.
module_name_2d (str): The name of corresponding conv module in the
2d model.
- inflated_param_names (List[str]): List of parameters that have been
+ inflated_param_names (list[str]): List of parameters that have been
inflated.
"""
weight_2d_name = module_name_2d + '.weight'
@@ -674,7 +686,7 @@ def _inflate_bn_params(bn3d: nn.Module, state_dict_2d: OrderedDict,
state_dict_2d (OrderedDict): The state dict of pretrained 2d model.
module_name_2d (str): The name of corresponding bn module in the
2d model.
- inflated_param_names (List[str]): List of parameters that have been
+ inflated_param_names (list[str]): List of parameters that have been
inflated.
"""
for param_name, param in bn3d.named_parameters():
@@ -711,7 +723,7 @@ def _inflate_weights(self, logger: MMLogger) -> None:
debugging information.
"""
- state_dict_r2d = _load_checkpoint(self.pretrained)
+ state_dict_r2d = _load_checkpoint(self.pretrained, map_location='cpu')
if 'state_dict' in state_dict_r2d:
state_dict_r2d = state_dict_r2d['state_dict']
@@ -811,7 +823,7 @@ def _init_weights(self, pretrained: Optional[str] = None) -> None:
Args:
pretrained (str | None): The path of the pretrained weight. Will
override the original `pretrained` if set. The arg is added to
- be compatible with mmdet. Default: None.
+ be compatible with mmdet. Defaults to None.
"""
if pretrained:
self.pretrained = pretrained
@@ -822,7 +834,6 @@ def _init_weights(self, pretrained: Optional[str] = None) -> None:
if self.pretrained2d:
# Inflate 2D model into 3D model.
self.inflate_weights(logger)
-
else:
# Directly load 3D model.
load_checkpoint(
@@ -848,15 +859,16 @@ def init_weights(self, pretrained: Optional[str] = None) -> None:
"""Initialize weights."""
self._init_weights(self, pretrained)
- def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor]]:
+ def forward(self, x: torch.Tensor) \
+ -> Union[torch.Tensor, Tuple[torch.Tensor]]:
"""Defines the computation performed at every call.
Args:
- x (Tensor): The input data.
+ x (torch.Tensor): The input data.
Returns:
- Tensor or Tuple[Tensor]: The feature of the input
- samples extracted by the backbone.
+ torch.Tensor or tuple[torch.Tensor]: The feature of the input
+ samples extracted by the backbone.
"""
x = self.conv1(x)
if self.with_pool1:
@@ -885,12 +897,11 @@ def train(self, mode: bool = True) -> None:
@MODELS.register_module()
-class ResNet3dLayer(nn.Module):
+class ResNet3dLayer(BaseModule):
"""ResNet 3d Layer.
Args:
- depth (int): Depth of resnet,
- from {``18``, ``34``, ``50``, ``101``, ``152``}.
+ depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
pretrained (str, optional): Name of pretrained model. Defaults to None.
pretrained2d (bool): Whether to load pretrained 2D model.
Defaults to True.
@@ -902,20 +913,20 @@ class ResNet3dLayer(nn.Module):
temporal_stride (int): The 1st res block's temporal stride.
Defaults to 1.
dilation (int): The dilation. Defaults to 1.
- style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``, the
+ style (str): 'pytorch' or 'caffe'. If set to 'pytorch', the
stride-two layer is the 3x3 conv layer, otherwise the stride-two
- layer is the first 1x1 conv layer. Defaults to ``pytorch``.
+ layer is the first 1x1 conv layer. Defaults to ``'pytorch'``.
all_frozen (bool): Frozen all modules in the layer. Defaults to False.
inflate (int): Inflate dims of each block. Defaults to 1.
inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines the
kernel sizes and padding strides for conv1 and conv2 in each block.
- Defaults to ``3x1x1``.
- conv_cfg (dict or ConfigDict): Config for conv layers.
+ Defaults to ``'3x1x1'``.
+ conv_cfg (dict): Config for conv layers.
Required keys are ``type``. Defaults to ``dict(type='Conv3d')``.
- norm_cfg (dict or ConfigDict): Config for norm layers.
+ norm_cfg (dict): Config for norm layers.
Required keys are ``type`` and ``requires_grad``.
Defaults to ``dict(type='BN3d', requires_grad=True)``.
- act_cfg (dict or ConfigDict): Config dict for activation layer.
+ act_cfg (dict): Config dict for activation layer.
Defaults to ``dict(type='ReLU', inplace=True)``.
norm_eval (bool): Whether to set BN layers to eval mode, namely, freeze
running stats (``mean`` and ``var``). Defaults to False.
@@ -924,6 +935,8 @@ class ResNet3dLayer(nn.Module):
zero_init_residual (bool):
Whether to use zero initialization for residual block,
Defaults to True.
+ init_cfg (dict or list[dict], optional): Initialization config dict.
+ Defaults to None.
"""
def __init__(self,
@@ -939,14 +952,15 @@ def __init__(self,
all_frozen: bool = False,
inflate: int = 1,
inflate_style: str = '3x1x1',
- conv_cfg: ConfigType = dict(type='Conv3d'),
- norm_cfg: ConfigType = dict(type='BN3d', requires_grad=True),
- act_cfg: ConfigType = dict(type='ReLU', inplace=True),
+ conv_cfg: Dict = dict(type='Conv3d'),
+ norm_cfg: Dict = dict(type='BN3d', requires_grad=True),
+ act_cfg: Dict = dict(type='ReLU', inplace=True),
norm_eval: bool = False,
with_cp: bool = False,
zero_init_residual: bool = True,
+ init_cfg: Optional[Union[Dict, List[Dict]]] = None,
**kwargs) -> None:
- super().__init__()
+ super().__init__(init_cfg=init_cfg)
self.arch_settings = ResNet3d.arch_settings
assert depth in self.arch_settings
@@ -1022,15 +1036,15 @@ def init_weights(self, pretrained: Optional[str] = None) -> None:
"""Initialize weights."""
self._init_weights(self, pretrained)
- def forward(self, x: Tensor) -> Tensor:
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Defines the computation performed at every call.
Args:
- x (Tensor): The input data.
+ x (torch.Tensor): The input data.
Returns:
- Tensor: The feature of the input
- samples extracted by the resisual layer.
+ torch.Tensor: The feature of the input
+ samples extracted by the residual layer.
"""
res_layer = getattr(self, self.layer_name)
out = res_layer(x)
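Since ``ResNet3d`` now derives from ``BaseModule`` and accepts ``init_cfg``, it can be built and exercised directly. A minimal sketch; the shapes and the ``init_cfg`` value are illustrative only and not taken from any config in this patch:

import torch
from mmaction.models import ResNet3d

# Build the refactored backbone; depth now defaults to 50 and init_cfg is
# forwarded to BaseModule. pretrained2d=False skips 2D-weight inflation.
backbone = ResNet3d(pretrained=None, pretrained2d=False,
                    init_cfg=dict(type='Kaiming', layer='Conv3d'))
backbone.init_weights()

clip = torch.randn(1, 3, 8, 64, 64)  # (N, C, T, H, W)
feat = backbone(clip)                # a single tensor, since out_indices=(3, )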
diff --git a/mmaction/models/backbones/resnet3d_slowfast.py b/mmaction/models/backbones/resnet3d_slowfast.py
index 4417882c4b..3083239ff9 100644
--- a/mmaction/models/backbones/resnet3d_slowfast.py
+++ b/mmaction/models/backbones/resnet3d_slowfast.py
@@ -1,27 +1,88 @@
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from collections import OrderedDict
-from typing import List, Optional, Sequence, Union
+from typing import Dict, List, Optional, Tuple, Union
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule
from mmengine.logging import MMLogger, print_log
+from mmengine.model import BaseModule
from mmengine.model.weight_init import kaiming_init
from mmengine.runner.checkpoint import _load_checkpoint, load_checkpoint
-from torch import Tensor
from mmaction.registry import MODELS
-from mmaction.utils import ConfigType, OptConfigType
from .resnet3d import ResNet3d
+class DeConvModule(BaseModule):
+ """A deconv module that bundles deconv/norm/activation layers.
+
+ Args:
+ in_channels (int): Number of channels in the input feature map.
+ out_channels (int): Number of channels produced by the convolution.
+ kernel_size (int | tuple[int]): Size of the convolving kernel.
+ stride (int | tuple[int]): Stride of the convolution.
+ padding (int | tuple[int]): Zero-padding added to both sides of
+ the input.
+ bias (bool): Whether to add a learnable bias to the output.
+ Defaults to False.
+ with_bn (bool): Whether to add a BN layer. Defaults to True.
+ with_relu (bool): Whether to add a ReLU layer. Defaults to True.
+ """
+
+ def __init__(self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: int,
+ stride: Union[int, Tuple[int]] = (1, 1, 1),
+ padding: Union[int, Tuple[int]] = 0,
+ bias: bool = False,
+ with_bn: bool = True,
+ with_relu: bool = True) -> None:
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.kernel_size = kernel_size
+ self.stride = stride
+ self.padding = padding
+ self.bias = bias
+ self.with_bn = with_bn
+ self.with_relu = with_relu
+
+ self.conv = nn.ConvTranspose3d(
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=stride,
+ padding=padding,
+ bias=bias)
+ self.bn = nn.BatchNorm3d(out_channels)
+ self.relu = nn.ReLU()
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """Defines the computation performed at every call."""
+ # x should be a 5-d tensor
+ assert len(x.shape) == 5
+ N, C, T, H, W = x.shape
+ out_shape = (N, self.out_channels, self.stride[0] * T,
+ self.stride[1] * H, self.stride[2] * W)
+ x = self.conv(x, output_size=out_shape)
+ if self.with_bn:
+ x = self.bn(x)
+ if self.with_relu:
+ x = self.relu(x)
+ return x
+
+
class ResNet3dPathway(ResNet3d):
"""A pathway of Slowfast based on ResNet3d.
Args:
lateral (bool): Determines whether to enable the lateral connection
from another pathway. Defaults to False.
+ lateral_inv (bool): Whether to use deconv to upscale the time
+ dimension of features from another pathway. Defaults to False.
lateral_norm (bool): Determines whether to enable the lateral norm
in lateral layers. Defaults to False.
speed_ratio (int): Speed ratio indicating the ratio between time
@@ -32,181 +93,112 @@ class ResNet3dPathway(ResNet3d):
Defaults to 8.
fusion_kernel (int): The kernel size of lateral fusion.
Defaults to 5.
+ lateral_infl (int): The ratio by which the lateral connection
+ inflates the channel number.
+ Defaults to 2.
+ lateral_activate (list[int]): Flags for activating the lateral
+ connection. Defaults to ``[1, 1, 1, 1]``.
"""
def __init__(self,
- *args,
lateral: bool = False,
+ lateral_inv: bool = False,
lateral_norm: bool = False,
speed_ratio: int = 8,
channel_ratio: int = 8,
fusion_kernel: int = 5,
+ lateral_infl: int = 2,
+ lateral_activate: List[int] = [1, 1, 1, 1],
**kwargs) -> None:
self.lateral = lateral
+ self.lateral_inv = lateral_inv
self.lateral_norm = lateral_norm
self.speed_ratio = speed_ratio
self.channel_ratio = channel_ratio
self.fusion_kernel = fusion_kernel
- super().__init__(*args, **kwargs)
+ self.lateral_infl = lateral_infl
+ self.lateral_activate = lateral_activate
+ self._calculate_lateral_inplanes(kwargs)
+
+ super().__init__(**kwargs)
self.inplanes = self.base_channels
- if self.lateral:
- self.conv1_lateral = ConvModule(
- self.inplanes // self.channel_ratio,
- # https://arxiv.org/abs/1812.03982, the
- # third type of lateral connection has out_channel:
- # 2 * \beta * C
- self.inplanes * 2 // self.channel_ratio,
- kernel_size=(fusion_kernel, 1, 1),
- stride=(self.speed_ratio, 1, 1),
- padding=((fusion_kernel - 1) // 2, 0, 0),
- bias=False,
- conv_cfg=self.conv_cfg,
- norm_cfg=self.norm_cfg if self.lateral_norm else None,
- act_cfg=self.act_cfg if self.lateral_norm else None)
+ if self.lateral and self.lateral_activate[0] == 1:
+ if self.lateral_inv:
+ self.conv1_lateral = DeConvModule(
+ self.inplanes * self.channel_ratio,
+ self.inplanes * self.channel_ratio // lateral_infl,
+ kernel_size=(fusion_kernel, 1, 1),
+ stride=(self.speed_ratio, 1, 1),
+ padding=((fusion_kernel - 1) // 2, 0, 0),
+ with_bn=True,
+ with_relu=True)
+ else:
+ self.conv1_lateral = ConvModule(
+ self.inplanes // self.channel_ratio,
+ self.inplanes * lateral_infl // self.channel_ratio,
+ kernel_size=(fusion_kernel, 1, 1),
+ stride=(self.speed_ratio, 1, 1),
+ padding=((fusion_kernel - 1) // 2, 0, 0),
+ bias=False,
+ conv_cfg=self.conv_cfg,
+ norm_cfg=self.norm_cfg if self.lateral_norm else None,
+ act_cfg=self.act_cfg if self.lateral_norm else None)
self.lateral_connections = []
for i in range(len(self.stage_blocks)):
planes = self.base_channels * 2**i
self.inplanes = planes * self.block.expansion
- if lateral and i != self.num_stages - 1:
+ if lateral and i != self.num_stages - 1 \
+ and self.lateral_activate[i + 1]:
# no lateral connection needed in final stage
lateral_name = f'layer{(i + 1)}_lateral'
- setattr(
- self, lateral_name,
- ConvModule(
+ if self.lateral_inv:
+ conv_module = DeConvModule(
+ self.inplanes * self.channel_ratio,
+ self.inplanes * self.channel_ratio // lateral_infl,
+ kernel_size=(fusion_kernel, 1, 1),
+ stride=(self.speed_ratio, 1, 1),
+ padding=((fusion_kernel - 1) // 2, 0, 0),
+ bias=False,
+ with_bn=True,
+ with_relu=True)
+ else:
+ conv_module = ConvModule(
self.inplanes // self.channel_ratio,
- self.inplanes * 2 // self.channel_ratio,
+ self.inplanes * lateral_infl // self.channel_ratio,
kernel_size=(fusion_kernel, 1, 1),
stride=(self.speed_ratio, 1, 1),
padding=((fusion_kernel - 1) // 2, 0, 0),
bias=False,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg if self.lateral_norm else None,
- act_cfg=self.act_cfg if self.lateral_norm else None))
+ act_cfg=self.act_cfg if self.lateral_norm else None)
+ setattr(self, lateral_name, conv_module)
self.lateral_connections.append(lateral_name)
- def make_res_layer(self,
- block: nn.Module,
- inplanes: int,
- planes: int,
- blocks: int,
- spatial_stride: Union[int, Sequence[int]] = 1,
- temporal_stride: Union[int, Sequence[int]] = 1,
- dilation: int = 1,
- style: str = 'pytorch',
- inflate: Union[int, Sequence[int]] = 1,
- inflate_style: str = '3x1x1',
- non_local: Union[int, Sequence[int]] = 0,
- non_local_cfg: ConfigType = dict(),
- norm_cfg: OptConfigType = None,
- act_cfg: OptConfigType = None,
- conv_cfg: OptConfigType = None,
- with_cp: Optional[bool] = False,
- **kwargs) -> nn.Module:
- """Build residual layer for SlowFast.
-
- Args:
- block (nn.Module): Residual module to be built.
- inplanes (int): Number of channels for the input feature
- in each block.
- planes (int): Number of channels for the output feature
- in each block.
- blocks (int): Number of residual blocks.
- spatial_stride (int | Sequence[int]): Spatial strides in
- residual and conv layers. Defaults to 1.
- temporal_stride (int | Sequence[int]): Temporal strides in
- residual and conv layers. Defaults to 1.
- dilation (int): Spacing between kernel elements. Defaults to 1.
- style (str): ``pytorch`` or ``caffe``. If set to ``pytorch``,
- the stride-two layer is the 3x3 conv layer, otherwise
- the stride-two layer is the first 1x1 conv layer.
- Default: ``pytorch``.
- inflate (int | Sequence[int]): Determine whether to inflate
- for each block. Defaults to 1.
- inflate_style (str): ``3x1x1`` or ``3x3x3``. which determines
- the kernel sizes and padding strides for conv1 and conv2
- in each block. Default: ``3x1x1``.
- non_local (int | Sequence[int]): Determine whether to apply
- non-local module in the corresponding block of each stages.
- Defaults to 0.
- non_local_cfg (dict): Config for non-local module.
- Defaults to ``dict()``.
- conv_cfg (dict or ConfigDict, optional): Config for conv layers.
- Defaults to None.
- norm_cfg (dict or ConfigDict, optional): Config for norm layers.
- Defaults to None.
- act_cfg (dict or ConfigDict, optional): Config for activate layers.
- Defaults to None.
- with_cp (bool, optional): Use checkpoint or not. Using checkpoint
- will save some memory while slowing down the training speed.
- Defaults to False.
-
- Returns:
- nn.Module: A residual layer for the given config.
- """
- inflate = inflate if not isinstance(inflate,
- int) else (inflate, ) * blocks
- non_local = non_local if not isinstance(
- non_local, int) else (non_local, ) * blocks
- assert len(inflate) == blocks and len(non_local) == blocks
- if self.lateral:
- lateral_inplanes = inplanes * 2 // self.channel_ratio
- else:
- lateral_inplanes = 0
- if (spatial_stride != 1
- or (inplanes + lateral_inplanes) != planes * block.expansion):
- downsample = ConvModule(
- inplanes + lateral_inplanes,
- planes * block.expansion,
- kernel_size=1,
- stride=(temporal_stride, spatial_stride, spatial_stride),
- bias=False,
- conv_cfg=conv_cfg,
- norm_cfg=norm_cfg,
- act_cfg=None)
- else:
- downsample = None
-
- layers = []
- layers.append(
- block(
- inplanes + lateral_inplanes,
- planes,
- spatial_stride,
- temporal_stride,
- dilation,
- downsample,
- style=style,
- inflate=(inflate[0] == 1),
- inflate_style=inflate_style,
- non_local=(non_local[0] == 1),
- non_local_cfg=non_local_cfg,
- conv_cfg=conv_cfg,
- norm_cfg=norm_cfg,
- act_cfg=act_cfg,
- with_cp=with_cp))
- inplanes = planes * block.expansion
-
- for i in range(1, blocks):
- layers.append(
- block(
- inplanes,
- planes,
- 1,
- 1,
- dilation,
- style=style,
- inflate=(inflate[i] == 1),
- inflate_style=inflate_style,
- non_local=(non_local[i] == 1),
- non_local_cfg=non_local_cfg,
- conv_cfg=conv_cfg,
- norm_cfg=norm_cfg,
- act_cfg=act_cfg,
- with_cp=with_cp))
-
- return nn.Sequential(*layers)
+ def _calculate_lateral_inplanes(self, kwargs):
+ """Calculate inplanes for lateral connection."""
+ depth = kwargs.get('depth', 50)
+ expansion = 1 if depth < 50 else 4
+ base_channels = kwargs.get('base_channels', 64)
+ lateral_inplanes = []
+ for i in range(kwargs.get('num_stages', 4)):
+ if expansion % 2 == 0:
+ planes = base_channels * (2 ** i) * \
+ ((expansion // 2) ** (i > 0))
+ else:
+ planes = base_channels * (2**i) // (2**(i > 0))
+ if self.lateral and self.lateral_activate[i]:
+ if self.lateral_inv:
+ lateral_inplane = planes * \
+ self.channel_ratio // self.lateral_infl
+ else:
+ lateral_inplane = planes * \
+ self.lateral_infl // self.channel_ratio
+ else:
+ lateral_inplane = 0
+ lateral_inplanes.append(lateral_inplane)
+ self.lateral_inplanes = lateral_inplanes
def inflate_weights(self, logger: MMLogger) -> None:
"""Inflate the resnet2d parameters to resnet3d pathway.
@@ -222,7 +214,7 @@ def inflate_weights(self, logger: MMLogger) -> None:
debugging information.
"""
- state_dict_r2d = _load_checkpoint(self.pretrained)
+ state_dict_r2d = _load_checkpoint(self.pretrained, map_location='cpu')
if 'state_dict' in state_dict_r2d:
state_dict_r2d = state_dict_r2d['state_dict']
@@ -280,7 +272,7 @@ def _inflate_conv_params(self, conv3d: nn.Module,
state_dict_2d (OrderedDict): The state dict of pretrained 2d model.
module_name_2d (str): The name of corresponding conv module in the
2d model.
- inflated_param_names (List[str]): List of parameters that have been
+ inflated_param_names (list[str]): List of parameters that have been
inflated.
"""
weight_2d_name = module_name_2d + '.weight'
@@ -358,11 +350,11 @@ def init_weights(self, pretrained: Optional[str] = None) -> None:
}
-def build_pathway(cfg: ConfigType, *args, **kwargs) -> nn.Module:
+def build_pathway(cfg: Dict, *args, **kwargs) -> nn.Module:
"""Build pathway.
Args:
- cfg (dict or ConfigDict): cfg should contain:
+ cfg (dict): cfg should contain:
- type (str): identify backbone type.
Returns:
@@ -383,7 +375,7 @@ def build_pathway(cfg: ConfigType, *args, **kwargs) -> nn.Module:
@MODELS.register_module()
-class ResNet3dSlowFast(nn.Module):
+class ResNet3dSlowFast(BaseModule):
"""Slowfast backbone.
This module is proposed in `SlowFast Networks for Video Recognition
@@ -403,57 +395,43 @@ class ResNet3dSlowFast(nn.Module):
channel_ratio (int): Reduce the channel number of fast pathway
by ``channel_ratio``, corresponding to :math:`\\beta` in the paper.
Defaults to 8.
- slow_pathway (dict or ConfigDict): Configuration of slow branch, should
- contain necessary arguments for building the specific type of
- pathway and:
- type (str): type of backbone the pathway bases on.
- lateral (bool): determine whether to build lateral connection
- for the pathway. Defaults to
-
- .. code-block:: Python
-
- dict(type='ResNetPathway',
- lateral=True, depth=50, pretrained=None,
- conv1_kernel=(1, 7, 7), dilations=(1, 1, 1, 1),
- conv1_stride_t=1, pool1_stride_t=1, inflate=(0, 0, 1, 1))
-
- fast_pathway (dict or ConfigDict): Configuration of fast branch,
- similar to ``slow_pathway``. Defaults to
-
- .. code-block:: Python
-
- dict(type='ResNetPathway',
- lateral=False, depth=50, pretrained=None, base_channels=8,
- conv1_kernel=(5, 7, 7), conv1_stride_t=1, pool1_stride_t=1)
+ slow_pathway (dict): Configuration of slow branch. Defaults to
+ ``dict(type='resnet3d', depth=50, pretrained=None, lateral=True,
+ conv1_kernel=(1, 7, 7), conv1_stride_t=1, pool1_stride_t=1,
+ inflate=(0, 0, 1, 1))``.
+ fast_pathway (dict): Configuration of fast branch. Defaults to
+ ``dict(type='resnet3d', depth=50, pretrained=None, lateral=False,
+ base_channels=8, conv1_kernel=(5, 7, 7), conv1_stride_t=1,
+ pool1_stride_t=1)``.
+ init_cfg (dict or list[dict], optional): Initialization config dict.
+ Defaults to None.
"""
- def __init__(
- self,
- pretrained,
- resample_rate: int = 8,
- speed_ratio: int = 8,
- channel_ratio: int = 8,
- slow_pathway: ConfigType = dict(
- type='resnet3d',
- depth=50,
- pretrained=None,
- lateral=True,
- conv1_kernel=(1, 7, 7),
- dilations=(1, 1, 1, 1),
- conv1_stride_t=1,
- pool1_stride_t=1,
- inflate=(0, 0, 1, 1)),
- fast_pathway: ConfigType = dict(
- type='resnet3d',
- depth=50,
- pretrained=None,
- lateral=False,
- base_channels=8,
- conv1_kernel=(5, 7, 7),
- conv1_stride_t=1,
- pool1_stride_t=1)
- ) -> None:
- super().__init__()
+ def __init__(self,
+ pretrained: Optional[str] = None,
+ resample_rate: int = 8,
+ speed_ratio: int = 8,
+ channel_ratio: int = 8,
+ slow_pathway: Dict = dict(
+ type='resnet3d',
+ depth=50,
+ pretrained=None,
+ lateral=True,
+ conv1_kernel=(1, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1,
+ inflate=(0, 0, 1, 1)),
+ fast_pathway: Dict = dict(
+ type='resnet3d',
+ depth=50,
+ pretrained=None,
+ lateral=False,
+ base_channels=8,
+ conv1_kernel=(5, 7, 7),
+ conv1_stride_t=1,
+ pool1_stride_t=1),
+ init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None:
+ super().__init__(init_cfg=init_cfg)
self.pretrained = pretrained
self.resample_rate = resample_rate
self.speed_ratio = speed_ratio
@@ -485,15 +463,15 @@ def init_weights(self, pretrained: Optional[str] = None) -> None:
else:
raise TypeError('pretrained must be a str or None')
- def forward(self, x: Tensor) -> tuple:
+ def forward(self, x: torch.Tensor) -> tuple:
"""Defines the computation performed at every call.
Args:
- x (Tensor): The input data.
+ x (torch.Tensor): The input data.
Returns:
- Tuple[Tensor]: The feature of the input samples extracted
- by the backbone.
+ tuple[torch.Tensor]: The feature of the input samples
+ extracted by the backbone.
"""
x_slow = nn.functional.interpolate(
x,
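The lateral-channel bookkeeping added above can be checked in isolation. A small sketch that mirrors ``_calculate_lateral_inplanes`` for the default slow pathway (ResNet-50, ``channel_ratio=8``, ``lateral_infl=2``):

def lateral_inplanes(depth=50, base_channels=64, num_stages=4,
                     lateral=True, lateral_inv=False,
                     channel_ratio=8, lateral_infl=2,
                     lateral_activate=(1, 1, 1, 1)):
    """Per-stage extra input channels contributed by lateral connections."""
    expansion = 1 if depth < 50 else 4
    out = []
    for i in range(num_stages):
        if expansion % 2 == 0:
            planes = base_channels * (2 ** i) * ((expansion // 2) ** (i > 0))
        else:
            planes = base_channels * (2 ** i) // (2 ** (i > 0))
        if lateral and lateral_activate[i]:
            out.append(planes * channel_ratio // lateral_infl if lateral_inv
                       else planes * lateral_infl // channel_ratio)
        else:
            out.append(0)
    return out

print(lateral_inplanes())  # [16, 64, 128, 256] for the default slow pathway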
diff --git a/mmaction/models/backbones/resnet3d_slowonly.py b/mmaction/models/backbones/resnet3d_slowonly.py
index 819063c0cd..5c1c71c4c2 100644
--- a/mmaction/models/backbones/resnet3d_slowonly.py
+++ b/mmaction/models/backbones/resnet3d_slowonly.py
@@ -4,20 +4,12 @@
from mmaction.registry import MODELS
from .resnet3d_slowfast import ResNet3dPathway
-try:
- from mmdet.registry import MODELS as MMDET_MODELS
- mmdet_imported = True
-except (ImportError, ModuleNotFoundError):
- mmdet_imported = False
-
@MODELS.register_module()
class ResNet3dSlowOnly(ResNet3dPathway):
"""SlowOnly backbone based on ResNet3dPathway.
Args:
- lateral (bool): Determines whether to enable the lateral connection
- from another pathway. Defaults to False.
conv1_kernel (Sequence[int]): Kernel size of the first conv layer.
Defaults to ``(1, 7, 7)``.
conv1_stride_t (int): Temporal stride of the first conv layer.
@@ -30,8 +22,6 @@ class ResNet3dSlowOnly(ResNet3dPathway):
"""
def __init__(self,
- *args,
- lateral: bool = False,
conv1_kernel: Sequence[int] = (1, 7, 7),
conv1_stride_t: int = 1,
pool1_stride_t: int = 1,
@@ -39,8 +29,6 @@ def __init__(self,
with_pool2: bool = False,
**kwargs) -> None:
super().__init__(
- *args,
- lateral=lateral,
conv1_kernel=conv1_kernel,
conv1_stride_t=conv1_stride_t,
pool1_stride_t=pool1_stride_t,
@@ -49,7 +37,3 @@ def __init__(self,
**kwargs)
assert not self.lateral
-
-
-if mmdet_imported:
- MMDET_MODELS.register_module()(ResNet3dSlowOnly)
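With the explicit ``lateral`` argument dropped, a SlowOnly backbone config simply omits it and relies on the pathway default (``lateral=False``). A hedged config sketch, not a tested recipe:

backbone = dict(
    type='ResNet3dSlowOnly',
    depth=50,
    pretrained=None,
    conv1_kernel=(1, 7, 7),
    conv1_stride_t=1,
    pool1_stride_t=1,
    inflate=(0, 0, 1, 1))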
diff --git a/mmaction/models/backbones/resnet_tsm.py b/mmaction/models/backbones/resnet_tsm.py
index 1397384a97..c639e1eae6 100644
--- a/mmaction/models/backbones/resnet_tsm.py
+++ b/mmaction/models/backbones/resnet_tsm.py
@@ -165,6 +165,7 @@ def __init__(self,
self.non_local = non_local
self.non_local_stages = _ntuple(self.num_stages)(non_local)
self.non_local_cfg = non_local_cfg
+ # TODO use convert key to load weights
super().init_weights()
self.init_structure()
diff --git a/mmaction/models/backbones/rgbposeconv3d.py b/mmaction/models/backbones/rgbposeconv3d.py
new file mode 100644
index 0000000000..6f54e3b6b5
--- /dev/null
+++ b/mmaction/models/backbones/rgbposeconv3d.py
@@ -0,0 +1,205 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Union
+
+import torch
+import torch.nn as nn
+from mmengine.logging import MMLogger, print_log
+from mmengine.model import BaseModule
+from mmengine.model.weight_init import constant_init, kaiming_init
+from mmengine.runner.checkpoint import load_checkpoint
+from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm
+
+from mmaction.registry import MODELS
+from .resnet3d_slowfast import ResNet3dPathway
+
+
+@MODELS.register_module()
+class RGBPoseConv3D(BaseModule):
+ """RGBPoseConv3D backbone.
+
+ Args:
+ pretrained (str, optional): The file path to a pretrained model.
+ Defaults to None.
+ speed_ratio (int): Speed ratio indicating the ratio between time
+ dimension of the fast and slow pathway, corresponding to the
+ :math:`\\alpha` in the paper. Defaults to 4.
+ channel_ratio (int): Reduce the channel number of fast pathway
+ by ``channel_ratio``, corresponding to :math:`\\beta` in the paper.
+ Defaults to 4.
+ rgb_detach (bool): Whether to detach the gradients from the pose path.
+ Defaults to False.
+ pose_detach (bool): Whether to detach the gradients from the rgb path.
+ Defaults to False.
+ rgb_drop_path (float): The drop rate for dropping the features from
+ the pose path. Defaults to 0.
+ pose_drop_path (float): The drop rate for dropping the features from
+ the rgb path. Defaults to 0.
+ rgb_pathway (dict): Configuration of rgb branch. Defaults to
+ ``dict(num_stages=4, lateral=True, lateral_infl=1,
+ lateral_activate=(0, 0, 1, 1), fusion_kernel=7, base_channels=64,
+ conv1_kernel=(1, 7, 7), inflate=(0, 0, 1, 1), with_pool2=False)``.
+ pose_pathway (dict): Configuration of pose branch. Defaults to
+ ``dict(num_stages=3, stage_blocks=(4, 6, 3), lateral=True,
+ lateral_inv=True, lateral_infl=16, lateral_activate=(0, 1, 1),
+ fusion_kernel=7, in_channels=17, base_channels=32,
+ out_indices=(2, ), conv1_kernel=(1, 7, 7), conv1_stride_s=1,
+ conv1_stride_t=1, pool1_stride_s=1, pool1_stride_t=1,
+ inflate=(0, 1, 1), spatial_strides=(2, 2, 2),
+ temporal_strides=(1, 1, 1), with_pool2=False)``.
+ init_cfg (dict or list[dict], optional): Initialization config dict.
+ Defaults to None.
+ """
+
+ def __init__(self,
+ pretrained: Optional[str] = None,
+ speed_ratio: int = 4,
+ channel_ratio: int = 4,
+ rgb_detach: bool = False,
+ pose_detach: bool = False,
+ rgb_drop_path: float = 0,
+ pose_drop_path: float = 0,
+ rgb_pathway: Dict = dict(
+ num_stages=4,
+ lateral=True,
+ lateral_infl=1,
+ lateral_activate=(0, 0, 1, 1),
+ fusion_kernel=7,
+ base_channels=64,
+ conv1_kernel=(1, 7, 7),
+ inflate=(0, 0, 1, 1),
+ with_pool2=False),
+ pose_pathway: Dict = dict(
+ num_stages=3,
+ stage_blocks=(4, 6, 3),
+ lateral=True,
+ lateral_inv=True,
+ lateral_infl=16,
+ lateral_activate=(0, 1, 1),
+ fusion_kernel=7,
+ in_channels=17,
+ base_channels=32,
+ out_indices=(2, ),
+ conv1_kernel=(1, 7, 7),
+ conv1_stride_s=1,
+ conv1_stride_t=1,
+ pool1_stride_s=1,
+ pool1_stride_t=1,
+ inflate=(0, 1, 1),
+ spatial_strides=(2, 2, 2),
+ temporal_strides=(1, 1, 1),
+ dilations=(1, 1, 1),
+ with_pool2=False),
+ init_cfg: Optional[Union[Dict, List[Dict]]] = None) -> None:
+ super().__init__(init_cfg=init_cfg)
+ self.pretrained = pretrained
+ self.speed_ratio = speed_ratio
+ self.channel_ratio = channel_ratio
+
+ if rgb_pathway['lateral']:
+ rgb_pathway['speed_ratio'] = speed_ratio
+ rgb_pathway['channel_ratio'] = channel_ratio
+
+ if pose_pathway['lateral']:
+ pose_pathway['speed_ratio'] = speed_ratio
+ pose_pathway['channel_ratio'] = channel_ratio
+
+ self.rgb_path = ResNet3dPathway(**rgb_pathway)
+ self.pose_path = ResNet3dPathway(**pose_pathway)
+ self.rgb_detach = rgb_detach
+ self.pose_detach = pose_detach
+ assert 0 <= rgb_drop_path <= 1
+ assert 0 <= pose_drop_path <= 1
+ self.rgb_drop_path = rgb_drop_path
+ self.pose_drop_path = pose_drop_path
+
+ def init_weights(self) -> None:
+ """Initiate the parameters either from existing checkpoint or from
+ scratch."""
+ for m in self.modules():
+ if isinstance(m, nn.Conv3d):
+ kaiming_init(m)
+ elif isinstance(m, _BatchNorm):
+ constant_init(m, 1)
+
+ if isinstance(self.pretrained, str):
+ logger = MMLogger.get_current_instance()
+ msg = f'load model from: {self.pretrained}'
+ print_log(msg, logger=logger)
+ load_checkpoint(self, self.pretrained, strict=True, logger=logger)
+ elif self.pretrained is None:
+ # Init two branch separately.
+ self.rgb_path.init_weights()
+ self.pose_path.init_weights()
+ else:
+ raise TypeError('pretrained must be a str or None')
+
+ def forward(self, imgs: torch.Tensor, heatmap_imgs: torch.Tensor) -> tuple:
+ """Defines the computation performed at every call.
+
+ Args:
+ imgs (torch.Tensor): The input RGB frames.
+ heatmap_imgs (torch.Tensor): The input pose heatmaps.
+
+ Returns:
+ tuple[torch.Tensor]: The feature of the input
+ samples extracted by the backbone.
+ """
+ if self.training:
+ rgb_drop_path = torch.rand(1) < self.rgb_drop_path
+ pose_drop_path = torch.rand(1) < self.pose_drop_path
+ else:
+ rgb_drop_path, pose_drop_path = False, False
+ # We assume base_channel for RGB and Pose are 64 and 32.
+ x_rgb = self.rgb_path.conv1(imgs)
+ x_rgb = self.rgb_path.maxpool(x_rgb)
+ # N x 64 x 8 x 56 x 56
+ x_pose = self.pose_path.conv1(heatmap_imgs)
+ x_pose = self.pose_path.maxpool(x_pose)
+
+ x_rgb = self.rgb_path.layer1(x_rgb)
+ x_rgb = self.rgb_path.layer2(x_rgb)
+ x_pose = self.pose_path.layer1(x_pose)
+
+ if hasattr(self.rgb_path, 'layer2_lateral'):
+ feat = x_pose.detach() if self.rgb_detach else x_pose
+ x_pose_lateral = self.rgb_path.layer2_lateral(feat)
+ if rgb_drop_path:
+ x_pose_lateral = x_pose_lateral.new_zeros(x_pose_lateral.shape)
+
+ if hasattr(self.pose_path, 'layer1_lateral'):
+ feat = x_rgb.detach() if self.pose_detach else x_rgb
+ x_rgb_lateral = self.pose_path.layer1_lateral(feat)
+ if pose_drop_path:
+ x_rgb_lateral = x_rgb_lateral.new_zeros(x_rgb_lateral.shape)
+
+ if hasattr(self.rgb_path, 'layer2_lateral'):
+ x_rgb = torch.cat((x_rgb, x_pose_lateral), dim=1)
+
+ if hasattr(self.pose_path, 'layer1_lateral'):
+ x_pose = torch.cat((x_pose, x_rgb_lateral), dim=1)
+
+ x_rgb = self.rgb_path.layer3(x_rgb)
+ x_pose = self.pose_path.layer2(x_pose)
+
+ if hasattr(self.rgb_path, 'layer3_lateral'):
+ feat = x_pose.detach() if self.rgb_detach else x_pose
+ x_pose_lateral = self.rgb_path.layer3_lateral(feat)
+ if rgb_drop_path:
+ x_pose_lateral = x_pose_lateral.new_zeros(x_pose_lateral.shape)
+
+ if hasattr(self.pose_path, 'layer2_lateral'):
+ feat = x_rgb.detach() if self.pose_detach else x_rgb
+ x_rgb_lateral = self.pose_path.layer2_lateral(feat)
+ if pose_drop_path:
+ x_rgb_lateral = x_rgb_lateral.new_zeros(x_rgb_lateral.shape)
+
+ if hasattr(self.rgb_path, 'layer3_lateral'):
+ x_rgb = torch.cat((x_rgb, x_pose_lateral), dim=1)
+
+ if hasattr(self.pose_path, 'layer2_lateral'):
+ x_pose = torch.cat((x_pose, x_rgb_lateral), dim=1)
+
+ x_rgb = self.rgb_path.layer4(x_rgb)
+ x_pose = self.pose_path.layer3(x_pose)
+
+ return x_rgb, x_pose
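The cross-modal drop path above boils down to a Bernoulli gate on the lateral feature during training. A standalone sketch of that mechanic (the function name and shapes are hypothetical):

import torch

def maybe_drop(lateral_feat: torch.Tensor, drop_prob: float,
               training: bool) -> torch.Tensor:
    """Zero the lateral feature with probability ``drop_prob`` while training."""
    if training and bool(torch.rand(1) < drop_prob):
        return lateral_feat.new_zeros(lateral_feat.shape)
    return lateral_feat

x_pose_lateral = torch.randn(2, 64, 8, 56, 56)
x_pose_lateral = maybe_drop(x_pose_lateral, drop_prob=0.5, training=True)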
diff --git a/mmaction/models/backbones/timesformer.py b/mmaction/models/backbones/timesformer.py
index 618b381295..af636b5198 100644
--- a/mmaction/models/backbones/timesformer.py
+++ b/mmaction/models/backbones/timesformer.py
@@ -235,7 +235,7 @@ def init_weights(self, pretrained=None):
logger = MMLogger.get_current_instance()
logger.info(f'load model from: {self.pretrained}')
- state_dict = _load_checkpoint(self.pretrained)
+ state_dict = _load_checkpoint(self.pretrained, map_location='cpu')
if 'state_dict' in state_dict:
state_dict = state_dict['state_dict']
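The repeated ``map_location='cpu'`` additions in this patch all serve the same purpose: keep the checkpoint tensors on CPU while they are inflated, rather than on whatever device they were saved from. A sketch with plain ``torch.load`` standing in for ``_load_checkpoint`` (the file name is hypothetical):

import torch

state_dict = torch.load('resnet50_2d.pth', map_location='cpu')  # hypothetical file
state_dict = state_dict.get('state_dict', state_dict)  # unwrap if wrapped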
diff --git a/mmaction/models/backbones/uniformer.py b/mmaction/models/backbones/uniformer.py
index 97ac6184c1..5773313778 100644
--- a/mmaction/models/backbones/uniformer.py
+++ b/mmaction/models/backbones/uniformer.py
@@ -495,7 +495,7 @@ class UniFormer(BaseModule):
attn_drop_rate (float): Attention dropout rate. Defaults to 0.0.
drop_path_rate (float): Stochastic depth rates.
Defaults to 0.0.
- clip_pretrained (bool): Whether to load pretrained CLIP visual encoder.
+ pretrained2d (bool): Whether to load pretrained weights from a 2D model.
Defaults to True.
pretrained (str): Name of pretrained model.
Defaults to None.
@@ -519,7 +519,7 @@ def __init__(
drop_rate: float = 0.,
attn_drop_rate: float = 0.,
drop_path_rate: float = 0.,
- clip_pretrained: bool = True,
+ pretrained2d: bool = True,
pretrained: Optional[str] = None,
init_cfg: Optional[Union[Dict, List[Dict]]] = [
dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
@@ -529,7 +529,7 @@ def __init__(
super().__init__(init_cfg=init_cfg)
self.pretrained = pretrained
- self.clip_pretrained = clip_pretrained
+ self.pretrained2d = pretrained2d
self.patch_embed1 = SpeicalPatchEmbed(
img_size=img_size,
patch_size=4,
@@ -641,7 +641,7 @@ def _load_pretrained(self, pretrained: str = None) -> None:
def init_weights(self):
"""Initialize the weights in backbone."""
- if self.clip_pretrained:
+ if self.pretrained2d:
logger = MMLogger.get_current_instance()
logger.info(f'load model from: {self.pretrained}')
self._load_pretrained(self.pretrained)
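After the rename, configs refer to the same flag as ``pretrained2d``. A minimal, hedged config fragment showing only the two options touched here (the checkpoint name is hypothetical):

backbone = dict(
    type='UniFormer',
    pretrained2d=True,                   # was `clip_pretrained`
    pretrained='uniformer_small_in1k')   # hypothetical checkpoint name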
diff --git a/mmaction/models/backbones/uniformerv2.py b/mmaction/models/backbones/uniformerv2.py
index 64b0ba8faf..14571af5bd 100644
--- a/mmaction/models/backbones/uniformerv2.py
+++ b/mmaction/models/backbones/uniformerv2.py
@@ -548,23 +548,24 @@ def _load_pretrained(self, pretrained: str = None) -> None:
pretrained (str): Model name of pretrained CLIP visual encoder.
Defaults to None.
"""
- if pretrained is not None:
- model_path = _MODELS[pretrained]
- logger.info(f'Load CLIP pretrained model from {model_path}')
- state_dict = _load_checkpoint(model_path, map_location='cpu')
- state_dict_3d = self.state_dict()
- for k in state_dict.keys():
- if k in state_dict_3d.keys(
- ) and state_dict[k].shape != state_dict_3d[k].shape:
- if len(state_dict_3d[k].shape) <= 2:
- logger.info(f'Ignore: {k}')
- continue
- logger.info(f'Inflate: {k}, {state_dict[k].shape}' +
- f' => {state_dict_3d[k].shape}')
- time_dim = state_dict_3d[k].shape[2]
- state_dict[k] = self._inflate_weight(
- state_dict[k], time_dim)
- self.load_state_dict(state_dict, strict=False)
+ assert pretrained is not None, \
+ 'please specify clip pretrained checkpoint'
+
+ model_path = _MODELS[pretrained]
+ logger.info(f'Load CLIP pretrained model from {model_path}')
+ state_dict = _load_checkpoint(model_path, map_location='cpu')
+ state_dict_3d = self.state_dict()
+ for k in state_dict.keys():
+ if k in state_dict_3d.keys(
+ ) and state_dict[k].shape != state_dict_3d[k].shape:
+ if len(state_dict_3d[k].shape) <= 2:
+ logger.info(f'Ignore: {k}')
+ continue
+ logger.info(f'Inflate: {k}, {state_dict[k].shape}' +
+ f' => {state_dict_3d[k].shape}')
+ time_dim = state_dict_3d[k].shape[2]
+ state_dict[k] = self._inflate_weight(state_dict[k], time_dim)
+ self.load_state_dict(state_dict, strict=False)
def init_weights(self):
"""Initialize the weights in backbone."""
diff --git a/mmaction/models/data_preprocessors/__init__.py b/mmaction/models/data_preprocessors/__init__.py
index feccb87e2b..241f9b901a 100644
--- a/mmaction/models/data_preprocessors/__init__.py
+++ b/mmaction/models/data_preprocessors/__init__.py
@@ -1,4 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .data_preprocessor import ActionDataPreprocessor
+from .multimodal_data_preprocessor import MultiModalDataPreprocessor
-__all__ = ['ActionDataPreprocessor']
+__all__ = ['ActionDataPreprocessor', 'MultiModalDataPreprocessor']
diff --git a/mmaction/models/data_preprocessors/data_preprocessor.py b/mmaction/models/data_preprocessors/data_preprocessor.py
index d2641bb6ab..5a11eefd3b 100644
--- a/mmaction/models/data_preprocessors/data_preprocessor.py
+++ b/mmaction/models/data_preprocessors/data_preprocessor.py
@@ -1,10 +1,11 @@
# Copyright (c) OpenMMLab. All rights reserved.
-from typing import Optional, Sequence, Tuple, Union
+from typing import List, Optional, Sequence, Tuple, Union
import torch
from mmengine.model import BaseDataPreprocessor, stack_batch
from mmaction.registry import MODELS
+from mmaction.utils.typing import SampleList
@MODELS.register_module()
@@ -12,13 +13,10 @@ class ActionDataPreprocessor(BaseDataPreprocessor):
"""Data pre-processor for action recognition tasks.
Args:
- mean (Sequence[float or int, optional): The pixel mean of channels
+ mean (Sequence[float or int], optional): The pixel mean of channels
of images or stacked optical flow. Defaults to None.
std (Sequence[float or int], optional): The pixel standard deviation
of channels of images or stacked optical flow. Defaults to None.
- pad_size_divisor (int): The size of padded image should be
- divisible by ``pad_size_divisor``. Defaults to 1.
- pad_value (float or int): The padded pixel value. Defaults to 0.
to_rgb (bool): Whether to convert image from BGR to RGB.
Defaults to False.
blending (dict, optional): Config for batch blending.
@@ -30,14 +28,10 @@ class ActionDataPreprocessor(BaseDataPreprocessor):
def __init__(self,
mean: Optional[Sequence[Union[float, int]]] = None,
std: Optional[Sequence[Union[float, int]]] = None,
- pad_size_divisor: int = 1,
- pad_value: Union[float, int] = 0,
to_rgb: bool = False,
blending: Optional[dict] = None,
format_shape: str = 'NCHW') -> None:
super().__init__()
- self.pad_size_divisor = pad_size_divisor
- self.pad_value = pad_value
self.to_rgb = to_rgb
self.format_shape = format_shape
@@ -49,7 +43,7 @@ def __init__(self,
self._enable_normalize = True
if self.format_shape == 'NCHW':
normalizer_shape = (-1, 1, 1)
- elif self.format_shape in ['NCTHW', 'NCTVM', 'MIX2d3d']:
+ elif self.format_shape in ['NCTHW', 'MIX2d3d']:
normalizer_shape = (-1, 1, 1, 1)
else:
raise ValueError(f'Invalid format shape: {format_shape}')
@@ -81,21 +75,21 @@ def forward(self,
training (bool): Whether to enable training time augmentation.
Returns:
- dict or Tuple[dict]: Data in the same format as the model
- input.
+ dict or Tuple[dict]: Data in the same format as the model input.
"""
+ data = self.cast_data(data)
if isinstance(data, dict):
- return self.forward_onesample(data, training)
+ return self.forward_onesample(data, training=training)
elif isinstance(data, tuple):
outputs = []
for data_sample in data:
- output = self.forward_onesample(data_sample, training)
+ output = self.forward_onesample(data_sample, training=training)
outputs.append(output)
return tuple(outputs)
else:
- raise TypeError('Unsupported data type for `data`!')
+ raise TypeError(f'Unsupported data type: {type(data)}!')
- def forward_onesample(self, data: dict, training: bool = False) -> dict:
+ def forward_onesample(self, data, training: bool = False) -> dict:
"""Perform normalization, padding, bgr2rgb conversion and batch
augmentation on one data sample.
@@ -107,12 +101,18 @@ def forward_onesample(self, data: dict, training: bool = False) -> dict:
dict: Data in the same format as the model
input.
"""
- data = self.cast_data(data)
inputs, data_samples = data['inputs'], data['data_samples']
+ inputs, data_samples = self.preprocess(inputs, data_samples, training)
+ data['inputs'] = inputs
+ data['data_samples'] = data_samples
+ return data
+ def preprocess(self,
+ inputs: List[torch.Tensor],
+ data_samples: SampleList,
+ training: bool = False) -> Tuple:
# --- Pad and stack --
- batch_inputs = stack_batch(inputs, self.pad_size_divisor,
- self.pad_value)
+ batch_inputs = stack_batch(inputs)
if self.format_shape == 'MIX2d3d':
if batch_inputs.ndim == 4:
@@ -147,5 +147,4 @@ def forward_onesample(self, data: dict, training: bool = False) -> dict:
batch_inputs, data_samples = self.blending(batch_inputs,
data_samples)
- data['inputs'] = batch_inputs
- return data
+ return batch_inputs, data_samples
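Splitting the body of ``forward_onesample`` into ``preprocess`` means a preprocessor can be driven directly with a list of tensors and their data samples. A hedged usage sketch; the import paths, shapes, and normalisation values are assumptions, not taken from a test in this patch:

import torch
from mmaction.models import ActionDataPreprocessor
from mmaction.structures import ActionDataSample

preprocessor = ActionDataPreprocessor(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    format_shape='NCTHW')

inputs = [torch.randint(0, 256, (3, 8, 32, 32)).float() for _ in range(2)]
data_samples = [ActionDataSample() for _ in range(2)]
batch, data_samples = preprocessor.preprocess(inputs, data_samples,
                                              training=False)
print(batch.shape)  # torch.Size([2, 3, 8, 32, 32])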
diff --git a/mmaction/models/data_preprocessors/multimodal_data_preprocessor.py b/mmaction/models/data_preprocessors/multimodal_data_preprocessor.py
new file mode 100644
index 0000000000..1353c811d4
--- /dev/null
+++ b/mmaction/models/data_preprocessors/multimodal_data_preprocessor.py
@@ -0,0 +1,42 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict
+
+from mmengine.model import BaseDataPreprocessor, ModuleDict
+
+from mmaction.registry import MODELS
+
+
+@MODELS.register_module()
+class MultiModalDataPreprocessor(BaseDataPreprocessor):
+ """Multi-Modal data pre-processor for action recognition tasks."""
+
+ def __init__(self, preprocessors: Dict) -> None:
+ super().__init__()
+ self.preprocessors = ModuleDict()
+ for name, pre_cfg in preprocessors.items():
+ assert 'type' in pre_cfg, (
+ 'Each data preprocessor should contain the key type, '
+ f'but got {pre_cfg}')
+ self.preprocessors[name] = MODELS.build(pre_cfg)
+
+ def forward(self, data: Dict, training: bool = False) -> Dict:
+ """Preprocesses the data into the model input format.
+
+ Args:
+ data (dict): Data returned by dataloader.
+ training (bool): Whether to enable training time augmentation.
+
+ Returns:
+ dict: Data in the same format as the model input.
+ """
+ data = self.cast_data(data)
+ inputs, data_samples = data['inputs'], data['data_samples']
+ for modality, modality_data in inputs.items():
+ preprocessor = self.preprocessors[modality]
+ modality_data, data_samples = preprocessor.preprocess(
+ modality_data, data_samples, training)
+ inputs[modality] = modality_data
+
+ data['inputs'] = inputs
+ data['data_samples'] = data_samples
+ return data
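A hedged sketch of how the new wrapper might be configured for an RGB + pose model: each key in ``preprocessors`` matches a modality name in ``inputs`` and maps to an ordinary preprocessor config. The modality names and normalisation values are assumptions, not taken from a config in this patch:

data_preprocessor = dict(
    type='MultiModalDataPreprocessor',
    preprocessors=dict(
        imgs=dict(
            type='ActionDataPreprocessor',
            mean=[123.675, 116.28, 103.53],
            std=[58.395, 57.12, 57.375],
            format_shape='NCTHW'),
        heatmap_imgs=dict(
            type='ActionDataPreprocessor',
            format_shape='NCTHW')))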
diff --git a/mmaction/models/heads/__init__.py b/mmaction/models/heads/__init__.py
index 964f7b45e4..5a1b74a9f8 100644
--- a/mmaction/models/heads/__init__.py
+++ b/mmaction/models/heads/__init__.py
@@ -4,6 +4,7 @@
from .i3d_head import I3DHead
from .mvit_head import MViTHead
from .omni_head import OmniHead
+from .rgbpose_head import RGBPoseHead
from .slowfast_head import SlowFastHead
from .timesformer_head import TimeSformerHead
from .tpn_head import TPNHead
@@ -11,10 +12,11 @@
from .tsm_head import TSMHead
from .tsn_audio_head import TSNAudioHead
from .tsn_head import TSNHead
+from .uniformer_head import UniFormerHead
from .x3d_head import X3DHead
__all__ = [
'BaseHead', 'GCNHead', 'I3DHead', 'MViTHead', 'OmniHead', 'SlowFastHead',
'TPNHead', 'TRNHead', 'TSMHead', 'TSNAudioHead', 'TSNHead',
- 'TimeSformerHead', 'X3DHead'
+ 'TimeSformerHead', 'UniFormerHead', 'RGBPoseHead', 'X3DHead'
]
diff --git a/mmaction/models/heads/base.py b/mmaction/models/heads/base.py
index 10ceae3dbb..8eafdc2cf2 100644
--- a/mmaction/models/heads/base.py
+++ b/mmaction/models/heads/base.py
@@ -1,18 +1,16 @@
# Copyright (c) OpenMMLab. All rights reserved.
from abc import ABCMeta, abstractmethod
-from typing import Tuple, Union
+from typing import Dict, Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmengine.model import BaseModule
from mmengine.structures import LabelData
-from torch import Tensor
from mmaction.evaluation import top_k_accuracy
from mmaction.registry import MODELS
-from mmaction.utils import (ConfigType, LabelList, OptConfigType,
- OptMultiConfig, SampleList)
+from mmaction.utils import ForwardResults, SampleList
class AvgConsensus(nn.Module):
@@ -20,14 +18,14 @@ class AvgConsensus(nn.Module):
Args:
dim (int): Decide which dim consensus function to apply.
- Default: 1.
+ Defaults to 1.
"""
def __init__(self, dim: int = 1) -> None:
super().__init__()
self.dim = dim
- def forward(self, x: Tensor) -> Tensor:
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Defines the computation performed at every call."""
return x.mean(dim=self.dim, keepdim=True)
@@ -37,35 +35,34 @@ class BaseHead(BaseModule, metaclass=ABCMeta):
All Head should subclass it.
All subclass should overwrite:
- - :meth:`init_weights`, initializing weights in some modules.
- :meth:`forward`, supporting to forward both for training and testing.
Args:
num_classes (int): Number of classes to be classified.
in_channels (int): Number of channels in input feature.
- loss_cls (dict or ConfigDict): Config for building loss.
- Default: dict(type='CrossEntropyLoss', loss_weight=1.0).
+ loss_cls (dict): Config for building loss.
+ Defaults to ``dict(type='CrossEntropyLoss', loss_weight=1.0)``.
multi_class (bool): Determines whether it is a multi-class
- recognition task. Default: False.
+ recognition task. Defaults to False.
label_smooth_eps (float): Epsilon used in label smooth.
- Reference: arxiv.org/abs/1906.02629. Default: 0.
- topk (int or tuple): Top-k accuracy. Default: (1, 5).
- average_clips (dict or ConfigDict, optional): Config for
- averaging class scores over multiple clips. Default: None.
- init_cfg (dict or ConfigDict, optional): Config to control the
- initialization. Defaults to None.
+ Reference: arxiv.org/abs/1906.02629. Defaults to 0.
+ topk (int or tuple): Top-k accuracy. Defaults to ``(1, 5)``.
+ average_clips (dict, optional): Config for averaging class
+ scores over multiple clips. Defaults to None.
+ init_cfg (dict, optional): Config to control the initialization.
+ Defaults to None.
"""
def __init__(self,
num_classes: int,
in_channels: int,
- loss_cls: ConfigType = dict(
+ loss_cls: Dict = dict(
type='CrossEntropyLoss', loss_weight=1.0),
multi_class: bool = False,
label_smooth_eps: float = 0.0,
topk: Union[int, Tuple[int]] = (1, 5),
- average_clips: OptConfigType = None,
- init_cfg: OptMultiConfig = None) -> None:
+ average_clips: Optional[Dict] = None,
+ init_cfg: Optional[Dict] = None) -> None:
super(BaseHead, self).__init__(init_cfg=init_cfg)
self.num_classes = num_classes
self.in_channels = in_channels
@@ -81,18 +78,19 @@ def __init__(self,
self.topk = topk
@abstractmethod
- def forward(self, x, **kwargs) -> Tensor:
+ def forward(self, x, **kwargs) -> ForwardResults:
"""Defines the computation performed at every call."""
raise NotImplementedError
- def loss(self, feats: Union[Tensor, Tuple[Tensor]],
- data_samples: SampleList, **kwargs) -> dict:
+ def loss(self, feats: Union[torch.Tensor, Tuple[torch.Tensor]],
+ data_samples: SampleList, **kwargs) -> Dict:
"""Perform forward propagation of head and loss calculation on the
features of the upstream network.
Args:
- feats (Tensor or Tuple[Tensor]): Features from upstream network.
- data_samples (List[:obj:`ActionDataSample`]): The batch
+ feats (torch.Tensor | tuple[torch.Tensor]): Features from
+ upstream network.
+ data_samples (list[:obj:`ActionDataSample`]): The batch
data samples.
Returns:
@@ -101,14 +99,14 @@ def loss(self, feats: Union[Tensor, Tuple[Tensor]],
cls_scores = self(feats, **kwargs)
return self.loss_by_feat(cls_scores, data_samples)
- def loss_by_feat(self, cls_scores: Union[Tensor, Tuple[Tensor]],
- data_samples: SampleList) -> dict:
+ def loss_by_feat(self, cls_scores: torch.Tensor,
+ data_samples: SampleList) -> Dict:
"""Calculate the loss based on the features extracted by the head.
Args:
- cls_scores (Tensor): Classification prediction results of
+ cls_scores (torch.Tensor): Classification prediction results of
all class, has shape (batch_size, num_classes).
- data_samples (List[:obj:`ActionDataSample`]): The batch
+ data_samples (list[:obj:`ActionDataSample`]): The batch
data samples.
Returns:
@@ -149,32 +147,33 @@ def loss_by_feat(self, cls_scores: Union[Tensor, Tuple[Tensor]],
losses['loss_cls'] = loss_cls
return losses
- def predict(self, feats: Union[Tensor, Tuple[Tensor]],
- data_samples: SampleList, **kwargs) -> LabelList:
+ def predict(self, feats: Union[torch.Tensor, Tuple[torch.Tensor]],
+ data_samples: SampleList, **kwargs) -> SampleList:
"""Perform forward propagation of head and predict recognition results
on the features of the upstream network.
Args:
- feats (Tensor or Tuple[Tensor]): Features from upstream network.
- data_samples (List[:obj:`ActionDataSample`]): The batch
+ feats (torch.Tensor | tuple[torch.Tensor]): Features from
+ upstream network.
+ data_samples (list[:obj:`ActionDataSample`]): The batch
data samples.
Returns:
- List[:obj:`ActionDataSample`]: Recognition results wrapped
+ list[:obj:`ActionDataSample`]: Recognition results wrapped
by :obj:`ActionDataSample`.
"""
cls_scores = self(feats, **kwargs)
return self.predict_by_feat(cls_scores, data_samples)
- def predict_by_feat(self, cls_scores: Tensor,
- data_samples: SampleList) -> LabelList:
+ def predict_by_feat(self, cls_scores: torch.Tensor,
+ data_samples: SampleList) -> SampleList:
"""Transform a batch of output features extracted from the head into
prediction results.
Args:
- cls_scores (Tensor): Classification scores, has a shape
- (num_classes, )
- data_samples (List[:obj:`ActionDataSample`]): The
+ cls_scores (torch.Tensor): Classification scores, has a shape
+ (B*num_segs, num_classes)
+ data_samples (list[:obj:`ActionDataSample`]): The
annotation data of every samples. It usually includes
information such as `gt_labels`.
@@ -186,15 +185,17 @@ def predict_by_feat(self, cls_scores: Tensor,
cls_scores = self.average_clip(cls_scores, num_segs=num_segs)
pred_labels = cls_scores.argmax(dim=-1, keepdim=True).detach()
- for data_sample, score, pred_lable in zip(data_samples, cls_scores,
+ for data_sample, score, pred_label in zip(data_samples, cls_scores,
pred_labels):
prediction = LabelData(item=score)
- pred_label = LabelData(item=pred_lable)
+ pred_label = LabelData(item=pred_label)
data_sample.pred_scores = prediction
data_sample.pred_labels = pred_label
return data_samples
- def average_clip(self, cls_scores: Tensor, num_segs: int = 1) -> Tensor:
+ def average_clip(self,
+ cls_scores: torch.Tensor,
+ num_segs: int = 1) -> torch.Tensor:
"""Averaging class scores over multiple clips.
Using different averaging types ('score' or 'prob' or None,
@@ -202,11 +203,11 @@ def average_clip(self, cls_scores: Tensor, num_segs: int = 1) -> Tensor:
class score. Only called in test mode.
Args:
- cls_scores (Tensor): Class scores to be averaged.
+ cls_scores (torch.Tensor): Class scores to be averaged.
num_segs (int): Number of clips for each input sample.
Returns:
- Tensor: Averaged class scores.
+ torch.Tensor: Averaged class scores.
"""
if self.average_clips not in ['score', 'prob', None]:
diff --git a/mmaction/models/heads/rgbpose_head.py b/mmaction/models/heads/rgbpose_head.py
new file mode 100644
index 0000000000..69da4efed9
--- /dev/null
+++ b/mmaction/models/heads/rgbpose_head.py
@@ -0,0 +1,240 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.model.weight_init import normal_init
+from mmengine.structures import LabelData
+
+from mmaction.evaluation import top_k_accuracy
+from mmaction.registry import MODELS
+from mmaction.utils import SampleList
+from .base import BaseHead
+
+
+@MODELS.register_module()
+class RGBPoseHead(BaseHead):
+ """The classification head for RGBPoseConv3D.
+
+ Args:
+ num_classes (int): Number of classes to be classified.
+ in_channels (tuple[int]): Number of channels in input feature.
+ loss_cls (dict): Config for building loss.
+ Defaults to ``dict(type='CrossEntropyLoss')``.
+ loss_components (list[str]): The components of the loss.
+ Defaults to ``['rgb', 'pose']``.
+ loss_weights (float or tuple[float]): The weights of the losses.
+ Defaults to 1.
+        dropout (float): Probability of the dropout layer. Defaults to 0.5.
+        init_std (float): Std value for initialization. Defaults to 0.01.
+ """
+
+ def __init__(self,
+ num_classes: int,
+ in_channels: Tuple[int],
+ loss_cls: Dict = dict(type='CrossEntropyLoss'),
+ loss_components: List[str] = ['rgb', 'pose'],
+ loss_weights: Union[float, Tuple[float]] = 1.,
+ dropout: float = 0.5,
+ init_std: float = 0.01,
+ **kwargs) -> None:
+ super().__init__(num_classes, in_channels, loss_cls, **kwargs)
+ if isinstance(dropout, float):
+ dropout = {'rgb': dropout, 'pose': dropout}
+ assert isinstance(dropout, dict)
+
+ if loss_components is not None:
+ self.loss_components = loss_components
+ if isinstance(loss_weights, float):
+ loss_weights = [loss_weights] * len(loss_components)
+ assert len(loss_weights) == len(loss_components)
+ self.loss_weights = loss_weights
+
+ self.dropout = dropout
+ self.init_std = init_std
+
+ self.dropout_rgb = nn.Dropout(p=self.dropout['rgb'])
+ self.dropout_pose = nn.Dropout(p=self.dropout['pose'])
+
+ self.fc_rgb = nn.Linear(self.in_channels[0], num_classes)
+ self.fc_pose = nn.Linear(self.in_channels[1], num_classes)
+ self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
+
+ def init_weights(self) -> None:
+ """Initiate the parameters from scratch."""
+ normal_init(self.fc_rgb, std=self.init_std)
+ normal_init(self.fc_pose, std=self.init_std)
+
+ def forward(self, x: Tuple[torch.Tensor]) -> Dict:
+ """Defines the computation performed at every call."""
+ x_rgb, x_pose = self.avg_pool(x[0]), self.avg_pool(x[1])
+ x_rgb = x_rgb.view(x_rgb.size(0), -1)
+ x_pose = x_pose.view(x_pose.size(0), -1)
+
+ x_rgb = self.dropout_rgb(x_rgb)
+ x_pose = self.dropout_pose(x_pose)
+
+ cls_scores = dict()
+ cls_scores['rgb'] = self.fc_rgb(x_rgb)
+ cls_scores['pose'] = self.fc_pose(x_pose)
+
+ return cls_scores
+
+ def loss(self, feats: Tuple[torch.Tensor], data_samples: SampleList,
+ **kwargs) -> Dict:
+ """Perform forward propagation of head and loss calculation on the
+ features of the upstream network.
+
+ Args:
+ feats (tuple[torch.Tensor]): Features from upstream network.
+ data_samples (list[:obj:`ActionDataSample`]): The batch
+ data samples.
+
+ Returns:
+ dict: A dictionary of loss components.
+ """
+ cls_scores = self(feats, **kwargs)
+ return self.loss_by_feat(cls_scores, data_samples)
+
+ def loss_by_feat(self, cls_scores: Dict[str, torch.Tensor],
+ data_samples: SampleList) -> Dict:
+ """Calculate the loss based on the features extracted by the head.
+
+ Args:
+ cls_scores (dict[str, torch.Tensor]): The dict of
+            classification scores.
+ data_samples (list[:obj:`ActionDataSample`]): The batch
+ data samples.
+
+ Returns:
+ dict: A dictionary of loss components.
+ """
+ labels = torch.stack([x.gt_labels.item for x in data_samples])
+ labels = labels.squeeze()
+
+ if labels.shape == torch.Size([]):
+ labels = labels.unsqueeze(0)
+ elif labels.dim() == 1 and labels.size()[0] == self.num_classes \
+ and cls_scores.size()[0] == 1:
+ # Fix a bug when training with soft labels and batch size is 1.
+ # When using soft labels, `labels` and `cls_score` share the same
+ # shape.
+ labels = labels.unsqueeze(0)
+
+ losses = dict()
+ for loss_name, weight in zip(self.loss_components, self.loss_weights):
+ cls_score = cls_scores[loss_name]
+ loss_cls = self.loss_by_scores(cls_score, labels)
+ loss_cls = {loss_name + '_' + k: v for k, v in loss_cls.items()}
+ loss_cls[f'{loss_name}_loss_cls'] *= weight
+ losses.update(loss_cls)
+ return losses
+
+ def loss_by_scores(self, cls_scores: torch.Tensor,
+ labels: torch.Tensor) -> Dict:
+ """Calculate the loss based on the features extracted by the head.
+
+ Args:
+ cls_scores (torch.Tensor): Classification prediction
+ results of all class, has shape (batch_size, num_classes).
+ labels (torch.Tensor): The labels used to calculate the loss.
+
+ Returns:
+ dict: A dictionary of loss components.
+ """
+ losses = dict()
+ if cls_scores.size() != labels.size():
+ top_k_acc = top_k_accuracy(cls_scores.detach().cpu().numpy(),
+ labels.detach().cpu().numpy(),
+ self.topk)
+ for k, a in zip(self.topk, top_k_acc):
+ losses[f'top{k}_acc'] = torch.tensor(
+ a, device=cls_scores.device)
+ if self.label_smooth_eps != 0:
+ if cls_scores.size() != labels.size():
+ labels = F.one_hot(labels, num_classes=self.num_classes)
+ labels = ((1 - self.label_smooth_eps) * labels +
+ self.label_smooth_eps / self.num_classes)
+
+ loss_cls = self.loss_cls(cls_scores, labels)
+ # loss_cls may be dictionary or single tensor
+ if isinstance(loss_cls, dict):
+ losses.update(loss_cls)
+ else:
+ losses['loss_cls'] = loss_cls
+ return losses
+
+ def predict(self, feats: Tuple[torch.Tensor], data_samples: SampleList,
+ **kwargs) -> SampleList:
+ """Perform forward propagation of head and predict recognition results
+ on the features of the upstream network.
+
+ Args:
+ feats (tuple[torch.Tensor]): Features from upstream network.
+ data_samples (list[:obj:`ActionDataSample`]): The batch
+ data samples.
+
+ Returns:
+ list[:obj:`ActionDataSample`]: Recognition results wrapped
+ by :obj:`ActionDataSample`.
+ """
+ cls_scores = self(feats, **kwargs)
+ return self.predict_by_feat(cls_scores, data_samples)
+
+ def predict_by_feat(self, cls_scores: Dict[str, torch.Tensor],
+ data_samples: SampleList) -> SampleList:
+ """Transform a batch of output features extracted from the head into
+ prediction results.
+
+ Args:
+ cls_scores (dict[str, torch.Tensor]): The dict of
+            classification scores.
+ data_samples (list[:obj:`ActionDataSample`]): The
+            annotation data of every sample. It usually includes
+ information such as `gt_labels`.
+
+ Returns:
+ list[:obj:`ActionDataSample`]: Recognition results wrapped
+ by :obj:`ActionDataSample`.
+ """
+ pred_scores = [LabelData() for _ in range(len(data_samples))]
+ pred_labels = [LabelData() for _ in range(len(data_samples))]
+
+ for name in self.loss_components:
+ cls_score = cls_scores[name]
+ cls_score, pred_label = \
+ self.predict_by_scores(cls_score, data_samples)
+ for pred_score, pred_label, score, label in zip(
+ pred_scores, pred_labels, cls_score, pred_label):
+ pred_score.set_data({f'{name}': score})
+ pred_label.set_data({f'{name}': label})
+
+ for data_sample, pred_score, pred_label in zip(data_samples,
+ pred_scores,
+ pred_labels):
+ data_sample.pred_scores = pred_score
+ data_sample.pred_labels = pred_label
+
+ return data_samples
+
+ def predict_by_scores(self, cls_scores: torch.Tensor,
+ data_samples: SampleList) -> Tuple:
+ """Transform a batch of output features extracted from the head into
+ prediction results.
+
+ Args:
+ cls_scores (torch.Tensor): Classification scores, has a shape
+ (B*num_segs, num_classes)
+ data_samples (list[:obj:`ActionDataSample`]): The annotation
+            data of every sample.
+
+ Returns:
+ tuple: A tuple of the averaged classification scores and
+ prediction labels.
+ """
+
+ num_segs = cls_scores.shape[0] // len(data_samples)
+ cls_scores = self.average_clip(cls_scores, num_segs=num_segs)
+ pred_labels = cls_scores.argmax(dim=-1, keepdim=True).detach()
+ return cls_scores, pred_labels
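
For reference, the per-modality scoring and weighted loss combination that `RGBPoseHead` implements can be sketched in plain PyTorch (no MMAction2 import; the channel widths, class count and loss weights below are illustrative, not taken from any config):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

batch, num_classes = 4, 60
in_channels = (2048, 512)                 # (rgb, pose) widths, illustrative
loss_weights = {'rgb': 1.0, 'pose': 1.0}

# One linear classifier per modality, mirroring fc_rgb / fc_pose.
fc = {m: nn.Linear(c, num_classes) for m, c in zip(('rgb', 'pose'), in_channels)}
# Pooled, flattened backbone features and hard labels.
feats = {m: torch.randn(batch, c) for m, c in zip(('rgb', 'pose'), in_channels)}
labels = torch.randint(0, num_classes, (batch, ))

cls_scores = {m: fc[m](x) for m, x in feats.items()}

# loss_by_feat: one weighted cross-entropy term per modality, collected into a
# single dict with prefixed keys ('rgb_loss_cls', 'pose_loss_cls').
losses = {}
for name, weight in loss_weights.items():
    losses[f'{name}_loss_cls'] = F.cross_entropy(cls_scores[name], labels) * weight
print({k: float(v) for k, v in losses.items()})
```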
diff --git a/mmaction/models/heads/uniformer_head.py b/mmaction/models/heads/uniformer_head.py
new file mode 100644
index 0000000000..e83b552b93
--- /dev/null
+++ b/mmaction/models/heads/uniformer_head.py
@@ -0,0 +1,98 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+from mmengine.fileio import load
+from mmengine.logging import MMLogger
+from mmengine.runner.checkpoint import _load_checkpoint_with_prefix
+from torch import Tensor, nn
+
+from mmaction.registry import MODELS
+from mmaction.utils import ConfigType
+from .base import BaseHead
+
+
+@MODELS.register_module()
+class UniFormerHead(BaseHead):
+    """Classification head for UniFormer. Supports loading a pretrained
+    Kinetics-710 checkpoint for fine-tuning on other Kinetics datasets.
+
+    A PyTorch implementation of: `UniFormerV2: Spatiotemporal
+ Learning by Arming Image ViTs with Video UniFormer
+ `
+
+ Args:
+ num_classes (int): Number of classes to be classified.
+ in_channels (int): Number of channels in input feature.
+ loss_cls (dict or ConfigDict): Config for building loss.
+ Defaults to `dict(type='CrossEntropyLoss')`.
+ dropout_ratio (float): Probability of dropout layer.
+            Defaults to 0.0.
+        channel_map (str, optional): Channel map file for selecting
+            channels from a pretrained head with extra channels.
+ Defaults to None.
+ init_cfg (dict or ConfigDict, optional): Config to control the
+ initialization. Defaults to
+            ``dict(type='TruncNormal', layer='Linear', std=0.02)``.
+ kwargs (dict, optional): Any keyword argument to be used to initialize
+ the head.
+ """
+
+ def __init__(self,
+ num_classes: int,
+ in_channels: int,
+ loss_cls: ConfigType = dict(type='CrossEntropyLoss'),
+ dropout_ratio: float = 0.0,
+ channel_map: Optional[str] = None,
+ init_cfg: Optional[dict] = dict(
+ type='TruncNormal', layer='Linear', std=0.02),
+ **kwargs) -> None:
+ super().__init__(
+ num_classes, in_channels, loss_cls, init_cfg=init_cfg, **kwargs)
+ self.channel_map = channel_map
+ self.dropout_ratio = dropout_ratio
+
+ if self.dropout_ratio != 0:
+ self.dropout = nn.Dropout(p=self.dropout_ratio)
+ else:
+ self.dropout = None
+ self.fc_cls = nn.Linear(self.in_channels, self.num_classes)
+
+    def _select_channels(self, state_dict):
+        # Keep only the classifier channels listed in the channel map file.
+        selected_channels = load(self.channel_map)
+        for key in state_dict:
+            state_dict[key] = state_dict[key][selected_channels]
+
+ def init_weights(self) -> None:
+ """Initiate the parameters from scratch."""
+ if self.init_cfg['type'] == 'Pretrained':
+ assert self.channel_map is not None, \
+            'Loading cls_head weights requires specifying the channel map file'
+ logger = MMLogger.get_current_instance()
+ pretrained = self.init_cfg['checkpoint']
+ logger.info(f'load pretrained model from {pretrained}')
+ state_dict = _load_checkpoint_with_prefix(
+ 'cls_head.', pretrained, map_location='cpu')
+ self._select_channels(state_dict)
+ msg = self.load_state_dict(state_dict, strict=False)
+ logger.info(msg)
+ else:
+ super().init_weights()
+
+ def forward(self, x: Tensor, **kwargs) -> Tensor:
+ """Defines the computation performed at every call.
+
+ Args:
+ x (Tensor): The input data.
+
+ Returns:
+ Tensor: The classification scores for input samples.
+ """
+ # [N, in_channels]
+ if self.dropout is not None:
+ x = self.dropout(x)
+ # [N, in_channels]
+ cls_score = self.fc_cls(x)
+ # [N, num_classes]
+ return cls_score
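
The channel-map mechanism above boils down to row-indexing a pretrained classifier's parameters. A standalone sketch (the 710/400 class counts, the key names and the random index list are illustrative, not read from a real checkpoint or map file):

```python
import torch

pretrained_classes, target_classes, in_channels = 710, 400, 768
state_dict = {
    'fc_cls.weight': torch.randn(pretrained_classes, in_channels),
    'fc_cls.bias': torch.randn(pretrained_classes),
}
# In the head, this index list would come from `load(self.channel_map)`.
selected_channels = torch.randperm(pretrained_classes)[:target_classes].tolist()

for key in state_dict:
    # Indexing dim 0 keeps one row (weight) / one entry (bias) per kept class.
    state_dict[key] = state_dict[key][selected_channels]

print(state_dict['fc_cls.weight'].shape)  # torch.Size([400, 768])
print(state_dict['fc_cls.bias'].shape)    # torch.Size([400])
```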
diff --git a/mmaction/models/recognizers/__init__.py b/mmaction/models/recognizers/__init__.py
index 1b7db21451..447f6333dc 100644
--- a/mmaction/models/recognizers/__init__.py
+++ b/mmaction/models/recognizers/__init__.py
@@ -2,11 +2,12 @@
from .base import BaseRecognizer
from .recognizer2d import Recognizer2D
from .recognizer3d import Recognizer3D
+from .recognizer3d_mm import MMRecognizer3D
from .recognizer_audio import RecognizerAudio
from .recognizer_gcn import RecognizerGCN
from .recognizer_omni import RecognizerOmni
__all__ = [
'BaseRecognizer', 'RecognizerGCN', 'Recognizer2D', 'Recognizer3D',
- 'RecognizerAudio', 'RecognizerOmni'
+ 'RecognizerAudio', 'RecognizerOmni', 'MMRecognizer3D'
]
diff --git a/mmaction/models/recognizers/recognizer3d_mm.py b/mmaction/models/recognizers/recognizer3d_mm.py
new file mode 100644
index 0000000000..1d7099b3c3
--- /dev/null
+++ b/mmaction/models/recognizers/recognizer3d_mm.py
@@ -0,0 +1,50 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Tuple
+
+import torch
+
+from mmaction.registry import MODELS
+from mmaction.utils.typing import OptSampleList
+from .base import BaseRecognizer
+
+
+@MODELS.register_module()
+class MMRecognizer3D(BaseRecognizer):
+ """Multi-modal 3D recognizer model framework."""
+
+ def extract_feat(self,
+ inputs: Dict[str, torch.Tensor],
+ stage: str = 'backbone',
+ data_samples: OptSampleList = None,
+ test_mode: bool = False) -> Tuple:
+ """Extract features.
+
+ Args:
+ inputs (dict[str, torch.Tensor]): The multi-modal input data.
+ stage (str): Which stage to output the feature.
+ Defaults to ``'backbone'``.
+ data_samples (list[:obj:`ActionDataSample`], optional): Action data
+ samples, which are only needed in training. Defaults to None.
+ test_mode (bool): Whether in test mode. Defaults to False.
+
+ Returns:
+ tuple[torch.Tensor]: The extracted features.
+ dict: A dict recording the kwargs for downstream
+ pipeline.
+ """
+ # [N, num_views, C, T, H, W] ->
+ # [N * num_views, C, T, H, W]
+ for m, m_data in inputs.items():
+ m_data = m_data.reshape((-1, ) + m_data.shape[2:])
+ inputs[m] = m_data
+
+ # Record the kwargs required by `loss` and `predict`
+ loss_predict_kwargs = dict()
+
+ x = self.backbone(**inputs)
+ if stage == 'backbone':
+ return x, loss_predict_kwargs
+
+ if self.with_cls_head and stage == 'head':
+ x = self.cls_head(x, **loss_predict_kwargs)
+ return x, loss_predict_kwargs
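
The reshaping that `extract_feat` applies to the multi-modal input dict can be checked in isolation; the modality keys and tensor sizes below are illustrative:

```python
import torch

N, num_views = 2, 3
inputs = {
    'imgs': torch.randn(N, num_views, 3, 8, 32, 32),            # RGB clips
    'heatmap_imgs': torch.randn(N, num_views, 17, 8, 16, 16),   # pose heatmaps
}

for modality, data in inputs.items():
    # [N, num_views, C, T, H, W] -> [N * num_views, C, T, H, W]
    inputs[modality] = data.reshape((-1, ) + data.shape[2:])

print({k: tuple(v.shape) for k, v in inputs.items()})
# {'imgs': (6, 3, 8, 32, 32), 'heatmap_imgs': (6, 17, 8, 16, 16)}
```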
diff --git a/mmaction/models/roi_heads/shared_heads/lfb.py b/mmaction/models/roi_heads/shared_heads/lfb.py
index e8e7afff2a..986c784403 100644
--- a/mmaction/models/roi_heads/shared_heads/lfb.py
+++ b/mmaction/models/roi_heads/shared_heads/lfb.py
@@ -4,7 +4,6 @@
import os.path as osp
import warnings
-import numpy as np
import torch
import torch.distributed as dist
from mmengine.dist import get_dist_info
@@ -130,6 +129,13 @@ def load_lfb(self, map_location):
osp.join(self.lfb_prefix_path, f'lfb_{dataset_mode}.pkl'))
print(f'Loading LFB from {lfb_path}...')
self.lfb.update(torch.load(lfb_path, map_location=map_location))
+
+ for video_id in self.lfb:
+ video_features = self.lfb[video_id]
+ for sec in video_features:
+ if isinstance(video_features[sec], (list, tuple)):
+ video_features[sec] = torch.stack(video_features[sec])
+ self.lfb[video_id] = video_features
print(f'LFB has been loaded on {map_location}.')
def load_lfb_on_lmdb(self):
@@ -162,22 +168,20 @@ def sample_long_term_features(self, video_id, timestamp):
# Sample long term features.
window_size, K = self.window_size, self.max_num_sampled_feat
start = timestamp - (window_size // 2)
- lt_feats = torch.zeros(window_size * K, self.lfb_channels)
+ lt_feats = torch.zeros(window_size, K, self.lfb_channels)
for idx, sec in enumerate(range(start, start + window_size)):
if sec in video_features:
# `num_feat` is the number of roi features in this second.
- num_feat = len(video_features[sec])
- num_feat_sampled = min(num_feat, K)
- # Sample some roi features randomly.
- random_lfb_indices = np.random.choice(
- range(num_feat), num_feat_sampled, replace=False)
+ feat = video_features[sec]
+ num_feat = feat.shape[0]
- for k, rand_idx in enumerate(random_lfb_indices):
- lt_feats[idx * K + k] = video_features[sec][rand_idx]
+ # Sample some roi features randomly.
+ random_lfb_indices = torch.randperm(num_feat)[:K]
+ lt_feats[idx, :num_feat] = feat[random_lfb_indices]
# [window_size * max_num_sampled_feat, lfb_channels]
- return lt_feats
+ return lt_feats.reshape(-1, self.lfb_channels)
def __getitem__(self, img_key):
"""Sample long term features like `lfb['0f39OWEqJ24,0902']` where `lfb`
diff --git a/mmaction/models/utils/blending_utils.py b/mmaction/models/utils/blending_utils.py
index 64808d32f7..babea75d05 100644
--- a/mmaction/models/utils/blending_utils.py
+++ b/mmaction/models/utils/blending_utils.py
@@ -1,11 +1,10 @@
# Copyright (c) OpenMMLab. All rights reserved.
from abc import ABCMeta, abstractmethod
-from typing import Union
+from typing import List, Optional, Tuple, Union
import numpy as np
import torch
import torch.nn.functional as F
-from torch import Tensor
from torch.distributions.beta import Beta
from mmaction.registry import MODELS
@@ -25,38 +24,39 @@ def __init__(self, num_classes: int) -> None:
self.num_classes = num_classes
@abstractmethod
- def do_blending(self, imgs: Tensor, label: Tensor, **kwargs) -> tuple:
+ def do_blending(self, imgs: torch.Tensor, label: torch.Tensor,
+ **kwargs) -> Tuple:
"""Blending images process."""
raise NotImplementedError
- def __call__(self, imgs: Tensor, batch_data_samples: SampleList,
- **kwargs) -> tuple:
+ def __call__(self, imgs: torch.Tensor, batch_data_samples: SampleList,
+ **kwargs) -> Tuple:
"""Blending data in a mini-batch.
Images are float tensors with the shape of (B, N, C, H, W) for 2D
recognizers or (B, N, C, T, H, W) for 3D recognizers.
Besides, labels are converted from hard labels to soft labels.
- Hard labels are integer tensors with the shape of (B, 1) and all of the
+ Hard labels are integer tensors with the shape of (B, ) and all of the
elements are in the range [0, num_classes - 1].
- Soft labels (probablity distribution over classes) are float tensors
- with the shape of (B, 1, num_classes) and all of the elements are in
+ Soft labels (probability distribution over classes) are float tensors
+ with the shape of (B, num_classes) and all of the elements are in
the range [0, 1].
Args:
- imgs (Tensor): Model input images, float tensor with the
+ imgs (torch.Tensor): Model input images, float tensor with the
shape of (B, N, C, H, W) or (B, N, C, T, H, W).
batch_data_samples (List[:obj:`ActionDataSample`]): The batch
data samples. It usually includes information such
as `gt_labels`.
Returns:
- mixed_imgs (Tensor): Blending images, float tensor with the
+ mixed_imgs (torch.Tensor): Blending images, float tensor with the
same shape of the input imgs.
batch_data_samples (List[:obj:`ActionDataSample`]): The modified
batch data samples. ``gt_labels`` in each data sample are
converted from a hard label to a blended soft label, float
- tensor with the shape of (1, num_classes) and all elements are
+ tensor with the shape of (num_classes, ) and all elements are
in range [0, 1].
"""
label = [x.gt_labels.item for x in batch_data_samples]
@@ -90,13 +90,14 @@ def __init__(self, num_classes: int, alpha: float = .2) -> None:
super().__init__(num_classes=num_classes)
self.beta = Beta(alpha, alpha)
- def do_blending(self, imgs: Tensor, label: Tensor, **kwargs) -> tuple:
+ def do_blending(self, imgs: torch.Tensor, label: torch.Tensor,
+ **kwargs) -> Tuple:
"""Blending images with mixup.
Args:
- imgs (Tensor): Model input images, float tensor with the
+ imgs (torch.Tensor): Model input images, float tensor with the
shape of (B, N, C, H, W) or (B, N, C, T, H, W).
- label (Tensor): One hot labels, integer tensor with the shape
+ label (torch.Tensor): One hot labels, integer tensor with the shape
of (B, num_classes).
Returns:
@@ -132,7 +133,7 @@ def __init__(self, num_classes: int, alpha: float = .2) -> None:
self.beta = Beta(alpha, alpha)
@staticmethod
- def rand_bbox(img_size: torch.Size, lam: Tensor) -> tuple:
+ def rand_bbox(img_size: torch.Size, lam: torch.Tensor) -> Tuple:
"""Generate a random boudning box."""
w = img_size[-1]
h = img_size[-2]
@@ -151,13 +152,14 @@ def rand_bbox(img_size: torch.Size, lam: Tensor) -> tuple:
return bbx1, bby1, bbx2, bby2
- def do_blending(self, imgs: Tensor, label: Tensor, **kwargs) -> tuple:
+ def do_blending(self, imgs: torch.Tensor, label: torch.Tensor,
+ **kwargs) -> Tuple:
"""Blending images with cutmix.
Args:
- imgs (Tensor): Model input images, float tensor with the
+ imgs (torch.Tensor): Model input images, float tensor with the
shape of (B, N, C, H, W) or (B, N, C, T, H, W).
- label (Tensor): One hot labels, integer tensor with the shape
+ label (torch.Tensor): One hot labels, integer tensor with the shape
of (B, num_classes).
Returns:
@@ -209,7 +211,9 @@ class RandomBatchAugment(BaseMiniBatchBlending):
and to do nothing is 0.2.
"""
- def __init__(self, augments: Union[dict, list], probs=None):
+ def __init__(self,
+ augments: Union[dict, list],
+ probs: Optional[Union[float, List[float]]] = None) -> None:
if not isinstance(augments, (tuple, list)):
augments = [augments]
@@ -235,7 +239,8 @@ def __init__(self, augments: Union[dict, list], probs=None):
self.probs = probs
- def do_blending(self, imgs: Tensor, label: Tensor, **kwargs) -> tuple:
+ def do_blending(self, imgs: torch.Tensor, label: torch.Tensor,
+ **kwargs) -> Tuple:
"""Randomly apply batch augmentations to the batch inputs and batch
data samples."""
aug_index = np.random.choice(len(self.augments), p=self.probs)
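
The (B, ) hard-label to (B, num_classes) soft-label contract described in the docstring above can be illustrated with a bare-bones mixup step (plain PyTorch; the batch shape, class count and alpha are illustrative):

```python
import torch
import torch.nn.functional as F
from torch.distributions.beta import Beta

B, N, C, H, W, num_classes, alpha = 4, 8, 3, 32, 32, 10, 0.2
imgs = torch.randn(B, N, C, H, W)
hard_labels = torch.randint(0, num_classes, (B, ))      # shape (B, )
one_hot = F.one_hot(hard_labels, num_classes).float()   # shape (B, num_classes)

lam = Beta(alpha, alpha).sample()
rand_index = torch.randperm(B)
mixed_imgs = lam * imgs + (1 - lam) * imgs[rand_index]
mixed_labels = lam * one_hot + (1 - lam) * one_hot[rand_index]

print(mixed_imgs.shape, mixed_labels.shape)  # same image shape, (B, num_classes)
print(mixed_labels.sum(dim=-1))              # each soft label still sums to 1
```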
diff --git a/mmaction/registry.py b/mmaction/registry.py
index 28d237daa8..6d7d831db1 100644
--- a/mmaction/registry.py
+++ b/mmaction/registry.py
@@ -9,6 +9,7 @@
from mmengine.registry import DATA_SAMPLERS as MMENGINE_DATA_SAMPLERS
from mmengine.registry import DATASETS as MMENGINE_DATASETS
from mmengine.registry import EVALUATOR as MMENGINE_EVALUATOR
+from mmengine.registry import FUNCTIONS as MMENGINE_FUNCTION
from mmengine.registry import HOOKS as MMENGINE_HOOKS
from mmengine.registry import INFERENCERS as MMENGINE_INFERENCERS
from mmengine.registry import LOG_PROCESSORS as MMENGINE_LOG_PROCESSORS
@@ -127,3 +128,7 @@
'inferencer',
parent=MMENGINE_INFERENCERS,
locations=['mmaction.apis.inferencers'])
+
+# manage function
+FUNCTION = Registry(
+ 'function', parent=MMENGINE_FUNCTION, locations=['mmaction.mmengine'])
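
What the new `FUNCTION` registry enables, sketched with a local `mmengine` registry rather than the one added above (the registered callable and its name are made up for illustration):

```python
from mmengine.registry import Registry

FUNCTION = Registry('function')

@FUNCTION.register_module()
def my_collate(batch):
    """A stand-in collate-style callable registered by name."""
    return batch

collate_fn = FUNCTION.get('my_collate')
print(collate_fn([1, 2, 3]))  # [1, 2, 3]
```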
diff --git a/mmaction/structures/action_data_sample.py b/mmaction/structures/action_data_sample.py
index c75f6654a1..196b080136 100644
--- a/mmaction/structures/action_data_sample.py
+++ b/mmaction/structures/action_data_sample.py
@@ -1,25 +1,105 @@
# Copyright (c) OpenMMLab. All rights reserved.
-from typing import Union
+from numbers import Number
+from typing import Sequence, Union
import numpy as np
import torch
from mmengine.structures import BaseDataElement, InstanceData, LabelData
+from mmengine.utils import is_str
+
+
+def format_label(value: Union[torch.Tensor, np.ndarray, Sequence,
+ int]) -> torch.Tensor:
+ """Convert various python types to label-format tensor.
+
+ Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
+ :class:`Sequence`, :class:`int`.
+
+ Args:
+ value (torch.Tensor | numpy.ndarray | Sequence | int): Label value.
+
+ Returns:
+        :obj:`torch.Tensor`: The formatted label tensor.
+ """
+
+ # Handle single number
+ if isinstance(value, (torch.Tensor, np.ndarray)) and value.ndim == 0:
+ value = int(value.item())
+
+ if isinstance(value, np.ndarray):
+ value = torch.from_numpy(value).to(torch.long)
+ elif isinstance(value, Sequence) and not is_str(value):
+ value = torch.tensor(value).to(torch.long)
+ elif isinstance(value, int):
+ value = torch.LongTensor([value])
+ elif not isinstance(value, torch.Tensor):
+ raise TypeError(f'Type {type(value)} is not an available label type.')
+ assert value.ndim == 1, \
+ f'The dims of value should be 1, but got {value.ndim}.'
+
+ return value
+
+
+def format_score(value: Union[torch.Tensor, np.ndarray,
+ Sequence]) -> torch.Tensor:
+ """Convert various python types to score-format tensor.
+
+ Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
+ :class:`Sequence`.
+
+ Args:
+ value (torch.Tensor | numpy.ndarray | Sequence): Score values.
+
+ Returns:
+        :obj:`torch.Tensor`: The formatted score tensor.
+ """
+
+ if isinstance(value, np.ndarray):
+ value = torch.from_numpy(value).float()
+ elif isinstance(value, Sequence) and not is_str(value):
+ value = torch.tensor(value).float()
+ elif not isinstance(value, torch.Tensor):
+        raise TypeError(f'Type {type(value)} is not an available score type.')
+ assert value.ndim == 1, \
+ f'The dims of value should be 1, but got {value.ndim}.'
+
+ return value
class ActionDataSample(BaseDataElement):
- def set_gt_labels(self, value: Union[int,
- np.ndarray]) -> 'ActionDataSample':
+ def set_gt_labels(
+ self, value: Union[np.ndarray, torch.Tensor, Sequence[Number], Number]
+ ) -> 'ActionDataSample':
"""Set label of ``gt_labels``."""
- if isinstance(value, int):
- value = torch.LongTensor([value])
- elif isinstance(value, np.ndarray):
- value = torch.from_numpy(value)
- else:
- raise TypeError(f'Type {type(value)} is not an '
- f'available label type.')
+ label_data = getattr(self, '_gt_label', LabelData())
+ label_data.item = format_label(value)
+ self.gt_labels = label_data
+ return self
- self.gt_labels = LabelData(item=value)
+ def set_pred_label(
+ self, value: Union[np.ndarray, torch.Tensor, Sequence[Number], Number]
+ ) -> 'ActionDataSample':
+ """Set label of ``pred_label``."""
+ label_data = getattr(self, '_pred_label', LabelData())
+ label_data.item = format_label(value)
+ self.pred_labels = label_data
+ return self
+
+ def set_pred_score(self, value: torch.Tensor) -> 'ActionDataSample':
+ """Set score of ``pred_label``."""
+ label_data = getattr(self, '_pred_label', LabelData())
+ label_data.item = format_score(value)
+ if hasattr(self, 'num_classes'):
+ assert len(label_data.item) == self.num_classes, \
+ f'The length of score {len(label_data.item)} should be '\
+ f'equal to the num_classes {self.num_classes}.'
+ else:
+ self.set_field(
+ name='num_classes',
+ value=len(label_data.item),
+ field_type='metainfo')
+ self.pred_scores = label_data
return self
@property
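
A short usage sketch of the new setters, assuming this patched version of MMAction2 is importable; the label values and score length are arbitrary:

```python
import torch
from mmaction.structures import ActionDataSample

sample = ActionDataSample()
sample.set_gt_labels(3)                              # int -> tensor([3])
sample.set_pred_score(torch.rand(5).softmax(dim=0))  # 1-D score vector
sample.set_pred_label(2)

print(sample.gt_labels.item)          # tensor([3])
print(sample.pred_scores.item.shape)  # torch.Size([5])
print(sample.num_classes)             # 5, recorded as metainfo by set_pred_score
```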
diff --git a/mmaction/utils/misc.py b/mmaction/utils/misc.py
index 3c34df3f68..bf4358a2f4 100644
--- a/mmaction/utils/misc.py
+++ b/mmaction/utils/misc.py
@@ -4,6 +4,7 @@
import os.path as osp
import random
import string
+from typing import Optional
import cv2
import mmcv
@@ -33,18 +34,23 @@ def get_shm_dir() -> str:
return '/dev/shm'
-def frame_extract(video_path: str, short_side: int):
+def frame_extract(video_path: str,
+ short_side: Optional[int] = None,
+ out_dir: str = './tmp'):
"""Extract frames given video_path.
Args:
video_path (str): The video path.
- short_side (int): The short-side of the image.
+        short_side (int, optional): Target short side of the output image.
+            Defaults to None, which means keeping the original shape.
+ out_dir (str): The output directory. Defaults to ``'./tmp'``.
"""
- # Load the video, extract frames into ./tmp/video_name
- target_dir = osp.join('./tmp', osp.basename(osp.splitext(video_path)[0]))
+ # Load the video, extract frames into OUT_DIR/video_name
+ target_dir = osp.join(out_dir, osp.basename(osp.splitext(video_path)[0]))
os.makedirs(target_dir, exist_ok=True)
# Should be able to handle videos up to several hours
frame_tmpl = osp.join(target_dir, 'img_{:06d}.jpg')
+    assert osp.exists(video_path), f'File not found: {video_path}'
vid = cv2.VideoCapture(video_path)
frames = []
frame_paths = []
@@ -52,11 +58,11 @@ def frame_extract(video_path: str, short_side: int):
cnt = 0
new_h, new_w = None, None
while flag:
- if new_h is None:
- h, w, _ = frame.shape
- new_w, new_h = mmcv.rescale_size((w, h), (short_side, np.Inf))
-
- frame = mmcv.imresize(frame, (new_w, new_h))
+ if short_side is not None:
+ if new_h is None:
+ h, w, _ = frame.shape
+ new_w, new_h = mmcv.rescale_size((w, h), (short_side, np.Inf))
+ frame = mmcv.imresize(frame, (new_w, new_h))
frames.append(frame)
frame_path = frame_tmpl.format(cnt + 1)
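
Usage of the updated signature, assuming the function is exported from `mmaction.utils` as before; the video path is a placeholder:

```python
from mmaction.utils import frame_extract

# Keep the original resolution; frames are written under ./tmp/<video_name>/.
frame_extract('path/to/video.mp4')

# Rescale so the short side is 480 px and write frames to a custom directory.
frame_extract('path/to/video.mp4', short_side=480, out_dir='./my_frames')
```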
diff --git a/mmaction/version.py b/mmaction/version.py
index 5a0a756926..76d189b4d2 100644
--- a/mmaction/version.py
+++ b/mmaction/version.py
@@ -1,6 +1,6 @@
# Copyright (c) Open-MMLab. All rights reserved.
-__version__ = '1.0.0rc3'
+__version__ = '1.0.0'
def parse_version_info(version_str: str):
diff --git a/mmaction/visualization/action_visualizer.py b/mmaction/visualization/action_visualizer.py
index 48c595fd5b..6fc5ae2123 100644
--- a/mmaction/visualization/action_visualizer.py
+++ b/mmaction/visualization/action_visualizer.py
@@ -268,7 +268,10 @@ def add_datasample(self,
wait_time = frame_wait_time
else:
wait_time = wait_time
- self.show(drawn_img, win_name=frame_name, wait_time=wait_time)
+ self.show(
+ drawn_img[:, :, ::-1],
+ win_name=frame_name,
+ wait_time=wait_time)
resulted_video = np.array(resulted_video)
if out_path is not None:
diff --git a/model-index.yml b/model-index.yml
index a41addf98d..ebf462e3f9 100644
--- a/model-index.yml
+++ b/model-index.yml
@@ -15,7 +15,8 @@ Import:
- configs/recognition/trn/metafile.yml
- configs/recognition/swin/metafile.yml
- configs/recognition/c2d/metafile.yml
-- configs/detection/ava/metafile.yml
+- configs/detection/slowfast/metafile.yml
+- configs/detection/slowonly/metafile.yml
- configs/detection/acrn/metafile.yml
- configs/skeleton/stgcn/metafile.yml
- configs/skeleton/2s-agcn/metafile.yml
diff --git a/projects/README.md b/projects/README.md
new file mode 100644
index 0000000000..7e12abee97
--- /dev/null
+++ b/projects/README.md
@@ -0,0 +1,17 @@
+# Welcome to Projects of MMAction2
+
+In this folder, we welcome all contributions of deep-learning video understanding models from the community.
+
+Here, the requirements, e.g., code standards, are not as strict as in the core package. Thus, developers from the community can implement their algorithms much more easily and efficiently in MMAction2. We appreciate all contributions from the community that make MMAction2 greater.
+
+Here is an [example project](./example_project) about how to add your algorithms easily.
+
+We also provide some documentation listed below:
+
+- [Contribution Guide](https://mmaction2.readthedocs.io/en/dev-1.x/notes/contribution_guide.html)
+
+  The guide for new contributors on how to add your project to MMAction2.
+
+- [Discussions](https://github.com/open-mmlab/mmaction2/discussions)
+
+  Welcome to start a discussion!
diff --git a/projects/ctrgcn/README.md b/projects/ctrgcn/README.md
new file mode 100644
index 0000000000..b62bee8d86
--- /dev/null
+++ b/projects/ctrgcn/README.md
@@ -0,0 +1,143 @@
+# CTRGCN Project
+
+[Channel-wise Topology Refinement Graph Convolution for Skeleton-Based Action Recognition](https://arxiv.org/abs/2107.12213)
+
+
+
+## Abstract
+
+
+
+Graph convolutional networks (GCNs) have been widely used and achieved remarkable results in skeleton-based action recognition. In GCNs, graph topology dominates feature aggregation and therefore is the key to extracting representative features. In this work, we propose a novel Channel-wise Topology Refinement Graph Convolution (CTR-GC) to dynamically learn different topologies and effectively aggregate joint features in different channels for skeleton-based action recognition. The proposed CTR-GC models channel-wise topologies through learning a shared topology as a generic prior for all channels and refining it with channel-specific correlations for each channel. Our refinement method introduces few extra parameters and significantly reduces the difficulty of modeling channel-wise topologies. Furthermore, via reformulating graph convolutions into a unified form, we find that CTR-GC relaxes strict constraints of graph convolutions, leading to stronger representation capability. Combining CTR-GC with temporal modeling modules, we develop a powerful graph convolutional network named CTR-GCN which notably outperforms state-of-the-art methods on the NTU RGB+D, NTU RGB+D 120, and NW-UCLA datasets.
+
+
+
+
+
+
+
+## Usage
+
+### Setup Environment
+
+Please refer to [Get Started](https://mmaction2.readthedocs.io/en/latest/get_started.html) to install MMAction2.
+
+First, add the current folder to `PYTHONPATH` so that Python can find your code. Run the following command in the current directory to add it.
+
+> Please run this command every time you open a new shell.
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Data Preparation
+
+Prepare the NTU60 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/1.x/tools/data/skeleton/README.md).
+
+### Training commands
+
+**To train with single GPU:**
+
+```bash
+mim train mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py
+```
+
+**To train with multiple GPUs:**
+
+```bash
+mim train mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher pytorch --gpus 8
+```
+
+**To train with multiple GPUs by slurm:**
+
+```bash
+mim train mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher slurm \
+ --gpus 8 --gpus-per-node 8 --partition $PARTITION
+```
+
+### Testing commands
+
+**To test with single GPU:**
+
+```bash
+mim test mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT
+```
+
+**To test with multiple GPUs:**
+
+```bash
+mim test mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8
+```
+
+**To test with multiple GPUs by slurm:**
+
+```bash
+mim test mmaction configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher slurm \
+ --gpus 8 --gpus-per-node 8 --partition $PARTITION
+```
+
+## Results
+
+### NTU60_XSub_2D
+
+| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log |
+| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: |
+| uniform 100 | joint | 8 | CTRGCN | 89.6 | 10 clips | [config](./configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20230308-7aba454e.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.log) |
+
+### NTU60_XSub_3D
+
+| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log |
+| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: |
+| uniform 100 | joint | 8 | CTRGCN | 89.0 | 10 clips | [config](./configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d_20230308-950dca0a.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/ctrgcn/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.log) |
+
+## Citation
+
+
+
+```bibtex
+@inproceedings{chen2021channel,
+ title={Channel-wise topology refinement graph convolution for skeleton-based action recognition},
+ author={Chen, Yuxin and Zhang, Ziqi and Yuan, Chunfeng and Li, Bing and Deng, Ying and Hu, Weiming},
+ booktitle={CVPR},
+ pages={13359--13368},
+ year={2021}
+}
+```
+
+## Checklist
+
+Here is a checklist of this project's progress, and you can ignore this part if you don't plan to contribute to MMAction2 projects.
+
+- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+ - [x] Finish the code
+
+
+
+ - [x] Basic docstrings & proper citation
+
+
+
+ - [x] Converted checkpoint and results (Only for reproduction)
+
+
+
+- [x] Milestone 2: Indicates a successful model implementation.
+
+ - [x] Training results
+
+
+
+- [ ] Milestone 3: Good to be a part of our core package!
+
+ - [ ] Unit tests
+
+
+
+ - [ ] Code style
+
+
+
+ - [ ] `metafile.yml` and `README.md`
+
+
diff --git a/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py
new file mode 100644
index 0000000000..4dd8629837
--- /dev/null
+++ b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py
@@ -0,0 +1,104 @@
+_base_ = 'mmaction::_base_/default_runtime.py'
+
+custom_imports = dict(imports='models')
+
+model = dict(
+ type='RecognizerGCN',
+ backbone=dict(
+ type='CTRGCN', graph_cfg=dict(layout='coco', mode='spatial')),
+ cls_head=dict(type='GCNHead', num_classes=60, in_channels=256))
+
+dataset_type = 'PoseDataset'
+ann_file = 'data/skeleton/ntu60_2d.pkl'
+train_pipeline = [
+ dict(type='PreNormalize2D'),
+ dict(type='GenSkeFeat', dataset='coco', feats=['j']),
+ dict(type='UniformSampleFrames', clip_len=100),
+ dict(type='PoseDecode'),
+ dict(type='FormatGCNInput', num_person=2),
+ dict(type='PackActionInputs')
+]
+val_pipeline = [
+ dict(type='PreNormalize2D'),
+ dict(type='GenSkeFeat', dataset='coco', feats=['j']),
+ dict(
+ type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True),
+ dict(type='PoseDecode'),
+ dict(type='FormatGCNInput', num_person=2),
+ dict(type='PackActionInputs')
+]
+test_pipeline = [
+ dict(type='PreNormalize2D'),
+ dict(type='GenSkeFeat', dataset='coco', feats=['j']),
+ dict(
+ type='UniformSampleFrames', clip_len=100, num_clips=10,
+ test_mode=True),
+ dict(type='PoseDecode'),
+ dict(type='FormatGCNInput', num_person=2),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='RepeatDataset',
+ times=5,
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ pipeline=train_pipeline,
+ split='xsub_train')))
+val_dataloader = dict(
+ batch_size=16,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ pipeline=val_pipeline,
+ split='xsub_val',
+ test_mode=True))
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ pipeline=test_pipeline,
+ split='xsub_val',
+ test_mode=True))
+
+val_evaluator = [dict(type='AccMetric')]
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0,
+ T_max=16,
+ by_epoch=True,
+ convert_to_iter_based=True)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(
+ type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True))
+
+default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
diff --git a/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py
new file mode 100644
index 0000000000..7ae499b4ce
--- /dev/null
+++ b/projects/ctrgcn/configs/ctrgcn_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py
@@ -0,0 +1,104 @@
+_base_ = 'mmaction::_base_/default_runtime.py'
+
+custom_imports = dict(imports='models')
+
+model = dict(
+ type='RecognizerGCN',
+ backbone=dict(
+ type='CTRGCN', graph_cfg=dict(layout='nturgb+d', mode='spatial')),
+ cls_head=dict(type='GCNHead', num_classes=60, in_channels=256))
+
+dataset_type = 'PoseDataset'
+ann_file = 'data/skeleton/ntu60_3d.pkl'
+train_pipeline = [
+ dict(type='PreNormalize3D'),
+ dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']),
+ dict(type='UniformSampleFrames', clip_len=100),
+ dict(type='PoseDecode'),
+ dict(type='FormatGCNInput', num_person=2),
+ dict(type='PackActionInputs')
+]
+val_pipeline = [
+ dict(type='PreNormalize3D'),
+ dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']),
+ dict(
+ type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True),
+ dict(type='PoseDecode'),
+ dict(type='FormatGCNInput', num_person=2),
+ dict(type='PackActionInputs')
+]
+test_pipeline = [
+ dict(type='PreNormalize3D'),
+ dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']),
+ dict(
+ type='UniformSampleFrames', clip_len=100, num_clips=10,
+ test_mode=True),
+ dict(type='PoseDecode'),
+ dict(type='FormatGCNInput', num_person=2),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='RepeatDataset',
+ times=5,
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ pipeline=train_pipeline,
+ split='xsub_train')))
+val_dataloader = dict(
+ batch_size=16,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ pipeline=val_pipeline,
+ split='xsub_val',
+ test_mode=True))
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ pipeline=test_pipeline,
+ split='xsub_val',
+ test_mode=True))
+
+val_evaluator = [dict(type='AccMetric')]
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0,
+ T_max=16,
+ by_epoch=True,
+ convert_to_iter_based=True)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(
+ type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True))
+
+default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
diff --git a/projects/ctrgcn/models/__init__.py b/projects/ctrgcn/models/__init__.py
new file mode 100644
index 0000000000..71958fdd44
--- /dev/null
+++ b/projects/ctrgcn/models/__init__.py
@@ -0,0 +1,3 @@
+from .ctrgcn import CTRGCN
+
+__all__ = ['CTRGCN']
diff --git a/projects/ctrgcn/models/ctrgcn.py b/projects/ctrgcn/models/ctrgcn.py
new file mode 100644
index 0000000000..c6056071ea
--- /dev/null
+++ b/projects/ctrgcn/models/ctrgcn.py
@@ -0,0 +1,104 @@
+import torch
+import torch.nn as nn
+from mmengine.model import BaseModule, ModuleList
+
+from mmaction.models.utils import Graph, unit_tcn
+from mmaction.registry import MODELS
+from .ctrgcn_utils import MSTCN, unit_ctrgcn
+
+
+class CTRGCNBlock(BaseModule):
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ A,
+ stride=1,
+ residual=True,
+ kernel_size=5,
+ dilations=[1, 2],
+ tcn_dropout=0):
+ super(CTRGCNBlock, self).__init__()
+ self.gcn1 = unit_ctrgcn(in_channels, out_channels, A)
+ self.tcn1 = MSTCN(
+ out_channels,
+ out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ dilations=dilations,
+ residual=False,
+ tcn_dropout=tcn_dropout)
+ self.relu = nn.ReLU(inplace=True)
+ if not residual:
+ self.residual = lambda x: 0
+ elif (in_channels == out_channels) and (stride == 1):
+ self.residual = lambda x: x
+ else:
+ self.residual = unit_tcn(
+ in_channels, out_channels, kernel_size=1, stride=stride)
+
+ def forward(self, x):
+ y = self.relu(self.tcn1(self.gcn1(x)) + self.residual(x))
+ return y
+
+
+@MODELS.register_module()
+class CTRGCN(BaseModule):
+
+ def __init__(self,
+ graph_cfg,
+ in_channels=3,
+ base_channels=64,
+ num_stages=10,
+ inflate_stages=[5, 8],
+ down_stages=[5, 8],
+ pretrained=None,
+ num_person=2,
+ **kwargs):
+ super(CTRGCN, self).__init__()
+
+ self.graph = Graph(**graph_cfg)
+ A = torch.tensor(
+ self.graph.A, dtype=torch.float32, requires_grad=False)
+ self.register_buffer('A', A)
+
+ self.num_person = num_person
+ self.base_channels = base_channels
+
+ self.data_bn = nn.BatchNorm1d(num_person * in_channels * A.size(1))
+
+ kwargs0 = {k: v for k, v in kwargs.items() if k != 'tcn_dropout'}
+ modules = [
+ CTRGCNBlock(
+ in_channels,
+ base_channels,
+ A.clone(),
+ residual=False,
+ **kwargs0)
+ ]
+ for i in range(2, num_stages + 1):
+ in_channels = base_channels
+ out_channels = base_channels * (1 + (i in inflate_stages))
+ stride = 1 + (i in down_stages)
+ modules.append(
+ CTRGCNBlock(
+ base_channels,
+ out_channels,
+ A.clone(),
+ stride=stride,
+ **kwargs))
+ base_channels = out_channels
+ self.net = ModuleList(modules)
+
+ def forward(self, x):
+ N, M, T, V, C = x.size()
+ x = x.permute(0, 1, 3, 4, 2).contiguous()
+ x = self.data_bn(x.view(N, M * V * C, T))
+ x = x.view(N, M, V, C, T).permute(0, 1, 3, 4,
+ 2).contiguous().view(N * M, C, T, V)
+
+ for gcn in self.net:
+ x = gcn(x)
+
+ x = x.reshape((N, M) + x.shape[1:])
+ return x
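
A quick shape check for the backbone above, assuming MMAction2 is installed and this project folder is on `PYTHONPATH` so that `models` resolves to the package defined here; the batch size and expected output shape are illustrative:

```python
import torch
from models import CTRGCN

model = CTRGCN(graph_cfg=dict(layout='coco', mode='spatial'))
model.eval()

# (N, M, T, V, C): batch, persons, frames, joints (17 for the COCO layout), coords
x = torch.randn(2, 2, 100, 17, 3)
with torch.no_grad():
    out = model(x)
print(out.shape)  # (N, M, C', T', V), e.g. torch.Size([2, 2, 256, 25, 17])
```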
diff --git a/projects/ctrgcn/models/ctrgcn_utils.py b/projects/ctrgcn/models/ctrgcn_utils.py
new file mode 100644
index 0000000000..52665e8567
--- /dev/null
+++ b/projects/ctrgcn/models/ctrgcn_utils.py
@@ -0,0 +1,192 @@
+import torch
+import torch.nn as nn
+from mmcv.cnn import build_activation_layer
+from mmengine.model import BaseModule, ModuleList, Sequential
+
+from mmaction.models.utils import unit_tcn
+
+
+# ! Notice: The implementation of MSTCN in
+# MS-G3D is not the same as our implementation.
+class MSTCN(BaseModule):
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ dilations=[1, 2, 3, 4],
+ residual=True,
+ act_cfg=dict(type='ReLU'),
+ init_cfg=[
+ dict(type='Constant', layer='BatchNorm2d', val=1),
+ dict(type='Kaiming', layer='Conv2d', mode='fan_out')
+ ],
+ tcn_dropout=0):
+
+ super().__init__(init_cfg=init_cfg)
+ # Multiple branches of temporal convolution
+ self.num_branches = len(dilations) + 2
+ branch_channels = out_channels // self.num_branches
+ branch_channels_rem = out_channels - branch_channels * (
+ self.num_branches - 1)
+
+ if type(kernel_size) == list:
+ assert len(kernel_size) == len(dilations)
+ else:
+ kernel_size = [kernel_size] * len(dilations)
+
+ self.branches = ModuleList([
+ Sequential(
+ nn.Conv2d(
+ in_channels, branch_channels, kernel_size=1, padding=0),
+ nn.BatchNorm2d(branch_channels),
+ build_activation_layer(act_cfg),
+ unit_tcn(
+ branch_channels,
+ branch_channels,
+ kernel_size=ks,
+ stride=stride,
+ dilation=dilation),
+ ) for ks, dilation in zip(kernel_size, dilations)
+ ])
+
+ # Additional Max & 1x1 branch
+ self.branches.append(
+ Sequential(
+ nn.Conv2d(
+ in_channels, branch_channels, kernel_size=1, padding=0),
+ nn.BatchNorm2d(branch_channels),
+ build_activation_layer(act_cfg),
+ nn.MaxPool2d(
+ kernel_size=(3, 1), stride=(stride, 1), padding=(1, 0)),
+ nn.BatchNorm2d(branch_channels)))
+
+ self.branches.append(
+ Sequential(
+ nn.Conv2d(
+ in_channels,
+ branch_channels_rem,
+ kernel_size=1,
+ padding=0,
+ stride=(stride, 1)), nn.BatchNorm2d(branch_channels_rem)))
+
+ # Residual connection
+ if not residual:
+ self.residual = lambda x: 0
+ elif (in_channels == out_channels) and (stride == 1):
+ self.residual = lambda x: x
+ else:
+ self.residual = unit_tcn(
+ in_channels, out_channels, kernel_size=1, stride=stride)
+
+ self.act = build_activation_layer(act_cfg)
+ self.drop = nn.Dropout(tcn_dropout)
+
+ def forward(self, x):
+ # Input dim: (N,C,T,V)
+ res = self.residual(x)
+ branch_outs = []
+ for tempconv in self.branches:
+ out = tempconv(x)
+ branch_outs.append(out)
+
+ out = torch.cat(branch_outs, dim=1)
+ out += res
+ out = self.act(out)
+ out = self.drop(out)
+ return out
+
+
+class CTRGC(BaseModule):
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ rel_reduction=8,
+ init_cfg=[
+ dict(type='Constant', layer='BatchNorm2d', val=1),
+ dict(type='Kaiming', layer='Conv2d', mode='fan_out')
+ ]):
+ super(CTRGC, self).__init__(init_cfg=init_cfg)
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ if in_channels <= 16:
+ self.rel_channels = 8
+ else:
+ self.rel_channels = in_channels // rel_reduction
+ self.conv1 = nn.Conv2d(
+ self.in_channels, self.rel_channels, kernel_size=1)
+ self.conv2 = nn.Conv2d(
+ self.in_channels, self.rel_channels, kernel_size=1)
+ self.conv3 = nn.Conv2d(
+ self.in_channels, self.out_channels, kernel_size=1)
+ self.conv4 = nn.Conv2d(
+ self.rel_channels, self.out_channels, kernel_size=1)
+ self.tanh = nn.Tanh()
+
+ def forward(self, x, A=None, alpha=1):
+ # Input: N, C, T, V
+ x1, x2, x3 = self.conv1(x).mean(-2), self.conv2(x).mean(
+ -2), self.conv3(x)
+ # X1, X2: N, R, V
+ # N, R, V, 1 - N, R, 1, V
+ x1 = self.tanh(x1.unsqueeze(-1) - x2.unsqueeze(-2))
+ # N, R, V, V
+ x1 = self.conv4(x1) * alpha + (A[None, None] if A is not None else 0
+ ) # N,C,V,V
+ x1 = torch.einsum('ncuv,nctu->nctv', x1, x3)
+ return x1
+
+
+class unit_ctrgcn(BaseModule):
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ A,
+ init_cfg=[
+ dict(
+ type='Constant',
+ layer='BatchNorm2d',
+ val=1,
+ override=dict(type='Constant', name='bn', val=1e-6)),
+ dict(type='Kaiming', layer='Conv2d', mode='fan_out')
+ ]):
+
+ super(unit_ctrgcn, self).__init__(init_cfg=init_cfg)
+ inter_channels = out_channels // 4
+ self.inter_c = inter_channels
+ self.out_c = out_channels
+ self.in_c = in_channels
+
+ self.num_subset = A.shape[0]
+ self.convs = ModuleList()
+
+ for i in range(self.num_subset):
+ self.convs.append(CTRGC(in_channels, out_channels))
+
+ if in_channels != out_channels:
+ self.down = Sequential(
+ nn.Conv2d(in_channels, out_channels, 1),
+ nn.BatchNorm2d(out_channels))
+ else:
+ self.down = lambda x: x
+
+ self.A = nn.Parameter(A.clone())
+
+ self.alpha = nn.Parameter(torch.zeros(1))
+ self.bn = nn.BatchNorm2d(out_channels)
+ self.soft = nn.Softmax(-2)
+ self.relu = nn.ReLU(inplace=True)
+
+ def forward(self, x):
+ y = None
+
+ for i in range(self.num_subset):
+ z = self.convs[i](x, self.A[i], self.alpha)
+ y = z + y if y is not None else z
+
+ y = self.bn(y)
+ y += self.down(x)
+ return self.relu(y)
diff --git a/projects/example_project/README.md b/projects/example_project/README.md
new file mode 100644
index 0000000000..30cadfb8ed
--- /dev/null
+++ b/projects/example_project/README.md
@@ -0,0 +1,122 @@
+# Example Project
+
+This is an example README for community `projects/`. You can write your README in your own project. Here are
+some recommended parts of a README for others to understand and use your project; you can copy or modify them
+as needed.
+
+## Usage
+
+### Setup Environment
+
+Please refer to [Get Started](https://mmaction2.readthedocs.io/en/latest/get_started.html) to install MMAction2.
+
+First, add the current folder to `PYTHONPATH` so that Python can find your code. Run the following command in the current directory to add it.
+
+> Please run this command every time you open a new shell.
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Data Preparation
+
+Prepare the Kinetics400 dataset according to the [instruction](https://github.com/open-mmlab/mmaction2/blob/1.x/tools/data/kinetics/README.md).
+
+### Training commands
+
+**To train with single GPU:**
+
+```bash
+mim train mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py
+```
+
+**To train with multiple GPUs:**
+
+```bash
+mim train mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --launcher pytorch --gpus 8
+```
+
+**To train with multiple GPUs by slurm:**
+
+```bash
+mim train mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --launcher slurm \
+ --gpus 8 --gpus-per-node 8 --partition $PARTITION
+```
+
+### Testing commands
+
+**To test with single GPU:**
+
+```bash
+mim test mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT
+```
+
+**To test with multiple GPUs:**
+
+```bash
+mim test mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8
+```
+
+**To test with multiple GPUs by slurm:**
+
+```bash
+mim test mmaction configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py --checkpoint $CHECKPOINT --launcher slurm \
+ --gpus 8 --gpus-per-node 8 --partition $PARTITION
+```
+
+## Results
+
+| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc | top5 acc | testing protocol | config | ckpt | log |
+| :---------------------: | :--------: | :--: | :------: | :------: | :------: | :------: | :----------------: | :-----------------------------------------------------------------------------: | ----------------: | --------------: |
+| 1x1x3 | 224x224 | 8 | ResNet50 | ImageNet | 72.83 | 90.65 | 25 clips x 10 crop | [config](./configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py) | [ckpt](CKPT-LINK) | [log](LOG-LINK) |
+
+## Citation
+
+
+
+```bibtex
+@misc{2020mmaction2,
+ title={OpenMMLab's Next Generation Video Understanding Toolbox and Benchmark},
+ author={MMAction2 Contributors},
+ howpublished = {\url{https://github.com/open-mmlab/mmaction2}},
+ year={2020}
+}
+```
+
+## Checklist
+
+Here is a checklist of this project's progress, and you can ignore this part if you don't plan to contribute to MMAction2 projects.
+
+- [ ] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+ - [ ] Finish the code
+
+
+
+ - [ ] Basic docstrings & proper citation
+
+
+
+ - [ ] Converted checkpoint and results (Only for reproduction)
+
+
+
+- [ ] Milestone 2: Indicates a successful model implementation.
+
+ - [ ] Training results
+
+
+
+- [ ] Milestone 3: Good to be a part of our core package!
+
+ - [ ] Unit tests
+
+
+
+ - [ ] Code style
+
+
+
+ - [ ] `metafile.yml` and `README.md`
+
+
diff --git a/projects/example_project/configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py b/projects/example_project/configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py
new file mode 100644
index 0000000000..32ea002651
--- /dev/null
+++ b/projects/example_project/configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py
@@ -0,0 +1,11 @@
+# Directly inherit the entire recipe you want to use.
+_base_ = 'mmaction::recognition/tsn/' \
+ 'tsn_imagenet-pretrained-r50_8xb32-1x1x3-100e_kinetics400-rgb.py'
+
+# This line is to import your own modules.
+custom_imports = dict(imports='models')
+
+# Modify the backbone to use your own backbone.
+_base_['model']['backbone'] = dict(type='ExampleNet', depth=50)
+# Modify the in_channels of classifier head to fit your backbone.
+_base_['model']['cls_head']['in_channels'] = 2048
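
One way to confirm the overrides above were merged as intended is to load the config with `mmengine`, assuming MMAction2 is installed so the `mmaction::` base can be resolved and this project folder is the working directory:

```python
from mmengine.config import Config

# Load the project config; _base_ is pulled from the installed mmaction package.
cfg = Config.fromfile(
    'configs/examplenet_r50-in1k-pre_8xb32-1x1x3-100e_kinetics400-rgb.py')
print(cfg.model.backbone.type)         # ExampleNet
print(cfg.model.cls_head.in_channels)  # 2048
```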
diff --git a/projects/example_project/models/__init__.py b/projects/example_project/models/__init__.py
new file mode 100644
index 0000000000..e2d4f2f571
--- /dev/null
+++ b/projects/example_project/models/__init__.py
@@ -0,0 +1,3 @@
+from .example_net import ExampleNet
+
+__all__ = ['ExampleNet']
diff --git a/projects/example_project/models/example_net.py b/projects/example_project/models/example_net.py
new file mode 100644
index 0000000000..6a3b8bbb06
--- /dev/null
+++ b/projects/example_project/models/example_net.py
@@ -0,0 +1,21 @@
+from mmaction.models import ResNet
+from mmaction.registry import MODELS
+
+
+# Register your model to the `MODELS`.
+@MODELS.register_module()
+class ExampleNet(ResNet):
+ """Implements an example backbone.
+
+    Implement the backbone network just like a normal PyTorch network.
+ """
+
+ def __init__(self, **kwargs) -> None:
+ print('#############################\n'
+ '# Hello MMAction2! #\n'
+ '#############################')
+ super().__init__(**kwargs)
+
+ def forward(self, x):
+ """Defines the computation performed at every call."""
+ return super().forward(x)
diff --git a/projects/msg3d/README.md b/projects/msg3d/README.md
new file mode 100644
index 0000000000..a46c800acc
--- /dev/null
+++ b/projects/msg3d/README.md
@@ -0,0 +1,143 @@
+# MSG3D Project
+
+[Disentangling and Unifying Graph Convolutions for Skeleton-Based Action Recognition](https://arxiv.org/abs/2003.14111)
+
+
+
+## Abstract
+
+
+
+Spatial-temporal graphs have been widely used by skeleton-based action recognition algorithms to model human action dynamics. To capture robust movement patterns from these graphs, long-range and multi-scale context aggregation and spatial-temporal dependency modeling are critical aspects of a powerful feature extractor. However, existing methods have limitations in achieving (1) unbiased long-range joint relationship modeling under multi-scale operators and (2) unobstructed cross-spacetime information flow for capturing complex spatial-temporal dependencies. In this work, we present (1) a simple method to disentangle multi-scale graph convolutions and (2) a unified spatial-temporal graph convolutional operator named G3D. The proposed multi-scale aggregation scheme disentangles the importance of nodes in different neighborhoods for effective long-range modeling. The proposed G3D module leverages dense cross-spacetime edges as skip connections for direct information propagation across the spatial-temporal graph. By coupling these proposals, we develop a powerful feature extractor named MS-G3D based on which our model outperforms previous state-of-the-art methods on three large-scale datasets: NTU RGB+D 60, NTU RGB+D 120, and Kinetics Skeleton 400.
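+
+The following is a rough NumPy sketch of the disentangled k-hop aggregation described above: each scale keeps only the neighbours that are exactly k hops away (plus self-loops), rather than letting repeated powers of the adjacency matrix be dominated by nearby nodes. It is an illustration only; the model code in this project uses the `k_adjacency` utility from `mmaction.models.utils.graph`.
+
+```python
+import numpy as np
+
+
+def disentangled_k_adjacency(A: np.ndarray, k: int) -> np.ndarray:
+    """Adjacency restricted to neighbours exactly k hops away, plus self-loops."""
+    eye = np.eye(len(A), dtype=A.dtype)
+    if k == 0:
+        return eye
+    # nodes reachable within k hops (self-loops make shorter walks count too)
+    reach_k = np.minimum(np.linalg.matrix_power(A + eye, k), 1)
+    reach_k_minus_1 = np.minimum(np.linalg.matrix_power(A + eye, k - 1), 1)
+    return reach_k - reach_k_minus_1 + eye
+```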
+
+
+
+
+
+
+
+## Usage
+
+### Setup Environment
+
+Please refer to [Get Started](https://mmaction2.readthedocs.io/en/latest/get_started.html) to install MMAction2.
+
+First, add the current folder to `PYTHONPATH` so that Python can find your code. Run the following command in the current directory to add it.
+
+> Please run this command every time you open a new shell.
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
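+
+With the directory on `PYTHONPATH`, the `custom_imports = dict(imports='models')` line in the configs lets MMEngine import the local `models` package and register `MSG3D` before the model is built. Below is a minimal sketch to verify the setup (run from this directory, with MMAction2 installed as above):
+
+```python
+from mmengine.config import Config
+
+from mmaction.registry import MODELS
+from mmaction.utils import register_all_modules
+
+register_all_modules()  # register MMAction2's built-in modules
+# Loading the config triggers `custom_imports`, which imports the local
+# `models` package and registers the `MSG3D` backbone.
+cfg = Config.fromfile(
+    'configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py')
+model = MODELS.build(cfg.model)
+print(type(model).__name__)  # RecognizerGCN
+```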
+
+### Data Preparation
+
+Prepare the NTU60 dataset according to the [instructions](https://github.com/open-mmlab/mmaction2/blob/1.x/tools/data/skeleton/README.md).
+
+### Training commands
+
+**To train with a single GPU:**
+
+```bash
+mim train mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py
+```
+
+**To train with multiple GPUs:**
+
+```bash
+mim train mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher pytorch --gpus 8
+```
+
+**To train with multiple GPUs via Slurm:**
+
+```bash
+mim train mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --launcher slurm \
+ --gpus 8 --gpus-per-node 8 --partition $PARTITION
+```
+
+### Testing commands
+
+**To test with a single GPU:**
+
+```bash
+mim test mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT
+```
+
+**To test with multiple GPUs:**
+
+```bash
+mim test mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher pytorch --gpus 8
+```
+
+**To test with multiple GPUs via Slurm:**
+
+```bash
+mim test mmaction configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py --checkpoint $CHECKPOINT --launcher slurm \
+ --gpus 8 --gpus-per-node 8 --partition $PARTITION
+```
+
+## Results
+
+### NTU60_XSub_2D
+
+| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log |
+| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: |
+| uniform 100 | joint | 8 | MSG3D | 92.3 | 10 clips | [config](./configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d_20230309-73b97296.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.log) |
+
+### NTU60_XSub_3D
+
+| frame sampling strategy | modality | gpus | backbone | top1 acc | testing protocol | config | ckpt | log |
+| :---------------------: | :------: | :--: | :------: | :------: | :--------------: | :--------------------------------------------: | :------------------------------------------: | :-----------------------------------------: |
+| uniform 100 | joint | 8 | MSG3D | 89.6 | 10 clips | [config](./configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d_20230308-c325d222.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/projects/msg3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.log) |
+
+## Citation
+
+
+
+```bibtex
+@inproceedings{liu2020disentangling,
+ title={Disentangling and unifying graph convolutions for skeleton-based action recognition},
+ author={Liu, Ziyu and Zhang, Hongwen and Chen, Zhenghao and Wang, Zhiyong and Ouyang, Wanli},
+ booktitle={CVPR},
+ pages={143--152},
+ year={2020}
+}
+```
+
+## Checklist
+
+Here is a checklist of this project's progress. You can ignore this part if you don't plan to contribute to MMAction2 projects.
+
+- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+ - [x] Finish the code
+
+
+
+ - [x] Basic docstrings & proper citation
+
+
+
+ - [x] Converted checkpoint and results (Only for reproduction)
+
+
+
+- [x] Milestone 2: Indicates a successful model implementation.
+
+ - [x] Training results
+
+
+
+- [ ] Milestone 3: Good to be a part of our core package!
+
+ - [ ] Unit tests
+
+
+
+ - [ ] Code style
+
+
+
+ - [ ] `metafile.yml` and `README.md`
+
+
diff --git a/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py
new file mode 100644
index 0000000000..ece30dc019
--- /dev/null
+++ b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-2d.py
@@ -0,0 +1,104 @@
+_base_ = 'mmaction::_base_/default_runtime.py'
+
+custom_imports = dict(imports='models')
+
+model = dict(
+ type='RecognizerGCN',
+ backbone=dict(
+ type='MSG3D', graph_cfg=dict(layout='coco', mode='binary_adj')),
+ cls_head=dict(type='GCNHead', num_classes=60, in_channels=384))
+
+dataset_type = 'PoseDataset'
+ann_file = 'data/skeleton/ntu60_2d.pkl'
+train_pipeline = [
+ dict(type='PreNormalize2D'),
+ dict(type='GenSkeFeat', dataset='coco', feats=['j']),
+ dict(type='UniformSampleFrames', clip_len=100),
+ dict(type='PoseDecode'),
+ dict(type='FormatGCNInput', num_person=2),
+ dict(type='PackActionInputs')
+]
+val_pipeline = [
+ dict(type='PreNormalize2D'),
+ dict(type='GenSkeFeat', dataset='coco', feats=['j']),
+ dict(
+ type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True),
+ dict(type='PoseDecode'),
+ dict(type='FormatGCNInput', num_person=2),
+ dict(type='PackActionInputs')
+]
+test_pipeline = [
+ dict(type='PreNormalize2D'),
+ dict(type='GenSkeFeat', dataset='coco', feats=['j']),
+ dict(
+ type='UniformSampleFrames', clip_len=100, num_clips=10,
+ test_mode=True),
+ dict(type='PoseDecode'),
+ dict(type='FormatGCNInput', num_person=2),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='RepeatDataset',
+ times=5,
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ pipeline=train_pipeline,
+ split='xsub_train')))
+val_dataloader = dict(
+ batch_size=16,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ pipeline=val_pipeline,
+ split='xsub_val',
+ test_mode=True))
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ pipeline=test_pipeline,
+ split='xsub_val',
+ test_mode=True))
+
+val_evaluator = [dict(type='AccMetric')]
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1)
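+# Note: `RepeatDataset(times=5)` above repeats the training set five times per
+# epoch, so 16 epochs correspond to 80 passes over the raw data (presumably the
+# `80e` in the config filename).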
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0,
+ T_max=16,
+ by_epoch=True,
+ convert_to_iter_based=True)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(
+ type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True))
+
+default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
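+# For example, with `enable=True` and an actual batch size of 4 GPUs x 16
+# samples = 64, MMEngine would scale the base lr linearly by 64 / 128 = 0.5
+# (illustrative arithmetic only; auto scaling is disabled here).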
diff --git a/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py
new file mode 100644
index 0000000000..290fda984d
--- /dev/null
+++ b/projects/msg3d/configs/msg3d_8xb16-joint-u100-80e_ntu60-xsub-keypoint-3d.py
@@ -0,0 +1,104 @@
+_base_ = 'mmaction::_base_/default_runtime.py'
+
+custom_imports = dict(imports='models')
+
+model = dict(
+ type='RecognizerGCN',
+ backbone=dict(
+ type='MSG3D', graph_cfg=dict(layout='nturgb+d', mode='binary_adj')),
+ cls_head=dict(type='GCNHead', num_classes=60, in_channels=384))
+
+dataset_type = 'PoseDataset'
+ann_file = 'data/skeleton/ntu60_3d.pkl'
+train_pipeline = [
+ dict(type='PreNormalize3D'),
+ dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']),
+ dict(type='UniformSampleFrames', clip_len=100),
+ dict(type='PoseDecode'),
+ dict(type='FormatGCNInput', num_person=2),
+ dict(type='PackActionInputs')
+]
+val_pipeline = [
+ dict(type='PreNormalize3D'),
+ dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']),
+ dict(
+ type='UniformSampleFrames', clip_len=100, num_clips=1, test_mode=True),
+ dict(type='PoseDecode'),
+ dict(type='FormatGCNInput', num_person=2),
+ dict(type='PackActionInputs')
+]
+test_pipeline = [
+ dict(type='PreNormalize3D'),
+ dict(type='GenSkeFeat', dataset='nturgb+d', feats=['j']),
+ dict(
+ type='UniformSampleFrames', clip_len=100, num_clips=10,
+ test_mode=True),
+ dict(type='PoseDecode'),
+ dict(type='FormatGCNInput', num_person=2),
+ dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+ batch_size=16,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=True),
+ dataset=dict(
+ type='RepeatDataset',
+ times=5,
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ pipeline=train_pipeline,
+ split='xsub_train')))
+val_dataloader = dict(
+ batch_size=16,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ pipeline=val_pipeline,
+ split='xsub_val',
+ test_mode=True))
+test_dataloader = dict(
+ batch_size=1,
+ num_workers=2,
+ persistent_workers=True,
+ sampler=dict(type='DefaultSampler', shuffle=False),
+ dataset=dict(
+ type=dataset_type,
+ ann_file=ann_file,
+ pipeline=test_pipeline,
+ split='xsub_val',
+ test_mode=True))
+
+val_evaluator = [dict(type='AccMetric')]
+test_evaluator = val_evaluator
+
+train_cfg = dict(
+ type='EpochBasedTrainLoop', max_epochs=16, val_begin=1, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+param_scheduler = [
+ dict(
+ type='CosineAnnealingLR',
+ eta_min=0,
+ T_max=16,
+ by_epoch=True,
+ convert_to_iter_based=True)
+]
+
+optim_wrapper = dict(
+ optimizer=dict(
+ type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0005, nesterov=True))
+
+default_hooks = dict(checkpoint=dict(interval=1), logger=dict(interval=100))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+# or not by default.
+# - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
diff --git a/projects/msg3d/models/__init__.py b/projects/msg3d/models/__init__.py
new file mode 100644
index 0000000000..82b4a3085c
--- /dev/null
+++ b/projects/msg3d/models/__init__.py
@@ -0,0 +1,3 @@
+from .msg3d import MSG3D
+
+__all__ = ['MSG3D']
diff --git a/projects/msg3d/models/msg3d.py b/projects/msg3d/models/msg3d.py
new file mode 100644
index 0000000000..e4124a3435
--- /dev/null
+++ b/projects/msg3d/models/msg3d.py
@@ -0,0 +1,75 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.model import BaseModule, Sequential
+
+from mmaction.models.utils import Graph
+from mmaction.registry import MODELS
+from .msg3d_utils import MSGCN, MSTCN, MW_MSG3DBlock
+
+
+@MODELS.register_module()
+class MSG3D(BaseModule):
+
+ def __init__(self,
+ graph_cfg,
+ in_channels=3,
+ base_channels=96,
+ num_gcn_scales=13,
+ num_g3d_scales=6,
+ num_person=2,
+ tcn_dropout=0):
+ super().__init__()
+
+ self.graph = Graph(**graph_cfg)
+ # Note that A is a 2D tensor
+ A = torch.tensor(
+ self.graph.A[0], dtype=torch.float32, requires_grad=False)
+ self.register_buffer('A', A)
+ self.num_point = A.shape[-1]
+ self.in_channels = in_channels
+ self.base_channels = base_channels
+
+ self.data_bn = nn.BatchNorm1d(self.num_point * in_channels *
+ num_person)
+ c1, c2, c3 = base_channels, base_channels * 2, base_channels * 4
+
+ # r=3 STGC blocks
+ self.gcn3d1 = MW_MSG3DBlock(3, c1, A, num_g3d_scales, window_stride=1)
+ self.sgcn1 = Sequential(
+ MSGCN(num_gcn_scales, 3, c1, A), MSTCN(c1, c1), MSTCN(c1, c1))
+ self.sgcn1[-1].act = nn.Identity()
+ self.tcn1 = MSTCN(c1, c1, tcn_dropout=tcn_dropout)
+
+ self.gcn3d2 = MW_MSG3DBlock(c1, c2, A, num_g3d_scales, window_stride=2)
+ self.sgcn2 = Sequential(
+ MSGCN(num_gcn_scales, c1, c1, A), MSTCN(c1, c2, stride=2),
+ MSTCN(c2, c2))
+ self.sgcn2[-1].act = nn.Identity()
+ self.tcn2 = MSTCN(c2, c2, tcn_dropout=tcn_dropout)
+
+ self.gcn3d3 = MW_MSG3DBlock(c2, c3, A, num_g3d_scales, window_stride=2)
+ self.sgcn3 = Sequential(
+ MSGCN(num_gcn_scales, c2, c2, A), MSTCN(c2, c3, stride=2),
+ MSTCN(c3, c3))
+ self.sgcn3[-1].act = nn.Identity()
+ self.tcn3 = MSTCN(c3, c3, tcn_dropout=tcn_dropout)
+
+ def forward(self, x):
+ N, M, T, V, C = x.size()
+ x = x.permute(0, 1, 3, 4, 2).contiguous().reshape(N, M * V * C, T)
+ x = self.data_bn(x)
+ x = x.reshape(N * M, V, C, T).permute(0, 2, 3, 1).contiguous()
+
+ # Apply activation to the sum of the pathways
+ x = F.relu(self.sgcn1(x) + self.gcn3d1(x), inplace=True)
+ x = self.tcn1(x)
+
+ x = F.relu(self.sgcn2(x) + self.gcn3d2(x), inplace=True)
+ x = self.tcn2(x)
+
+ x = F.relu(self.sgcn3(x) + self.gcn3d3(x), inplace=True)
+ x = self.tcn3(x)
+
+        # reshape from (N * M, C, T, V) back to (N, M, C, T, V)
+ return x.reshape((N, M) + x.shape[1:])
diff --git a/projects/msg3d/models/msg3d_utils.py b/projects/msg3d/models/msg3d_utils.py
new file mode 100644
index 0000000000..25b4f953b6
--- /dev/null
+++ b/projects/msg3d/models/msg3d_utils.py
@@ -0,0 +1,342 @@
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.cnn import build_activation_layer
+from mmengine.model import BaseModule, ModuleList, Sequential
+
+from mmaction.models.utils import unit_tcn
+from mmaction.models.utils.graph import k_adjacency, normalize_digraph
+
+
+class MLP(BaseModule):
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ act_cfg=dict(type='ReLU'),
+ dropout=0):
+ super().__init__()
+ channels = [in_channels] + out_channels
+ self.layers = ModuleList()
+ for i in range(1, len(channels)):
+ if dropout > 1e-3:
+ self.layers.append(nn.Dropout(p=dropout))
+ self.layers.append(
+ nn.Conv2d(channels[i - 1], channels[i], kernel_size=1))
+ self.layers.append(nn.BatchNorm2d(channels[i]))
+ if act_cfg:
+ self.layers.append(build_activation_layer(act_cfg))
+
+ def forward(self, x):
+ for layer in self.layers:
+ x = layer(x)
+ return x
+
+
+class MSGCN(BaseModule):
+
+ def __init__(self,
+ num_scales,
+ in_channels,
+ out_channels,
+ A,
+ dropout=0,
+ act_cfg=dict(type='ReLU')):
+ super().__init__()
+ self.num_scales = num_scales
+
+ A_powers = [
+ k_adjacency(A, k, with_self=True) for k in range(num_scales)
+ ]
+ A_powers = np.stack([normalize_digraph(g) for g in A_powers])
+
+ # K, V, V
+ self.register_buffer('A', torch.Tensor(A_powers))
+ self.PA = nn.Parameter(self.A.clone())
+ nn.init.uniform_(self.PA, -1e-6, 1e-6)
+
+ self.mlp = MLP(
+ in_channels * num_scales, [out_channels],
+ dropout=dropout,
+ act_cfg=act_cfg)
+
+ def forward(self, x):
+ N, C, T, V = x.shape
+ A = self.A
+ A = A + self.PA
+
+        # aggregate over the K scale-specific graphs:
+        # (K, V, V) x (N, C, T, V) -> (N, K, C, T, V)
+        support = torch.einsum('kvu,nctv->nkctu', A, x)
+ support = support.reshape(N, self.num_scales * C, T, V)
+ out = self.mlp(support)
+ return out
+
+
+# Note: this MSTCN follows the original MS-G3D code and differs from
+# the MSTCN implementation in the MMAction2 core package.
+class MSTCN(BaseModule):
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ dilations=[1, 2, 3, 4],
+ residual=True,
+ act_cfg=dict(type='ReLU'),
+ init_cfg=[
+ dict(type='Constant', layer='BatchNorm2d', val=1),
+ dict(type='Kaiming', layer='Conv2d', mode='fan_out')
+ ],
+ tcn_dropout=0):
+
+ super().__init__(init_cfg=init_cfg)
+ # Multiple branches of temporal convolution
+ self.num_branches = len(dilations) + 2
+ branch_channels = out_channels // self.num_branches
+ branch_channels_rem = out_channels - branch_channels * (
+ self.num_branches - 1)
+
+        if isinstance(kernel_size, list):
+ assert len(kernel_size) == len(dilations)
+ else:
+ kernel_size = [kernel_size] * len(dilations)
+
+ self.branches = ModuleList([
+ Sequential(
+ nn.Conv2d(
+ in_channels, branch_channels, kernel_size=1, padding=0),
+ nn.BatchNorm2d(branch_channels),
+ build_activation_layer(act_cfg),
+ unit_tcn(
+ branch_channels,
+ branch_channels,
+ kernel_size=ks,
+ stride=stride,
+ dilation=dilation),
+ ) for ks, dilation in zip(kernel_size, dilations)
+ ])
+
+ # Additional Max & 1x1 branch
+ self.branches.append(
+ Sequential(
+ nn.Conv2d(
+ in_channels, branch_channels, kernel_size=1, padding=0),
+ nn.BatchNorm2d(branch_channels),
+ build_activation_layer(act_cfg),
+ nn.MaxPool2d(
+ kernel_size=(3, 1), stride=(stride, 1), padding=(1, 0)),
+ nn.BatchNorm2d(branch_channels)))
+
+ self.branches.append(
+ Sequential(
+ nn.Conv2d(
+ in_channels,
+ branch_channels_rem,
+ kernel_size=1,
+ padding=0,
+ stride=(stride, 1)), nn.BatchNorm2d(branch_channels_rem)))
+
+ # Residual connection
+ if not residual:
+ self.residual = lambda x: 0
+ elif (in_channels == out_channels) and (stride == 1):
+ self.residual = lambda x: x
+ else:
+ self.residual = unit_tcn(
+ in_channels, out_channels, kernel_size=1, stride=stride)
+
+ self.act = build_activation_layer(act_cfg)
+ self.drop = nn.Dropout(tcn_dropout)
+
+ def forward(self, x):
+ # Input dim: (N,C,T,V)
+ res = self.residual(x)
+ branch_outs = []
+ for tempconv in self.branches:
+ out = tempconv(x)
+ branch_outs.append(out)
+
+ out = torch.cat(branch_outs, dim=1)
+ out += res
+ out = self.act(out)
+ out = self.drop(out)
+ return out
+
+
+class UnfoldTemporalWindows(BaseModule):
+
+ def __init__(self, window_size, window_stride, window_dilation=1):
+ super().__init__()
+ self.window_size = window_size
+ self.window_stride = window_stride
+ self.window_dilation = window_dilation
+
+ self.padding = (window_size + (window_size - 1) *
+ (window_dilation - 1) - 1) // 2
+ self.unfold = nn.Unfold(
+ kernel_size=(self.window_size, 1),
+ dilation=(self.window_dilation, 1),
+ stride=(self.window_stride, 1),
+ padding=(self.padding, 0))
+
+ def forward(self, x):
+        # Input: (N, C, T, V); output: (N, C, T', window_size * V),
+        # where T' is the number of temporal windows
+ N, C, T, V = x.shape
+ x = self.unfold(x)
+ # Permute extra channels from window size to the graph dimension;
+ # -1 for number of windows
+ x = x.reshape(N, C, self.window_size, -1, V).permute(0, 1, 3, 2,
+ 4).contiguous()
+ x = x.reshape(N, C, -1, self.window_size * V)
+ return x
+
+
+class ST_MSGCN(BaseModule):
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ A,
+ num_scales,
+ window_size,
+ residual=False,
+ dropout=0,
+ act_cfg=dict(type='ReLU')):
+
+ super().__init__()
+ self.num_scales = num_scales
+ self.window_size = window_size
+ A = self.build_st_graph(A, window_size)
+
+ A_scales = [
+ k_adjacency(A, k, with_self=True) for k in range(num_scales)
+ ]
+ A_scales = np.stack([normalize_digraph(g) for g in A_scales])
+
+ self.register_buffer('A', torch.Tensor(A_scales))
+ self.V = len(A)
+
+ self.PA = nn.Parameter(self.A.clone())
+ nn.init.uniform_(self.PA, -1e-6, 1e-6)
+
+ self.mlp = MLP(
+ in_channels * num_scales, [out_channels],
+ dropout=dropout,
+ act_cfg=act_cfg)
+
+ # Residual connection
+ if not residual:
+ self.residual = lambda x: 0
+ elif (in_channels == out_channels):
+ self.residual = lambda x: x
+ else:
+ self.residual = MLP(in_channels, [out_channels], act_cfg=None)
+
+ self.act = build_activation_layer(act_cfg)
+
+ def build_st_graph(self, A, window_size):
+ if not isinstance(A, np.ndarray):
+ A = A.data.cpu().numpy()
+
+ assert len(A.shape) == 2 and A.shape[0] == A.shape[1]
+ V = len(A)
+ A_with_I = A + np.eye(V, dtype=A.dtype)
+
+ A_large = np.tile(A_with_I, (window_size, window_size)).copy()
+ return A_large
+
+ def forward(self, x):
+ N, C, T, V = x.shape # T = number of windows, V = self.V * window_size
+ A = self.A + self.PA
+
+ # Perform Graph Convolution
+ res = self.residual(x)
+ agg = torch.einsum('kvu,nctv->nkctu', A, x)
+ agg = agg.reshape(N, self.num_scales * C, T, V)
+ out = self.mlp(agg)
+        # `res` is the integer 0 when the residual branch is disabled,
+        # so the addition below is a no-op in that case.
+        return self.act(out + res)
+
+
+class MSG3DBlock(BaseModule):
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ A,
+ num_scales,
+ window_size,
+ window_stride,
+ window_dilation,
+ embed_factor=1,
+ activation='relu'):
+
+ super().__init__()
+ self.window_size = window_size
+ self.out_channels = out_channels
+ self.embed_channels_in = out_channels // embed_factor
+ self.embed_channels_out = out_channels // embed_factor
+ if embed_factor == 1:
+ self.in1x1 = nn.Identity()
+ self.embed_channels_in = self.embed_channels_out = in_channels
+ # The first STGC block changes channels right away;
+ # others change at collapse
+ if in_channels == 3:
+ self.embed_channels_out = out_channels
+ else:
+ self.in1x1 = MLP(in_channels, [self.embed_channels_in])
+
+ self.gcn3d = Sequential(
+ UnfoldTemporalWindows(window_size, window_stride, window_dilation),
+ ST_MSGCN(
+ in_channels=self.embed_channels_in,
+ out_channels=self.embed_channels_out,
+ A=A,
+ num_scales=num_scales,
+ window_size=window_size))
+
+ self.out_conv = nn.Conv3d(
+ self.embed_channels_out,
+ out_channels,
+ kernel_size=(1, self.window_size, 1))
+ self.out_bn = nn.BatchNorm2d(out_channels)
+
+ def forward(self, x):
+ N, _, T, V = x.shape
+ x = self.in1x1(x)
+ # Construct temporal windows and apply MS-GCN
+ x = self.gcn3d(x)
+
+ # Collapse the window dimension
+ x = x.reshape(N, self.embed_channels_out, -1, self.window_size, V)
+ x = self.out_conv(x).squeeze(dim=3)
+ x = self.out_bn(x)
+ # no activation
+ return x
+
+
+class MW_MSG3DBlock(BaseModule):
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ A,
+ num_scales,
+ window_sizes=[3, 5],
+ window_stride=1,
+ window_dilations=[1, 1]):
+
+ super().__init__()
+ self.gcn3d = ModuleList([
+ MSG3DBlock(in_channels, out_channels, A, num_scales, window_size,
+ window_stride, window_dilation) for window_size,
+ window_dilation in zip(window_sizes, window_dilations)
+ ])
+
+ def forward(self, x):
+ out_sum = 0
+ for gcn3d in self.gcn3d:
+ out_sum += gcn3d(x)
+ return out_sum
diff --git a/src/pytorch-sphinx-theme b/src/pytorch-sphinx-theme
new file mode 160000
index 0000000000..6f42dcf38c
--- /dev/null
+++ b/src/pytorch-sphinx-theme
@@ -0,0 +1 @@
+Subproject commit 6f42dcf38c529653bdf3347f551cb037a1a0f1cf
diff --git a/tests/datasets/transforms/test_formating.py b/tests/datasets/transforms/test_formating.py
index 842d2dbf27..8e741c24e5 100644
--- a/tests/datasets/transforms/test_formating.py
+++ b/tests/datasets/transforms/test_formating.py
@@ -101,8 +101,8 @@ def test_repr(self):
type='PackActionInputs', meta_keys=['flip_direction', 'img_shape'])
transform = TRANSFORMS.build(cfg)
self.assertEqual(
- repr(transform),
- "PackActionInputs(meta_keys=['flip_direction', 'img_shape'])")
+ repr(transform), 'PackActionInputs(collect_keys=None, '
+ "meta_keys=['flip_direction', 'img_shape'])")
class TestPackLocalizationInputs(unittest.TestCase):
@@ -184,8 +184,24 @@ def test_format_shape():
target_keys = ['imgs', 'input_shape']
assert assert_dict_has_keys(results, target_keys)
- assert repr(format_shape) == format_shape.__class__.__name__ + \
- "(input_format='NCTHW')"
+ # `NCTHW` input format with imgs and heatmap_imgs
+ results = dict(
+ imgs=np.random.randn(6, 224, 224, 3),
+ heatmap_imgs=np.random.randn(12, 17, 56, 56),
+ num_clips=2,
+ clip_len=dict(RGB=3, Pose=6))
+
+ results = format_shape(results)
+ assert results['input_shape'] == (2, 3, 3, 224, 224)
+ assert results['heatmap_input_shape'] == (2, 17, 6, 56, 56)
+
+ assert repr(format_shape) == "FormatShape(input_format='NCTHW')"
+
+ # `NCTHW_Heatmap` input format
+ results = dict(
+ imgs=np.random.randn(12, 17, 56, 56), num_clips=2, clip_len=6)
+ format_shape = FormatShape('NCTHW_Heatmap')
+ assert format_shape(results)['input_shape'] == (2, 17, 6, 56, 56)
# `NCHW_Flow` input format
results = dict(imgs=np.random.randn(6, 224, 224), num_clips=1, clip_len=3)
diff --git a/tests/datasets/transforms/test_loading.py b/tests/datasets/transforms/test_loading.py
index 5413475a92..035a2213cc 100644
--- a/tests/datasets/transforms/test_loading.py
+++ b/tests/datasets/transforms/test_loading.py
@@ -260,21 +260,23 @@ def test_pims_decode(self):
video_result['frame_inds']), 256, 340, 3)
def test_decord_init(self):
- target_keys = ['video_reader', 'total_frames']
+ target_keys = ['video_reader', 'total_frames', 'avg_fps']
video_result = copy.deepcopy(self.video_results)
decord_init = DecordInit()
decord_init_result = decord_init(video_result)
assert assert_dict_has_keys(decord_init_result, target_keys)
assert decord_init_result['total_frames'] == len(
decord_init_result['video_reader'])
+ assert decord_init_result['avg_fps'] == 30
+
assert repr(decord_init) == (f'{decord_init.__class__.__name__}('
f'io_backend=disk, '
- f'num_threads={1})')
+ f'num_threads=1)')
def test_decord_decode(self):
target_keys = ['frame_inds', 'imgs', 'original_shape']
- # test Decord with 2 dim input and start_index = 0
+ # test Decord with 2 dim input using accurate mode
video_result = copy.deepcopy(self.video_results)
video_result['frame_inds'] = np.arange(0, self.total_frames,
3)[:, np.newaxis]
@@ -289,7 +291,7 @@ def test_decord_decode(self):
assert np.shape(decord_decode_result['imgs']) == (len(
video_result['frame_inds']), 256, 340, 3)
- # test Decord with 1 dim input and start_index = 0
+ # test Decord with 1 dim input using accurate mode
video_result = copy.deepcopy(self.video_results)
video_result['frame_inds'] = np.arange(0, self.total_frames, 3)
decord_init = DecordInit()
@@ -303,7 +305,7 @@ def test_decord_decode(self):
assert np.shape(decord_decode_result['imgs']) == (len(
video_result['frame_inds']), 256, 340, 3)
- # test Decord with 2 dim input and start_index = 0
+ # test Decord with 2 dim input using efficient mode
video_result = copy.deepcopy(self.video_results)
video_result['frame_inds'] = np.arange(0, self.total_frames,
3)[:, np.newaxis]
@@ -311,14 +313,14 @@ def test_decord_decode(self):
decord_init_result = decord_init(video_result)
video_result['video_reader'] = decord_init_result['video_reader']
- decord_decode = DecordDecode()
+ decord_decode = DecordDecode(mode='efficient')
decord_decode_result = decord_decode(video_result)
assert assert_dict_has_keys(decord_decode_result, target_keys)
assert decord_decode_result['original_shape'] == (256, 340)
assert np.shape(decord_decode_result['imgs']) == (len(
video_result['frame_inds']), 256, 340, 3)
- # test Decord with 1 dim input
+ # test Decord with 1 dim input using efficient mode
video_result = copy.deepcopy(self.video_results)
video_result['frame_inds'] = np.arange(1, self.total_frames, 3)
decord_init = DecordInit()
diff --git a/tests/datasets/transforms/test_pose_transforms.py b/tests/datasets/transforms/test_pose_transforms.py
index d65d450124..913447f938 100644
--- a/tests/datasets/transforms/test_pose_transforms.py
+++ b/tests/datasets/transforms/test_pose_transforms.py
@@ -13,10 +13,11 @@
from mmaction.datasets.transforms import (GeneratePoseTarget, GenSkeFeat,
JointToBone, LoadKineticsPose,
- MergeSkeFeat, PadTo, PoseCompact,
- PoseDecode, PreNormalize2D,
- PreNormalize3D, ToMotion,
- UniformSampleFrames)
+ MergeSkeFeat, MMCompact, MMDecode,
+ MMUniformSampleFrames, PadTo,
+ PoseCompact, PoseDecode,
+ PreNormalize2D, PreNormalize3D,
+ ToMotion, UniformSampleFrames)
class TestPoseTransforms:
@@ -126,23 +127,29 @@ def test_generate_pose_target():
modality='Pose')
generate_pose_target = GeneratePoseTarget(
- sigma=1, with_kp=True, left_kp=(0, ), right_kp=(1, ), skeletons=())
+ sigma=1,
+ with_kp=True,
+ left_kp=(1, ),
+ right_kp=(2, ),
+ left_limb=(0, ),
+ right_limb=(1, ),
+ skeletons=())
assert str(generate_pose_target) == ('GeneratePoseTarget(sigma=1, '
'use_score=True, with_kp=True, '
'with_limb=False, skeletons=(), '
- 'double=False, left_kp=(0,), '
- 'right_kp=(1,))')
- return_results = generate_pose_target(results)
- assert return_results['imgs'].shape == (8, 64, 64, 3)
+ 'double=False, left_kp=(1,), '
+ 'right_kp=(2,), left_limb=(0,), '
+ 'right_limb=(1,), scaling=1.0)')
+ return_results = generate_pose_target(copy.deepcopy(results))
+ assert return_results['imgs'].shape == (8, 3, 64, 64)
assert_array_almost_equal(return_results['imgs'][0],
return_results['imgs'][1])
results = dict(img_shape=img_shape, keypoint=kp, modality='Pose')
- generate_pose_target = GeneratePoseTarget(
- sigma=1, with_kp=True, left_kp=(0, ), right_kp=(1, ), skeletons=())
- return_results = generate_pose_target(results)
- assert return_results['imgs'].shape == (8, 64, 64, 3)
+ generate_pose_target = GeneratePoseTarget(sigma=1, with_kp=True)
+ return_results = generate_pose_target(copy.deepcopy(results))
+ assert return_results['imgs'].shape == (8, 3, 64, 64)
assert_array_almost_equal(return_results['imgs'][0],
return_results['imgs'][1])
@@ -150,37 +157,23 @@ def test_generate_pose_target():
sigma=1,
with_kp=False,
with_limb=True,
- left_kp=(0, ),
- right_kp=(1, ),
skeletons=((0, 1), (1, 2), (0, 2)))
- return_results = generate_pose_target(results)
- assert return_results['imgs'].shape == (8, 64, 64, 3)
+ return_results = generate_pose_target(copy.deepcopy(results))
+ assert return_results['imgs'].shape == (8, 3, 64, 64)
assert_array_almost_equal(return_results['imgs'][0],
return_results['imgs'][1])
generate_pose_target = GeneratePoseTarget(
sigma=1,
- with_kp=True,
- with_limb=True,
- left_kp=(0, ),
- right_kp=(1, ),
- skeletons=((0, 1), (1, 2), (0, 2)))
- return_results = generate_pose_target(results)
- assert return_results['imgs'].shape == (8, 64, 64, 6)
- assert_array_almost_equal(return_results['imgs'][0],
- return_results['imgs'][1])
-
- generate_pose_target = GeneratePoseTarget(
- sigma=1,
- with_kp=True,
+ with_kp=False,
with_limb=True,
double=True,
- left_kp=(0, ),
- right_kp=(1, ),
+ left_limb=(0, ),
+ right_limb=(1, ),
skeletons=((0, 1), (1, 2), (0, 2)))
- return_results = generate_pose_target(results)
+ return_results = generate_pose_target(copy.deepcopy(results))
imgs = return_results['imgs']
- assert imgs.shape == (16, 64, 64, 6)
+ assert imgs.shape == (16, 3, 64, 64)
assert_array_almost_equal(imgs[0], imgs[1])
assert_array_almost_equal(imgs[:8, 2], imgs[8:, 2, :, ::-1])
assert_array_almost_equal(imgs[:8, 0], imgs[8:, 1, :, ::-1])
@@ -197,8 +190,8 @@ def test_generate_pose_target():
keypoint_score=kpscore,
modality='Pose')
generate_pose_target = GeneratePoseTarget(
- sigma=1, with_kp=True, left_kp=(0, ), right_kp=(1, ), skeletons=())
- return_results = generate_pose_target(results)
+ sigma=1, with_kp=True, skeletons=())
+ return_results = generate_pose_target(copy.deepcopy(results))
assert_array_almost_equal(return_results['imgs'], 0)
img_shape = (64, 64)
@@ -215,10 +208,8 @@ def test_generate_pose_target():
sigma=1,
with_kp=False,
with_limb=True,
- left_kp=(0, ),
- right_kp=(1, ),
skeletons=((0, 1), (1, 2), (0, 2)))
- return_results = generate_pose_target(results)
+ return_results = generate_pose_target(copy.deepcopy(results))
assert_array_almost_equal(return_results['imgs'], 0)
img_shape = (64, 64)
@@ -231,13 +222,12 @@ def test_generate_pose_target():
keypoint=kp,
keypoint_score=kpscore,
modality='Pose')
- generate_pose_target = GeneratePoseTarget(
- sigma=1, with_kp=True, left_kp=(0, ), right_kp=(1, ), skeletons=())
- return_results = generate_pose_target(results)
+ generate_pose_target = GeneratePoseTarget(sigma=1, with_kp=True)
+ return_results = generate_pose_target(copy.deepcopy(results))
assert_array_almost_equal(return_results['imgs'], 0)
img_shape = (64, 64)
- kp = np.array([[[[124, 124], [140, 140], [124, 140]]]])
+ kp = np.array([[[[124., 124.], [140., 140.], [124., 140.]]]])
kpscore = np.array([[[0., 0., 0.]]])
kp = np.concatenate([kp] * 8, axis=1)
kpscore = np.concatenate([kpscore] * 8, axis=1)
@@ -250,8 +240,6 @@ def test_generate_pose_target():
sigma=1,
with_kp=False,
with_limb=True,
- left_kp=(0, ),
- right_kp=(1, ),
skeletons=((0, 1), (1, 2), (0, 2)))
return_results = generate_pose_target(results)
assert_array_almost_equal(return_results['imgs'], 0)
@@ -587,3 +575,143 @@ def test_pose_decode():
decode_results = pose_decode(results)
assert_array_almost_equal(decode_results['keypoint'], kp)
assert_array_almost_equal(decode_results['keypoint_score'], kpscore)
+
+ @staticmethod
+ def test_mm_uniform_sample_frames():
+ results = dict(total_frames=64, modality='Pose')
+ sampling = MMUniformSampleFrames(
+ clip_len=dict(RGB=8, Pose=32), num_clips=1, test_mode=True, seed=0)
+ assert repr(sampling) == ('MMUniformSampleFrames('
+ "clip_len={'RGB': 8, 'Pose': 32}, "
+ 'num_clips=1, test_mode=True, seed=0)')
+
+ sampling_results = sampling(results)
+ assert sampling_results['clip_len'] == dict(RGB=8, Pose=32)
+ assert sampling_results['frame_interval'] is None
+ assert sampling_results['num_clips'] == 1
+ assert sampling_results['modality'] == ['RGB', 'Pose']
+ assert_array_equal(sampling_results['RGB_inds'],
+ np.array([4, 15, 21, 24, 35, 43, 51, 63]))
+ assert_array_equal(
+ sampling_results['Pose_inds'],
+ np.array([
+ 0, 3, 5, 6, 9, 11, 13, 15, 17, 19, 21, 22, 24, 27, 28, 30, 32,
+ 34, 36, 39, 40, 43, 45, 46, 48, 51, 53, 55, 57, 58, 61, 62
+ ]))
+
+ results = dict(total_frames=64, modality='Pose')
+ sampling = MMUniformSampleFrames(
+ clip_len=dict(RGB=8, Pose=32),
+ num_clips=10,
+ test_mode=True,
+ seed=0)
+ sampling_results = sampling(results)
+ assert sampling_results['clip_len'] == dict(RGB=8, Pose=32)
+ assert sampling_results['frame_interval'] is None
+ assert sampling_results['num_clips'] == 10
+ assert sampling_results['modality'] == ['RGB', 'Pose']
+ assert len(sampling_results['RGB_inds']) == 80
+ assert len(sampling_results['Pose_inds']) == 320
+
+ results = dict(total_frames=64, modality='Pose')
+ sampling = MMUniformSampleFrames(
+ clip_len=dict(RGB=8, Pose=32), num_clips=1, test_mode=False)
+ sampling_results = sampling(results)
+ assert sampling_results['clip_len'] == dict(RGB=8, Pose=32)
+ assert sampling_results['frame_interval'] is None
+ assert sampling_results['num_clips'] == 1
+ assert len(sampling_results['RGB_inds']) == 8
+ assert len(sampling_results['Pose_inds']) == 32
+
+ @staticmethod
+ def test_mm_decode():
+ mm_decode = MMDecode()
+
+ # Pose only test
+ pose_raw_results = dict(
+ modality=['Pose'],
+ Pose_inds=np.array([2, 4, 6, 8, 10]),
+ keypoint=np.random.random([1, 16, 17, 2]),
+ img_shape=(1080, 1920))
+ rgb_raw_results = dict(
+ modality=['RGB'],
+ RGB_inds=np.array([2, 4, 6, 8, 10]),
+ frame_dir=osp.join(osp.dirname(__file__), '../../data/test'))
+
+ # test pose w/o `keypoint_score`
+ mm_decode(copy.deepcopy(pose_raw_results))
+
+ # test pose with `keypoint_score`
+ pose_raw_results['keypoint_score'] = np.random.random([1, 16, 17])
+ pose_results = mm_decode(copy.deepcopy(pose_raw_results))
+
+ # test rgb
+ rgb_results = mm_decode(copy.deepcopy(rgb_raw_results))
+
+ # test pose and rgb
+ pose_rgb_raw_results = {
+ **rgb_raw_results,
+ **pose_raw_results, 'modality': ['RGB', 'Pose']
+ }
+ pose_rgb_results = mm_decode(copy.deepcopy(pose_rgb_raw_results))
+
+ assert_array_equal(pose_rgb_results['keypoint_score'],
+ pose_results['keypoint_score'])
+ scaled_keypoint = copy.deepcopy(pose_results['keypoint'])
+ oh, ow = pose_results['img_shape']
+ nh, nw = pose_rgb_results['img_shape']
+ scaled_keypoint[..., 0] *= (nw / ow)
+ scaled_keypoint[..., 1] *= (nh / oh)
+ assert_array_equal(pose_rgb_results['keypoint'], scaled_keypoint)
+ assert_array_equal(pose_rgb_results['imgs'], rgb_results['imgs'])
+ assert assert_dict_has_keys(
+ pose_rgb_results, ['filename', 'img_shape', 'original_shape'])
+ assert repr(mm_decode) == 'MMDecode(io_backend=disk)'
+
+ @staticmethod
+ def test_mm_compact():
+ results = {}
+ results['img_shape'] = (100, 100)
+ fake_kp = np.zeros([1, 4, 2, 2])
+ fake_kp[:, :, 0] = [10, 10]
+ fake_kp[:, :, 1] = [90, 90]
+ results['keypoint'] = fake_kp
+ results['imgs'] = list(np.zeros([3, 100, 100, 3]))
+
+ pose_compact = MMCompact(
+ padding=0, threshold=0, hw_ratio=1, allow_imgpad=False)
+ inp = copy.deepcopy(results)
+ ret = pose_compact(inp)
+ assert ret['img_shape'] == (80, 80)
+ assert ret['imgs'][0].shape[:-1] == (80, 80)
+ assert str(pose_compact) == (
+ 'MMCompact(padding=0, threshold=0, hw_ratio=(1, 1), '
+ 'allow_imgpad=False)')
+
+ pose_compact = MMCompact(
+ padding=0.3, threshold=0, hw_ratio=1, allow_imgpad=False)
+ inp = copy.deepcopy(results)
+ ret = pose_compact(inp)
+ assert ret['img_shape'] == (100, 100)
+ assert ret['imgs'][0].shape[:-1] == (100, 100)
+
+ pose_compact = MMCompact(
+ padding=0.3, threshold=0, hw_ratio=1, allow_imgpad=True)
+ inp = copy.deepcopy(results)
+ ret = pose_compact(inp)
+ assert ret['img_shape'] == (104, 104)
+ assert ret['imgs'][0].shape[:-1] == (104, 104)
+
+ pose_compact = MMCompact(
+ padding=0, threshold=100, hw_ratio=1, allow_imgpad=False)
+ inp = copy.deepcopy(results)
+ ret = pose_compact(inp)
+ assert ret['img_shape'] == (100, 100)
+ assert ret['imgs'][0].shape[:-1] == (100, 100)
+
+ pose_compact = MMCompact(
+ padding=0, threshold=0, hw_ratio=0.75, allow_imgpad=True)
+ inp = copy.deepcopy(results)
+ ret = pose_compact(inp)
+ assert ret['img_shape'] == (80, 106)
+ assert ret['imgs'][0].shape[:-1] == (80, 106)
diff --git a/tests/evaluation/metrics/test_acc_metric.py b/tests/evaluation/metrics/test_acc_metric.py
index 273155858c..7c70adb7d6 100644
--- a/tests/evaluation/metrics/test_acc_metric.py
+++ b/tests/evaluation/metrics/test_acc_metric.py
@@ -1,7 +1,12 @@
# Copyright (c) OpenMMLab. All rights reserved.
+from unittest import TestCase
+
+import numpy as np
import torch
-from mmaction.evaluation import AccMetric
+from mmaction.evaluation import AccMetric, ConfusionMatrix
+from mmaction.registry import METRICS
+from mmaction.structures import ActionDataSample
def generate_data(num_classes=5, random_label=False):
@@ -41,3 +46,113 @@ def test_accmetric():
assert eval_results['mean1'] == 1.0
assert eval_results['mmit_mean_average_precision'] == 1.0
return
+
+
+class TestConfusionMatrix(TestCase):
+
+ def test_evaluate(self):
+ """Test using the metric in the same way as Evalutor."""
+ pred = [
+ ActionDataSample().set_pred_score(i).set_pred_label(
+ j).set_gt_labels(k).to_dict() for i, j, k in zip([
+ torch.tensor([0.7, 0.0, 0.3]),
+ torch.tensor([0.5, 0.2, 0.3]),
+ torch.tensor([0.4, 0.5, 0.1]),
+ torch.tensor([0.0, 0.0, 1.0]),
+ torch.tensor([0.0, 0.0, 1.0]),
+ torch.tensor([0.0, 0.0, 1.0]),
+ ], [0, 0, 1, 2, 2, 2], [0, 0, 1, 2, 1, 0])
+ ]
+
+ # Test with score (use score instead of label if score exists)
+ metric = METRICS.build(dict(type='ConfusionMatrix'))
+ metric.process(None, pred)
+ res = metric.evaluate(6)
+ self.assertIsInstance(res, dict)
+ self.assertTensorEqual(
+ res['confusion_matrix/result'],
+ torch.tensor([
+ [2, 0, 1],
+ [0, 1, 1],
+ [0, 0, 1],
+ ]))
+
+ # Test with label
+ for sample in pred:
+ del sample['pred_scores']
+ metric = METRICS.build(dict(type='ConfusionMatrix'))
+ metric.process(None, pred)
+ with self.assertRaisesRegex(AssertionError,
+ 'Please specify the `num_classes`'):
+ metric.evaluate(6)
+
+ metric = METRICS.build(dict(type='ConfusionMatrix', num_classes=3))
+ metric.process(None, pred)
+ self.assertIsInstance(res, dict)
+ self.assertTensorEqual(
+ res['confusion_matrix/result'],
+ torch.tensor([
+ [2, 0, 1],
+ [0, 1, 1],
+ [0, 0, 1],
+ ]))
+
+ def test_calculate(self):
+ y_true = np.array([0, 0, 1, 2, 1, 0])
+ y_label = torch.tensor([0, 0, 1, 2, 2, 2])
+ y_score = [
+ [0.7, 0.0, 0.3],
+ [0.5, 0.2, 0.3],
+ [0.4, 0.5, 0.1],
+ [0.0, 0.0, 1.0],
+ [0.0, 0.0, 1.0],
+ [0.0, 0.0, 1.0],
+ ]
+
+ # Test with score
+ cm = ConfusionMatrix.calculate(y_score, y_true)
+ self.assertIsInstance(cm, torch.Tensor)
+ self.assertTensorEqual(
+ cm, torch.tensor([
+ [2, 0, 1],
+ [0, 1, 1],
+ [0, 0, 1],
+ ]))
+
+ # Test with label
+ with self.assertRaisesRegex(AssertionError,
+ 'Please specify the `num_classes`'):
+ ConfusionMatrix.calculate(y_label, y_true)
+
+ cm = ConfusionMatrix.calculate(y_label, y_true, num_classes=3)
+ self.assertIsInstance(cm, torch.Tensor)
+ self.assertTensorEqual(
+ cm, torch.tensor([
+ [2, 0, 1],
+ [0, 1, 1],
+ [0, 0, 1],
+ ]))
+
+ # Test with invalid inputs
+ with self.assertRaisesRegex(TypeError, " is not"):
+ ConfusionMatrix.calculate(y_label, 'hi')
+
+ def test_plot(self):
+ import matplotlib.pyplot as plt
+
+ cm = torch.tensor([[2, 0, 1], [0, 1, 1], [0, 0, 1]])
+ fig = ConfusionMatrix.plot(cm, include_values=True, show=False)
+
+ self.assertIsInstance(fig, plt.Figure)
+
+ def assertTensorEqual(self,
+ tensor: torch.Tensor,
+ value: float,
+ msg=None,
+ **kwarg):
+ tensor = tensor.to(torch.float32)
+ value = torch.tensor(value).float()
+ try:
+ torch.testing.assert_allclose(tensor, value, **kwarg)
+ except AssertionError as e:
+ self.fail(self._formatMessage(msg, str(e)))
diff --git a/tests/evaluation/metrics/test_metric_utils.py b/tests/evaluation/metrics/test_metric_utils.py
index 091a728bc4..5eeb12e199 100644
--- a/tests/evaluation/metrics/test_metric_utils.py
+++ b/tests/evaluation/metrics/test_metric_utils.py
@@ -151,7 +151,7 @@ def gt_confusion_matrix(gt_labels, pred_labels, normalize=None):
confusion_mat = np.delete(confusion_mat, del_index, axis=1)
if normalize is not None:
- confusion_mat = np.array(confusion_mat, dtype=np.float)
+ confusion_mat = np.array(confusion_mat, dtype=np.float64)
m, n = confusion_mat.shape
if normalize == 'true':
for i in range(m):
diff --git a/tests/models/backbones/test_resnet3d_slowfast.py b/tests/models/backbones/test_resnet3d_slowfast.py
index a3de73a620..d91e183583 100644
--- a/tests/models/backbones/test_resnet3d_slowfast.py
+++ b/tests/models/backbones/test_resnet3d_slowfast.py
@@ -11,18 +11,13 @@ def test_slowfast_backbone():
"""Test SlowFast backbone."""
with pytest.raises(TypeError):
# cfg should be a dict
- ResNet3dSlowFast(None, slow_pathway=list(['foo', 'bar']))
- with pytest.raises(TypeError):
- # pretrained should be a str
- sf_50 = ResNet3dSlowFast(dict(foo='bar'))
- sf_50.init_weights()
+ ResNet3dSlowFast(slow_pathway=list(['foo', 'bar']))
with pytest.raises(KeyError):
# pathway type should be implemented
- ResNet3dSlowFast(None, slow_pathway=dict(type='resnext'))
+ ResNet3dSlowFast(slow_pathway=dict(type='resnext'))
# test slowfast with slow inflated
sf_50_inflate = ResNet3dSlowFast(
- None,
slow_pathway=dict(
type='resnet3d',
depth=50,
@@ -56,14 +51,7 @@ def test_slowfast_backbone():
# slowfast w/o lateral connection inference test
input_shape = (1, 3, 8, 64, 64)
imgs = generate_backbone_demo_inputs(input_shape)
- # parrots 3dconv is only implemented on gpu
- if torch.__version__ == 'parrots':
- if torch.cuda.is_available():
- sf_50_wo_lateral = sf_50_wo_lateral.cuda()
- imgs_gpu = imgs.cuda()
- feat = sf_50_wo_lateral(imgs_gpu)
- else:
- feat = sf_50_wo_lateral(imgs)
+ feat = sf_50_wo_lateral(imgs)
assert isinstance(feat, tuple)
assert feat[0].shape == torch.Size([1, 2048, 1, 2, 2])
@@ -104,21 +92,14 @@ def test_slowfast_backbone():
assert param.requires_grad is True
# test slowfast with normal config
- sf_50 = ResNet3dSlowFast(None)
+ sf_50 = ResNet3dSlowFast()
sf_50.init_weights()
sf_50.train()
# slowfast inference test
input_shape = (1, 3, 8, 64, 64)
imgs = generate_backbone_demo_inputs(input_shape)
- # parrots 3dconv is only implemented on gpu
- if torch.__version__ == 'parrots':
- if torch.cuda.is_available():
- sf_50 = sf_50.cuda()
- imgs_gpu = imgs.cuda()
- feat = sf_50(imgs_gpu)
- else:
- feat = sf_50(imgs)
+ feat = sf_50(imgs)
assert isinstance(feat, tuple)
assert feat[0].shape == torch.Size([1, 2048, 1, 2, 2])
diff --git a/tests/models/backbones/test_resnet3d_slowonly.py b/tests/models/backbones/test_resnet3d_slowonly.py
index 9603469c37..47c7036451 100644
--- a/tests/models/backbones/test_resnet3d_slowonly.py
+++ b/tests/models/backbones/test_resnet3d_slowonly.py
@@ -10,7 +10,7 @@ def test_slowonly_backbone():
"""Test SlowOnly backbone."""
with pytest.raises(AssertionError):
# SlowOnly should contain no lateral connection
- ResNet3dSlowOnly(50, None, lateral=True)
+ ResNet3dSlowOnly(depth=50, pretrained=None, lateral=True)
# test SlowOnly for PoseC3D
so_50 = ResNet3dSlowOnly(
@@ -31,7 +31,7 @@ def test_slowonly_backbone():
so_50.train()
# test SlowOnly with normal config
- so_50 = ResNet3dSlowOnly(50, None)
+ so_50 = ResNet3dSlowOnly(depth=50, pretrained=None)
so_50.init_weights()
so_50.train()
diff --git a/tests/models/backbones/test_rgbposeconv3d.py b/tests/models/backbones/test_rgbposeconv3d.py
new file mode 100644
index 0000000000..848a73ab45
--- /dev/null
+++ b/tests/models/backbones/test_rgbposeconv3d.py
@@ -0,0 +1,27 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmaction.models import RGBPoseConv3D
+from mmaction.testing import generate_backbone_demo_inputs
+
+
+def test_rgbposeconv3d():
+ """Test RGBPoseConv3D backbone."""
+
+ with pytest.raises(AssertionError):
+ RGBPoseConv3D(pose_drop_path=1.1, rgb_drop_path=1.1)
+
+ rgbposec3d = RGBPoseConv3D()
+ rgbposec3d.init_weights()
+ rgbposec3d.train()
+
+ imgs_shape = (1, 3, 8, 224, 224)
+ heatmap_imgs_shape = (1, 17, 32, 56, 56)
+ imgs = generate_backbone_demo_inputs(imgs_shape)
+ heatmap_imgs = generate_backbone_demo_inputs(heatmap_imgs_shape)
+
+ (x_rgb, x_pose) = rgbposec3d(imgs, heatmap_imgs)
+
+ assert x_rgb.shape == torch.Size([1, 2048, 8, 7, 7])
+ assert x_pose.shape == torch.Size([1, 512, 32, 7, 7])
diff --git a/tests/models/backbones/test_uniformerv2.py b/tests/models/backbones/test_uniformerv2.py
index 3345892eb7..4858001c4d 100644
--- a/tests/models/backbones/test_uniformerv2.py
+++ b/tests/models/backbones/test_uniformerv2.py
@@ -28,6 +28,7 @@ def test_uniformerv2_backbone():
n_head=12,
mlp_factor=4.,
drop_path_rate=0.,
+ clip_pretrained=False,
mlp_dropout=[0.5, 0.5, 0.5, 0.5])
model.init_weights()
@@ -56,6 +57,7 @@ def test_uniformerv2_backbone():
n_head=12,
mlp_factor=4.,
drop_path_rate=0.,
+ clip_pretrained=False,
mlp_dropout=[0.5, 0.5, 0.5, 0.5])
model.init_weights()
diff --git a/tests/models/data_preprocessors/__init__.py b/tests/models/data_preprocessors/__init__.py
new file mode 100644
index 0000000000..ef101fec61
--- /dev/null
+++ b/tests/models/data_preprocessors/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/tests/models/data_preprocessors/test_data_preprocessor.py b/tests/models/data_preprocessors/test_data_preprocessor.py
new file mode 100644
index 0000000000..a4a3d851d7
--- /dev/null
+++ b/tests/models/data_preprocessors/test_data_preprocessor.py
@@ -0,0 +1,97 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from copy import deepcopy
+
+import pytest
+import torch
+from numpy.testing import assert_array_equal
+
+from mmaction.models import ActionDataPreprocessor
+from mmaction.structures import ActionDataSample
+from mmaction.utils import register_all_modules
+
+
+def generate_dummy_data(batch_size, input_shape):
+ data = {
+ 'inputs':
+ [torch.randint(0, 255, input_shape) for _ in range(batch_size)],
+ 'data_samples':
+ [ActionDataSample().set_gt_labels(2) for _ in range(batch_size)]
+ }
+ return data
+
+
+def test_data_preprocessor():
+ with pytest.raises(ValueError):
+ ActionDataPreprocessor(
+ mean=[1, 1], std=[0, 0], format_shape='NCTHW_Heatmap')
+ with pytest.raises(ValueError):
+ psr = ActionDataPreprocessor(format_shape='NCTHW_Heatmap', to_rgb=True)
+ psr(generate_dummy_data(1, (3, 224, 224)))
+
+ raw_data = generate_dummy_data(2, (1, 3, 8, 224, 224))
+ psr = ActionDataPreprocessor(
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW')
+ data = psr(deepcopy(raw_data))
+ assert data['inputs'].shape == (2, 1, 3, 8, 224, 224)
+ assert_array_equal(data['inputs'][0],
+ (raw_data['inputs'][0] - psr.mean) / psr.std)
+ assert_array_equal(data['inputs'][1],
+ (raw_data['inputs'][1] - psr.mean) / psr.std)
+
+ psr = ActionDataPreprocessor(format_shape='NCTHW', to_rgb=True)
+ data = psr(deepcopy(raw_data))
+ assert data['inputs'].shape == (2, 1, 3, 8, 224, 224)
+ assert_array_equal(data['inputs'][0], raw_data['inputs'][0][:, [2, 1, 0]])
+ assert_array_equal(data['inputs'][1], raw_data['inputs'][1][:, [2, 1, 0]])
+
+ register_all_modules()
+ psr = ActionDataPreprocessor(
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW',
+ blending=dict(type='MixupBlending', num_classes=5))
+ data = psr(deepcopy(raw_data), training=True)
+ assert data['data_samples'][0].gt_labels.item.shape == (5, )
+ assert data['data_samples'][1].gt_labels.item.shape == (5, )
+
+ raw_data = generate_dummy_data(2, (1, 3, 224, 224))
+ psr = ActionDataPreprocessor(
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCHW',
+ to_rgb=True)
+ data = psr(deepcopy(raw_data))
+ assert_array_equal(data['inputs'][0],
+ (raw_data['inputs'][0][:, [2, 1, 0]] - psr.mean) /
+ psr.std)
+ assert_array_equal(data['inputs'][1],
+ (raw_data['inputs'][1][:, [2, 1, 0]] - psr.mean) /
+ psr.std)
+
+ psr = ActionDataPreprocessor()
+ data = psr(deepcopy(raw_data))
+ assert data['inputs'].shape == (2, 1, 3, 224, 224)
+ assert_array_equal(data['inputs'][0], raw_data['inputs'][0])
+ assert_array_equal(data['inputs'][1], raw_data['inputs'][1])
+
+ raw_2d_data = generate_dummy_data(2, (3, 224, 224))
+ raw_3d_data = generate_dummy_data(2, (1, 3, 8, 224, 224))
+ raw_data = (raw_2d_data, raw_3d_data)
+
+ psr = ActionDataPreprocessor(
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='MIX2d3d')
+ data = psr(raw_data)
+ assert_array_equal(data[0]['inputs'][0],
+ (raw_2d_data['inputs'][0] - psr.mean.view(-1, 1, 1)) /
+ psr.std.view(-1, 1, 1))
+ assert_array_equal(data[0]['inputs'][1],
+ (raw_2d_data['inputs'][1] - psr.mean.view(-1, 1, 1)) /
+ psr.std.view(-1, 1, 1))
+ assert_array_equal(data[1]['inputs'][0],
+ (raw_3d_data['inputs'][0] - psr.mean) / psr.std)
+ assert_array_equal(data[1]['inputs'][1],
+ (raw_3d_data['inputs'][1] - psr.mean) / psr.std)
diff --git a/tests/models/data_preprocessors/test_multimodal_data_preprocessor.py b/tests/models/data_preprocessors/test_multimodal_data_preprocessor.py
new file mode 100644
index 0000000000..35483bd5d9
--- /dev/null
+++ b/tests/models/data_preprocessors/test_multimodal_data_preprocessor.py
@@ -0,0 +1,94 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+
+import pytest
+import torch
+from numpy.testing import assert_array_equal
+
+from mmaction.models import MultiModalDataPreprocessor
+from mmaction.structures import ActionDataSample
+from mmaction.utils import register_all_modules
+
+
+def generate_dummy_data(batch_size, input_keys, input_shapes):
+ data = dict()
+ data['data_samples'] = [
+ ActionDataSample().set_gt_labels(2) for _ in range(batch_size)
+ ]
+ data['inputs'] = dict()
+ for key, shape in zip(input_keys, input_shapes):
+ data['inputs'][key] = [
+ torch.randint(0, 255, shape) for _ in range(batch_size)
+ ]
+
+ return data
+
+
+def test_multimodal_data_preprocessor():
+ with pytest.raises(AssertionError):
+ MultiModalDataPreprocessor(
+ preprocessors=dict(imgs=dict(format_shape='NCTHW')))
+
+ register_all_modules()
+ data_keys = ('imgs', 'heatmap_imgs')
+ data_shapes = ((1, 3, 8, 224, 224), (1, 17, 32, 64, 64))
+ raw_data = generate_dummy_data(2, data_keys, data_shapes)
+
+ psr = MultiModalDataPreprocessor(
+ preprocessors=dict(
+ imgs=dict(
+ type='ActionDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCTHW'),
+ heatmap_imgs=dict(type='ActionDataPreprocessor')))
+
+ data = psr(copy.deepcopy(raw_data))
+ assert data['inputs']['imgs'].shape == (2, 1, 3, 8, 224, 224)
+ assert data['inputs']['heatmap_imgs'].shape == (2, 1, 17, 32, 64, 64)
+ psr_imgs = psr.preprocessors['imgs']
+ assert_array_equal(data['inputs']['imgs'][0],
+ (raw_data['inputs']['imgs'][0] - psr_imgs.mean) /
+ psr_imgs.std)
+ assert_array_equal(data['inputs']['imgs'][1],
+ (raw_data['inputs']['imgs'][1] - psr_imgs.mean) /
+ psr_imgs.std)
+ assert_array_equal(data['inputs']['heatmap_imgs'][0],
+ raw_data['inputs']['heatmap_imgs'][0])
+ assert_array_equal(data['inputs']['heatmap_imgs'][1],
+ raw_data['inputs']['heatmap_imgs'][1])
+
+ data_keys = ('imgs_2D', 'imgs_3D')
+ data_shapes = ((1, 3, 224, 224), (1, 3, 8, 224, 224))
+ raw_data = generate_dummy_data(2, data_keys, data_shapes)
+
+ psr = MultiModalDataPreprocessor(
+ preprocessors=dict(
+ imgs_2D=dict(
+ type='ActionDataPreprocessor',
+ mean=[123.675, 116.28, 103.53],
+ std=[58.395, 57.12, 57.375],
+ format_shape='NCHW'),
+ imgs_3D=dict(
+ type='ActionDataPreprocessor',
+ mean=[127.5, 127.5, 127.5],
+ std=[57.5, 57.5, 57.5],
+ format_shape='NCTHW')))
+
+ data = psr(copy.deepcopy(raw_data))
+ assert data['inputs']['imgs_2D'].shape == (2, 1, 3, 224, 224)
+ assert data['inputs']['imgs_3D'].shape == (2, 1, 3, 8, 224, 224)
+ psr_imgs2d = psr.preprocessors['imgs_2D']
+ psr_imgs3d = psr.preprocessors['imgs_3D']
+ assert_array_equal(data['inputs']['imgs_2D'][0],
+ (raw_data['inputs']['imgs_2D'][0] - psr_imgs2d.mean) /
+ psr_imgs2d.std)
+ assert_array_equal(data['inputs']['imgs_2D'][1],
+ (raw_data['inputs']['imgs_2D'][1] - psr_imgs2d.mean) /
+ psr_imgs2d.std)
+ assert_array_equal(data['inputs']['imgs_3D'][0],
+ (raw_data['inputs']['imgs_3D'][0] - psr_imgs3d.mean) /
+ psr_imgs3d.std)
+ assert_array_equal(data['inputs']['imgs_3D'][1],
+ (raw_data['inputs']['imgs_3D'][1] - psr_imgs3d.mean) /
+ psr_imgs3d.std)
diff --git a/tests/models/heads/test_rgbpose_head.py b/tests/models/heads/test_rgbpose_head.py
new file mode 100644
index 0000000000..919e02a4bd
--- /dev/null
+++ b/tests/models/heads/test_rgbpose_head.py
@@ -0,0 +1,41 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+
+from mmaction.models import RGBPoseHead
+
+
+def test_rgbpose_head():
+ """Test RGBPoseHead."""
+ rgbpose_head = RGBPoseHead(
+ num_classes=4,
+ in_channels=[2048, 512],
+ dropout=dict(rgb=0.51, pose=0.49))
+ rgbpose_head.init_weights()
+
+ assert rgbpose_head.num_classes == 4
+ assert rgbpose_head.dropout == dict(rgb=0.51, pose=0.49)
+ assert rgbpose_head.in_channels == [2048, 512]
+ assert rgbpose_head.init_std == 0.01
+
+ assert isinstance(rgbpose_head.dropout_rgb, nn.Dropout)
+ assert isinstance(rgbpose_head.dropout_pose, nn.Dropout)
+ assert rgbpose_head.dropout_rgb.p == rgbpose_head.dropout['rgb']
+ assert rgbpose_head.dropout_pose.p == rgbpose_head.dropout['pose']
+
+ assert isinstance(rgbpose_head.fc_rgb, nn.Linear)
+ assert isinstance(rgbpose_head.fc_pose, nn.Linear)
+ assert rgbpose_head.fc_rgb.in_features == rgbpose_head.in_channels[0]
+ assert rgbpose_head.fc_rgb.out_features == rgbpose_head.num_classes
+ assert rgbpose_head.fc_pose.in_features == rgbpose_head.in_channels[1]
+ assert rgbpose_head.fc_pose.out_features == rgbpose_head.num_classes
+
+ assert isinstance(rgbpose_head.avg_pool, nn.AdaptiveAvgPool3d)
+ assert rgbpose_head.avg_pool.output_size == (1, 1, 1)
+
+ feat_rgb = torch.rand((2, 2048, 8, 7, 7))
+ feat_pose = torch.rand((2, 512, 32, 7, 7))
+
+ cls_scores = rgbpose_head((feat_rgb, feat_pose))
+ assert cls_scores['rgb'].shape == torch.Size([2, 4])
+ assert cls_scores['pose'].shape == torch.Size([2, 4])
diff --git a/tests/models/recognizers/test_recognizer2d.py b/tests/models/recognizers/test_recognizer2d.py
index 300e63b460..773bc0806f 100644
--- a/tests/models/recognizers/test_recognizer2d.py
+++ b/tests/models/recognizers/test_recognizer2d.py
@@ -1,4 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
+import platform
+
+import pytest
import torch
from mmaction.registry import MODELS
@@ -104,11 +107,20 @@ def test_tsn():
def test_tsm():
register_all_modules()
+ config = get_recognizer_cfg(
+ 'tsm/tsm_imagenet-pretrained-mobilenetv2_8xb16-1x1x8-50e_kinetics400-rgb.py' # noqa: E501
+ )
+ config.model['backbone']['pretrained'] = None
+
+ recognizer = MODELS.build(config.model)
+ recognizer.init_weights()
+
config = get_recognizer_cfg(
'tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py')
config.model['backbone']['pretrained'] = None
recognizer = MODELS.build(config.model)
+ recognizer.init_weights()
input_shape = (1, 8, 3, 32, 32)
demo_inputs = generate_recognizer_demo_inputs(input_shape)
@@ -182,6 +194,7 @@ def test_trn():
recognizer(one_img, gradcam=True)
+@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_tpn():
register_all_modules()
config = get_recognizer_cfg(
@@ -190,7 +203,7 @@ def test_tpn():
recognizer = MODELS.build(config.model)
- input_shape = (1, 8, 3, 224, 224)
+ input_shape = (1, 8, 3, 32, 32)
demo_inputs = generate_recognizer_demo_inputs(input_shape)
imgs = demo_inputs['imgs']
diff --git a/tests/models/utils/test_gradcam.py b/tests/models/utils/test_gradcam.py
index d1a39ef87c..5fc4173be0 100644
--- a/tests/models/utils/test_gradcam.py
+++ b/tests/models/utils/test_gradcam.py
@@ -119,7 +119,7 @@ def test_r2plus1d():
recognizer = MODELS.build(config.model)
recognizer.cfg = config
- input_shape = (1, 3, 3, 8, 32, 32)
+ input_shape = (1, 3, 3, 8, 16, 16)
target_layer_name = 'backbone/layer4/1/relu'
_do_test_3D_models(recognizer, target_layer_name, input_shape)
@@ -167,7 +167,7 @@ def test_csn():
recognizer = MODELS.build(config.model)
recognizer.cfg = config
- input_shape = (1, 1, 3, 32, 32, 32)
+ input_shape = (1, 1, 3, 32, 16, 16)
target_layer_name = 'backbone/layer4/1/relu'
_do_test_3D_models(recognizer, target_layer_name, input_shape)
@@ -230,6 +230,6 @@ def test_x3d():
config.model['backbone']['pretrained'] = None
recognizer = MODELS.build(config.model)
recognizer.cfg = config
- input_shape = (1, 1, 3, 13, 32, 32)
+ input_shape = (1, 1, 3, 13, 16, 16)
target_layer_name = 'backbone/layer4/1/relu'
_do_test_3D_models(recognizer, target_layer_name, input_shape)
diff --git a/tests/utils/test_misc.py b/tests/utils/test_misc.py
new file mode 100644
index 0000000000..eeeba0d402
--- /dev/null
+++ b/tests/utils/test_misc.py
@@ -0,0 +1,26 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import os.path as osp
+import platform
+from tempfile import TemporaryDirectory
+
+import pytest
+
+from mmaction.utils import frame_extract
+
+
+@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
+def test_frame_extract():
+ data_prefix = osp.normpath(osp.join(osp.dirname(__file__), '../data'))
+ video_path = osp.join(data_prefix, 'test.mp4')
+ with TemporaryDirectory() as tmp_dir:
+ # assign short_side
+ frame_paths, frames = frame_extract(
+ video_path, short_side=100, out_dir=tmp_dir)
+ assert osp.exists(tmp_dir) and \
+ len(os.listdir(f'{tmp_dir}/test')) == len(frame_paths)
+ assert min(frames[0].shape[:2]) == 100
+ # default short_side
+ frame_paths, frames = frame_extract(video_path, out_dir=tmp_dir)
+ assert osp.exists(tmp_dir) and \
+ len(os.listdir(f'{tmp_dir}/test')) == len(frame_paths)
diff --git a/tests/visualization/test_action_visualizer.py b/tests/visualization/test_action_visualizer.py
index 3c7a1db59d..c86b324af9 100644
--- a/tests/visualization/test_action_visualizer.py
+++ b/tests/visualization/test_action_visualizer.py
@@ -1,5 +1,8 @@
# Copyright (c) OpenMMLab. All rights reserved.
+import platform
+
import decord
+import pytest
import torch
from mmengine.structures import LabelData
@@ -7,6 +10,7 @@
from mmaction.visualization import ActionVisualizer
+@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_visualizer():
video = decord.VideoReader('./demo/demo.mp4')
video = video.get_batch(range(32)).asnumpy()
diff --git a/tests/visualization/test_video_backend.py b/tests/visualization/test_video_backend.py
index 0de82465ee..c5153d812d 100644
--- a/tests/visualization/test_video_backend.py
+++ b/tests/visualization/test_video_backend.py
@@ -1,11 +1,13 @@
# Copyright (c) OpenMMLab. All rights reserved.
import os
import os.path as osp
+import platform
import time
from pathlib import Path
from tempfile import TemporaryDirectory
import decord
+import pytest
import torch
from mmengine.structures import LabelData
@@ -16,6 +18,7 @@
register_all_modules()
+@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_local_visbackend():
video = decord.VideoReader('./demo/demo.mp4')
video = video.get_batch(range(32)).asnumpy()
@@ -37,6 +40,7 @@ def test_local_visbackend():
return
+@pytest.mark.skipif(platform.system() == 'Windows', reason='Windows mem limit')
def test_tensorboard_visbackend():
video = decord.VideoReader('./demo/demo.mp4')
video = video.get_batch(range(32)).asnumpy()
diff --git a/tools/analysis_tools/confusion_matrix.py b/tools/analysis_tools/confusion_matrix.py
new file mode 100644
index 0000000000..224b8364bc
--- /dev/null
+++ b/tools/analysis_tools/confusion_matrix.py
@@ -0,0 +1,129 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import tempfile
+
+import torch
+from mmengine import dump, list_from_file, load
+from mmengine.config import Config, DictAction
+from mmengine.evaluator import Evaluator
+from mmengine.runner import Runner
+
+from mmaction.evaluation import ConfusionMatrix
+from mmaction.registry import DATASETS
+from mmaction.utils import register_all_modules
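+
+# Example usage (file names below are illustrative placeholders):
+#   python tools/analysis_tools/confusion_matrix.py CONFIG CKPT.pth --show
+#   python tools/analysis_tools/confusion_matrix.py CONFIG RESULT.pkl \
+#       --out confusion_matrix.pkl --include-values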
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='Eval a checkpoint and draw the confusion matrix.')
+ parser.add_argument('config', help='test config file path')
+ parser.add_argument(
+ 'ckpt_or_result',
+ type=str,
+ help='The checkpoint file (.pth) or '
+        'dumped predictions pickle file (.pkl).')
+ parser.add_argument('--out', help='the file to save the confusion matrix.')
+ parser.add_argument(
+ '--show',
+ action='store_true',
+        help='whether to display the metric result with matplotlib.')
+ parser.add_argument(
+ '--show-path', type=str, help='Path to save the visualization image.')
+ parser.add_argument(
+ '--include-values',
+ action='store_true',
+ help='To draw the values in the figure.')
+ parser.add_argument('--label-file', default=None, help='Labelmap file')
+ parser.add_argument(
+ '--target-classes',
+ type=int,
+ nargs='+',
+ default=[],
+        help='Selected classes to evaluate; other classes are ignored.')
+ parser.add_argument(
+ '--cmap',
+ type=str,
+ default='viridis',
+ help='The color map to use. Defaults to "viridis".')
+ parser.add_argument(
+ '--cfg-options',
+ nargs='+',
+ action=DictAction,
+ help='override some settings in the used config, the key-value pair '
+ 'in xxx=yyy format will be merged into config file. If the value to '
+ 'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+ 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+ 'Note that the quotation marks are necessary and that no white space '
+ 'is allowed.')
+ args = parser.parse_args()
+ return args
+
+
+def main():
+ args = parse_args()
+
+ # register all modules in mmaction into the registries
+ # do not init the default scope here because it will be init in the runner
+ register_all_modules(init_default_scope=False)
+
+ # load config
+ cfg = Config.fromfile(args.config)
+ if args.cfg_options is not None:
+ cfg.merge_from_dict(args.cfg_options)
+
+ if args.ckpt_or_result.endswith('.pth'):
+ # Set confusion matrix as the metric.
+ cfg.test_evaluator = dict(type='ConfusionMatrix')
+
+ cfg.load_from = str(args.ckpt_or_result)
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ cfg.work_dir = tmpdir
+ runner = Runner.from_cfg(cfg)
+ classes = runner.test_loop.dataloader.dataset.metainfo.get(
+ 'classes')
+ cm = runner.test()['confusion_matrix/result']
+ else:
+ predictions = load(args.ckpt_or_result)
+ evaluator = Evaluator(ConfusionMatrix())
+ metrics = evaluator.offline_evaluate(predictions, None)
+ cm = metrics['confusion_matrix/result']
+ try:
+ # Try to build the dataset.
+ dataset = DATASETS.build({
+ **cfg.test_dataloader.dataset, 'pipeline': []
+ })
+ classes = dataset.metainfo.get('classes')
+ except Exception:
+ classes = None
+
+ if args.label_file is not None:
+ classes = list_from_file(args.label_file)
+ if classes is None:
+ num_classes = cm.shape[0]
+ classes = list(range(num_classes))
+
+ if args.target_classes:
+ assert len(args.target_classes) > 1, \
+            'Please select more than one class.'
+ target_idx = torch.tensor(args.target_classes)
+ cm = cm[target_idx][:, target_idx]
+ classes = [classes[idx] for idx in target_idx]
+
+ if args.out is not None:
+ dump(cm, args.out)
+
+ if args.show or args.show_path is not None:
+ fig = ConfusionMatrix.plot(
+ cm,
+ show=args.show,
+ classes=classes,
+ include_values=args.include_values,
+ cmap=args.cmap)
+ if args.show_path is not None:
+ fig.savefig(args.show_path)
+ print(f'The confusion matrix is saved at {args.show_path}.')
+
+
+if __name__ == '__main__':
+ main()
diff --git a/tools/analysis_tools/get_flops.py b/tools/analysis_tools/get_flops.py
index b89f5db5ad..fbec21887f 100644
--- a/tools/analysis_tools/get_flops.py
+++ b/tools/analysis_tools/get_flops.py
@@ -1,21 +1,16 @@
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
-import torch
-
-try:
- from fvcore.nn import (ActivationCountAnalysis, FlopCountAnalysis,
- flop_count_str, flop_count_table, parameter_count)
-except ImportError:
- print('You may need to install fvcore for flops computation, '
- 'and you can use `pip install -r requirements/optional.txt` '
- 'to set up the environment')
-from fvcore.nn.print_model_statistics import _format_size
from mmengine import Config
from mmengine.registry import init_default_scope
from mmaction.registry import MODELS
+try:
+ from mmengine.analysis import get_model_complexity_info
+except ImportError:
+    raise ImportError('Please upgrade mmengine to use mmengine.analysis')
+
def parse_args():
parser = argparse.ArgumentParser(description='Get model flops and params')
@@ -39,17 +34,17 @@ def main():
elif len(args.shape) == 2:
input_shape = (1, 3) + tuple(args.shape)
elif len(args.shape) == 4:
- # n, c, h, w = args.shape
+ # n, c, h, w = args.shape for 2D recognizer
input_shape = tuple(args.shape)
elif len(args.shape) == 5:
- # n, c, t, h, w = args.shape
+ # n, c, t, h, w = args.shape for 3D recognizer or
+ # n, m, t, v, c = args.shape for GCN-based recognizer
input_shape = tuple(args.shape)
else:
raise ValueError('invalid input shape')
cfg = Config.fromfile(args.config)
init_default_scope(cfg.get('default_scope', 'mmaction'))
-
model = MODELS.build(cfg.model)
model.eval()
@@ -60,28 +55,14 @@ def main():
'FLOPs counter is currently not currently supported with {}'.
format(model.__class__.__name__))
- inputs = (torch.randn((1, *input_shape)), )
- flops_ = FlopCountAnalysis(model, inputs)
- activations_ = ActivationCountAnalysis(model, inputs)
-
- flops = _format_size(flops_.total())
- activations = _format_size(activations_.total())
- params = _format_size(parameter_count(model)[''])
-
- flop_table = flop_count_table(
- flops=flops_,
- activations=activations_,
- show_param_shapes=True,
- )
- flop_str = flop_count_str(flops=flops_, activations=activations_)
-
- print('\n' + flop_str)
- print('\n' + flop_table)
-
+ analysis_results = get_model_complexity_info(model, input_shape)
+ flops = analysis_results['flops_str']
+ params = analysis_results['params_str']
+ table = analysis_results['out_table']
+ print(table)
split_line = '=' * 30
- print(f'{split_line}\nInput shape: {input_shape}\n'
- f'Flops: {flops}\nParams: {params}\n'
- f'Activation: {activations}\n{split_line}')
+ print(f'\n{split_line}\nInput shape: {input_shape}\n'
+ f'Flops: {flops}\nParams: {params}\n{split_line}')
print('!!!Please be cautious if you use the results in papers. '
'You may need to check if all ops are supported and verify that the '
'flops computation is correct.')
diff --git a/tools/data/activitynet/process_annotations.py b/tools/data/activitynet/process_annotations.py
index 09ed5b5c8f..9374281a64 100644
--- a/tools/data/activitynet/process_annotations.py
+++ b/tools/data/activitynet/process_annotations.py
@@ -18,7 +18,7 @@ def load_json(file):
anno_database = load_json(ann_file)
-video_record = np.loadtxt(info_file, dtype=np.str, delimiter=',', skiprows=1)
+video_record = np.loadtxt(info_file, dtype=str, delimiter=',', skiprows=1)
video_dict_train = {}
video_dict_val = {}
@@ -29,8 +29,8 @@ def load_json(file):
video_name = video_item[0]
video_info = anno_database[video_name]
video_subset = video_item[5]
- video_info['fps'] = video_item[3].astype(np.float)
- video_info['rfps'] = video_item[4].astype(np.float)
+ video_info['fps'] = video_item[3].astype(np.float64)
+ video_info['rfps'] = video_item[4].astype(np.float64)
video_dict_full[video_name] = video_info
if video_subset == 'training':
video_dict_train[video_name] = video_info
diff --git a/tools/data/anno_txt2json.py b/tools/data/anno_txt2json.py
index fcefc7778e..f5b1f9f736 100644
--- a/tools/data/anno_txt2json.py
+++ b/tools/data/anno_txt2json.py
@@ -1,7 +1,7 @@
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
-import mmcv
+import mmengine
def parse_args():
@@ -100,4 +100,4 @@ def lines2dictlist(lines, format):
result = lines2dictlist(lines, args.format)
if args.output is None:
args.output = args.annofile.replace('.txt', '.json')
- mmcv.dump(result, args.output)
+ mmengine.dump(result, args.output)
diff --git a/tools/data/build_audio_features.py b/tools/data/build_audio_features.py
index 05f5978083..28356a0e64 100644
--- a/tools/data/build_audio_features.py
+++ b/tools/data/build_audio_features.py
@@ -6,7 +6,7 @@
import sys
from multiprocessing import Pool
-import mmcv
+import mmengine
import numpy as np
from scipy.io import wavfile
@@ -295,7 +295,7 @@ def extract_audio_feature(wav_path, audio_tools, mel_out_dir):
parser.add_argument('--part', type=str, default='1/1')
args = parser.parse_args()
- mmcv.mkdir_or_exist(args.spectrogram_save_path)
+ mmengine.mkdir_or_exist(args.spectrogram_save_path)
files = glob.glob(
osp.join(args.audio_home_path, '*/' * args.level, '*' + args.ext))
diff --git a/tools/data/build_file_list.py b/tools/data/build_file_list.py
index 0ba15e75d0..11a1322854 100644
--- a/tools/data/build_file_list.py
+++ b/tools/data/build_file_list.py
@@ -5,7 +5,7 @@
import os.path as osp
import random
-from mmcv.runner import set_random_seed
+from mmengine.runner import set_random_seed
from tools.data.anno_txt2json import lines2dictlist
from tools.data.parse_file_list import (parse_directory, parse_diving48_splits,
diff --git a/tools/data/extract_audio.py b/tools/data/extract_audio.py
index 6f56de2691..78d95d8ea1 100644
--- a/tools/data/extract_audio.py
+++ b/tools/data/extract_audio.py
@@ -5,7 +5,7 @@
import os.path as osp
from multiprocessing import Pool
-import mmcv
+import mmengine
def extract_audio_wav(line):
@@ -47,7 +47,7 @@ def parse_args():
if __name__ == '__main__':
args = parse_args()
- mmcv.mkdir_or_exist(args.dst_root)
+ mmengine.mkdir_or_exist(args.dst_root)
print('Reading videos from folder: ', args.root)
print('Extension of videos: ', args.ext)
diff --git a/tools/data/kinetics/README.md b/tools/data/kinetics/README.md
index 4fc7b6bb1e..0df8f8634f 100644
--- a/tools/data/kinetics/README.md
+++ b/tools/data/kinetics/README.md
@@ -24,6 +24,8 @@ Because of the expirations of some YouTube links, the sizes of kinetics dataset
| Dataset | training videos | validation videos |
| :---------: | :-------------: | :---------------: |
| kinetics400 | 240436 | 19796 |
+| Kinetics600 | 383393 | 27910 |
+| Kinetics700 | 542357 | 34824 |
:::
@@ -46,7 +48,16 @@ bash download_backup_annotations.sh ${DATASET}
## Step 2. Prepare Videos
-Then, you can run the following script to prepare videos.
+### Option 1: Download from OpenDataLab
+
+**Recommended**: [OpenDataLab](https://opendatalab.com/) provides the Kinetics dataset ([Kinetics400](https://opendatalab.com/Kinetics-400), [Kinetics600](https://opendatalab.com/Kinetics600), [Kinetics700](https://opendatalab.com/Kinetics_700)). Users can download the version with a short edge of 320 pixels from there.
+
+:::{note}
+All experiments on Kinetics in MMAction2 are based on this version, and we recommend users to use it.
+:::
+
+### Option 2: Download from Other Sources
+
+You can run the following script to prepare videos.
The codes are adapted from the [official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). Note that this might take a long time.
```shell
diff --git a/tools/data/kinetics/README_zh-CN.md b/tools/data/kinetics/README_zh-CN.md
index e307b9e7f5..86cb65239e 100644
--- a/tools/data/kinetics/README_zh-CN.md
+++ b/tools/data/kinetics/README_zh-CN.md
@@ -18,11 +18,14 @@
请参照 [官方网站](https://deepmind.com/research/open-source/open-source-datasets/kinetics/) 以获取数据集基本信息。此脚本用于准备数据集 kinetics400,kinetics600,kinetics700。为准备 kinetics 数据集的不同版本,用户需将脚本中的 `${DATASET}` 赋值为数据集对应版本名称,可选项为 `kinetics400`,`kinetics600`, `kinetics700`。
在开始之前,用户需确保当前目录为 `$MMACTION2/tools/data/${DATASET}/`。
-**注**:由于部分 YouTube 链接失效,爬取的 Kinetics 数据集大小可能与原版不同。以下是我们所使用 Kinetics 数据集的大小:
+:::{note}
+由于部分 YouTube 链接失效,爬取的 Kinetics 数据集大小可能与原版不同。以下是我们所使用 Kinetics 数据集的大小:
| 数据集 | 训练视频 | 验证集视频 |
| :---------: | :------: | :--------: |
-| kinetics400 | 240436 | 19796 |
+| Kinetics400 | 240436 | 19796 |
+| Kinetics600 | 383393 | 27910 |
+| Kinetics700 | 542357   | 34824      |
+:::
## 1. 准备标注文件
@@ -42,6 +45,15 @@ bash download_backup_annotations.sh ${DATASET}
## 2. 准备视频
+### 选项 1: 从 OpenDataLab 下载
+
+**推荐**:[OpenDataLab](https://opendatalab.com/) 提供了 Kinetics 数据集 ([Kinetics400](https://opendatalab.com/Kinetics-400), [Kinetics600](https://opendatalab.com/Kinetics600), [Kinetics700](https://opendatalab.com/Kinetics_700)), 用户可以从这里下载短边长度为 320 的 Kinetics 数据集。
+
+:::{note}
+MMAction2 代码仓库中提供的 Kinetics 实验性能,都是基于这个版本的数据得到的。我们建议用户使用这个版本的 Kinetics 数据集进行实验。
+:::
+
+### 选项 2:从其他数据源下载
+
用户可以使用以下脚本准备视频,视频准备代码修改自 [官方爬虫](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics)。注意这一步骤将花费较长时间。
```shell
diff --git a/tools/data/skeleton/README.md b/tools/data/skeleton/README.md
index 25c7f62892..10244d23a1 100644
--- a/tools/data/skeleton/README.md
+++ b/tools/data/skeleton/README.md
@@ -15,48 +15,46 @@
## Introduction
-We release the skeleton annotations used in [Revisiting Skeleton-based Action Recognition](https://arxiv.org/abs/2104.13586). By default, we use [Faster-RCNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py) with ResNet50 backbone for human detection and [HRNet-w32](https://github.com/open-mmlab/mmpose/blob/master/configs/top_down/hrnet/coco/hrnet_w32_coco_256x192.py) for single person pose estimation. For FineGYM, we use Ground-Truth bounding boxes for the athlete instead of detection bounding boxes. Currently, we release the skeleton annotations for FineGYM and NTURGB-D Xsub split. Other annotations will be soo released.
+We release the skeleton annotations used in [Revisiting Skeleton-based Action Recognition](https://arxiv.org/abs/2104.13586). By default, we use [Faster-RCNN](https://github.com/open-mmlab/mmdetection/blob/master/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py) with ResNet50 backbone for human detection and [HRNet-w32](https://github.com/open-mmlab/mmpose/blob/master/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrnet_w32_coco_256x192.py) for single person pose estimation. For FineGYM, we use Ground-Truth bounding boxes for the athlete instead of detection bounding boxes.
## Prepare Annotations
-Currently, we support HMDB51, UCF101, FineGYM and NTURGB+D. For FineGYM, you can execute following scripts to prepare the annotations.
+We provide links to the pre-processed skeleton annotations. You can directly download them and use them for training & testing.
-```shell
-bash download_annotations.sh ${DATASET}
-```
-
-Due to [Conditions of Use](http://rose1.ntu.edu.sg/Datasets/actionRecognition.asp) of the NTURGB+D dataset, we can not directly release the annotations used in our experiments. So that we provide a script to generate pose annotations for videos in NTURGB+D datasets, which generate a dictionary and save it as a single pickle file. You can create a list which contain all annotation dictionaries of corresponding videos and save them as a pickle file. Then you can get the `ntu60_xsub_train.pkl`, `ntu60_xsub_val.pkl`, `ntu120_xsub_train.pkl`, `ntu120_xsub_val.pkl` that we used in training.
+- NTURGB+D \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_2d.pkl
+- NTURGB+D \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_3d.pkl
+- NTURGB+D 120 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_2d.pkl
+- NTURGB+D 120 \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_3d.pkl
+- GYM \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/gym_2d.pkl
+ - GYM 2D skeletons are extracted with ground-truth human bounding boxes, which can be downloaded with [link](https://download.openmmlab.com/mmaction/pyskl/data/gym/gym_gt_bboxes.pkl). Please cite [PoseConv3D](https://arxiv.org/abs/2104.13586) if you use it in your project.
+- UCF101 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ucf101_2d.pkl
+- HMDB51 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/hmdb51_2d.pkl
+- Diving48 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/diving48_2d.pkl
+- Kinetics400 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/k400_2d.pkl (Table of contents only, no skeleton annotations)
-For those who have not enough computations for pose extraction, we provide the outputs of the above pipeline here, corresponding to 4 different splits of NTURGB+D datasets:
+For Kinetics400, since the skeleton annotations are large, we do not provide direct download links on Aliyun. Please use the following link to download the `kpfiles` and extract it under `$MMACTION2/data/k400` for Kinetics400 training & testing: https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EeyDCVskqLtClMVVwqD53acBF2FEwkctp3vtRbkLfnKSTw?e=B3SZlM
-- ntu60_xsub_train: https://download.openmmlab.com/mmaction/posec3d/ntu60_xsub_train.pkl
-- ntu60_xsub_val: https://download.openmmlab.com/mmaction/posec3d/ntu60_xsub_val.pkl
-- ntu120_xsub_train: https://download.openmmlab.com/mmaction/posec3d/ntu120_xsub_train.pkl
-- ntu120_xsub_val: https://download.openmmlab.com/mmaction/posec3d/ntu120_xsub_val.pkl
-- hmdb51: https://download.openmmlab.com/mmaction/posec3d/hmdb51.pkl
-- ucf101: https://download.openmmlab.com/mmaction/posec3d/ucf101.pkl
-
-To generate 2D pose annotations for a single video, first, you need to install mmdetection and mmpose from src code. After that, you need to replace the placeholder `mmdet_root` and `mmpose_root` in `ntu_pose_extraction.py` with your installation path. Then you can use following scripts for NTURGB+D video pose extraction:
+If you want to generate 2D skeleton annotations for a specific video, please install mmdetection and mmpose first, then use the following script to extract skeleton annotations from an NTURGB+D video:
```python
python ntu_pose_extraction.py S001C001P001R001A001_rgb.avi S001C001P001R001A001.pkl
```
-After you get pose annotations for all videos in a dataset split, like `ntu60_xsub_val`. You can gather them into a single list and save the list as `ntu60_xsub_val.pkl`. You can use those larger pickle files for training and testing.
-
-## The Format of PoseC3D Annotations
+Please note that, due to the upgrade of mmpose, the inference results may differ slightly from the provided skeleton annotations.
-Here we briefly introduce the format of PoseC3D Annotations, we will take `gym_train.pkl` as an example: the content of `gym_train.pkl` is a list of length 20484, each item is a dictionary that is the skeleton annotation of one video. Each dictionary has following fields:
+## The Format of Annotations
-- keypoint: The keypoint coordinates, which is a numpy array of the shape N (#person) x T (temporal length) x K (#keypoints, 17 in our case) x 2 (x, y coordinate).
-- keypoint_score: The keypoint confidence scores, which is a numpy array of the shape N (#person) x T (temporal length) x K (#keypoints, 17 in our case).
-- frame_dir: The corresponding video name.
-- label: The action category.
-- img_shape: The image shape of each frame.
-- original_shape: Same as above.
-- total_frames: The temporal length of the video.
+Each pickle file corresponds to an action recognition dataset. The content of a pickle file is a dictionary with two fields: `split` and `annotations` (a minimal loading sketch is given after the list below).
-For training with your custom dataset, you can refer to [Custom Dataset Training](https://github.com/open-mmlab/mmaction2/blob/master/configs/skeleton/posec3d/custom_dataset_training.md).
+1. Split: The value of the `split` field is a dictionary: the keys are the split names, while the values are lists of video identifiers that belong to the specific split.
+2. Annotations: The value of the `annotations` field is a list of skeleton annotations; each skeleton annotation is a dictionary containing the following fields:
+ 1. `frame_dir` (str): The identifier of the corresponding video.
+ 2. `total_frames` (int): The number of frames in this video.
+ 3. `img_shape` (tuple\[int\]): The shape of a video frame, a tuple with two elements, in the format of (height, width). Only required for 2D skeletons.
+ 4. `original_shape` (tuple\[int\]): Same as `img_shape`.
+ 5. `label` (int): The action label.
+   6. `keypoint` (np.ndarray, with shape \[M x T x V x C\]): The keypoint annotation. M: number of persons; T: number of frames (same as `total_frames`); V: number of keypoints (25 for NTURGB+D 3D skeleton, 17 for COCO, 18 for OpenPose, etc.); C: number of dimensions for keypoint coordinates (C=2 for 2D keypoint, C=3 for 3D keypoint).
+ 7. `keypoint_score` (np.ndarray, with shape \[M x T x V\]): The confidence score of keypoints. Only required for 2D skeletons.
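+
+The snippet below is a minimal sketch of how such a pickle file can be loaded and inspected (it assumes `mmengine` is installed and that `ntu60_2d.pkl` has been downloaded to the current directory):
+
+```python
+import mmengine
+
+# A skeleton annotation file is a dict with 'split' and 'annotations'.
+data = mmengine.load('ntu60_2d.pkl')
+print(data['split'].keys())          # available split names
+
+anno = data['annotations'][0]        # skeleton annotation of one video
+print(anno['frame_dir'], anno['label'], anno['total_frames'])
+print(anno['keypoint'].shape)        # (M, T, V, C)
+print(anno['keypoint_score'].shape)  # (M, T, V), 2D skeletons only
+```
+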
## Visualization
@@ -128,4 +126,4 @@ We provide scripts to convert skeleton annotations from third-party projects to
- [x] NTU120_XSet
- [x] UCF101
- [x] HMDB51
-- [ ] Kinetics
+- [x] Kinetics
diff --git a/tools/data/skeleton/README_zh-CN.md b/tools/data/skeleton/README_zh-CN.md
index fb6de5925a..3754175908 100644
--- a/tools/data/skeleton/README_zh-CN.md
+++ b/tools/data/skeleton/README_zh-CN.md
@@ -33,20 +33,27 @@ bash download_annotations.sh ${DATASET}
对于无法进行姿态提取的用户,这里提供了上述流程的输出结果,分别对应 NTURGB-D 数据集的 4 个部分:
-- ntu60_xsub_train: https://download.openmmlab.com/mmaction/posec3d/ntu60_xsub_train.pkl
-- ntu60_xsub_val: https://download.openmmlab.com/mmaction/posec3d/ntu60_xsub_val.pkl
-- ntu120_xsub_train: https://download.openmmlab.com/mmaction/posec3d/ntu120_xsub_train.pkl
-- ntu120_xsub_val: https://download.openmmlab.com/mmaction/posec3d/ntu120_xsub_val.pkl
-- hmdb51: https://download.openmmlab.com/mmaction/posec3d/hmdb51.pkl
-- ucf101: https://download.openmmlab.com/mmaction/posec3d/ucf101.pkl
+- NTURGB+D \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_2d.pkl
+- NTURGB+D \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu60_3d.pkl
+- NTURGB+D 120 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_2d.pkl
+- NTURGB+D 120 \[3D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ntu120_3d.pkl
+- GYM \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/gym_2d.pkl
+ - GYM 2D 姿态标注文件是基于运动员的真实标注框生成的,用户可以从这个[链接](https://download.openmmlab.com/mmaction/pyskl/data/gym/gym_gt_bboxes.pkl)下载真实标注框。如果你在项目中使用了该数据,请引用 [PoseConv3D](https://arxiv.org/abs/2104.13586)
+- UCF101 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/ucf101_2d.pkl
+- HMDB51 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/hmdb51_2d.pkl
+- Diving48 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/diving48_2d.pkl
+- Kinetics400 \[2D Skeleton\]: https://download.openmmlab.com/mmaction/v1.0/skeleton/data/k400_2d.pkl (只包含数据列表,没有姿态标注文件)
-若想生成单个视频的 2D 姿态标注文件,首先,用户需要由源码安装 mmdetection 和 mmpose。之后,用户需要在 `ntu_pose_extraction.py` 中指定 `mmdet_root` 和 `mmpose_root` 变量。
-最后,用户可使用以下脚本进行 NTURGB+D 视频的姿态提取:
+由于 Kinetics400 数据集姿态标注文件过大,我们不提供阿里云的下载链接,请使用此[链接](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155136485_link_cuhk_edu_hk/EeyDCVskqLtClMVVwqD53acBF2FEwkctp3vtRbkLfnKSTw?e=B3SZlM)下载 `kpfiles`,解压到 `$MMACTION2/data/k400` 目录下,用于 Kinetics400 的训练和测试。
+
+若想生成单个视频的 2D 姿态标注文件,用户在安装 mmdetection 和 mmpose 之后,可使用以下脚本进行 NTURGB+D 视频的姿态提取:
```python
python ntu_pose_extraction.py S001C001P001R001A001_rgb.avi S001C001P001R001A001.pkl
```
+请注意,由于 mmpose 算法库升级,此脚本的推理结果与提供的姿态点数据集可能略有差异。
+
在用户获得数据集某部分所有视频的姿态标注文件(如 `ntu60_xsub_val`)后,可以将其集合成一个 list 数据并保存为 `ntu60_xsub_val.pkl`。用户可用这些大型 pickle 文件进行训练和测试。
## PoseC3D 的标注文件格式
diff --git a/tools/data/skeleton/compress_nturgbd.py b/tools/data/skeleton/compress_nturgbd.py
new file mode 100644
index 0000000000..b8639257c9
--- /dev/null
+++ b/tools/data/skeleton/compress_nturgbd.py
@@ -0,0 +1,42 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import multiprocessing as mp
+import os
+import os.path as osp
+import subprocess
+
+
+def get_shape(vid):
+ cmd = 'ffprobe -v error -select_streams v:0 -show_entries ' \
+ 'stream=width,height -of csv=s=x:p=0 \"{}\"'.format(vid)
+ w, h = subprocess.check_output(cmd, shell=True).decode('utf-8').split('x')
+ return int(w), int(h)
+
+
+def compress(src, dest, shape=None, target_size=540, fps=-1):
+ if shape is None:
+ shape = get_shape(src)
+ w, h = shape
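+    # Rescale the shorter side to `target_size`; `-2` lets ffmpeg choose an
+    # even value for the other side while preserving the aspect ratio.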
+ scale_str = f'-vf scale=-2:{target_size}' if w >= h else \
+ f'-vf scale={target_size}:-2'
+ fps_str = f'-r {fps}' if fps > 0 else ''
+ quality_str = '-q:v 1'
+ vcodec_str = '-c:v libx264'
+ cmd = f'ffmpeg -y -loglevel error -i {src} -threads 1 ' \
+ f'{quality_str} {scale_str} {fps_str} {vcodec_str} {dest}'
+ os.system(cmd)
+
+
+def compress_nturgbd(name):
+ src = name
+ dest = src.replace('nturgbd_raw',
+ 'nturgbd_videos').replace('_rgb.avi', '.mp4')
+ shape = (1920, 1080)
+ compress(src, dest, shape)
+
+
+src_dir = 'data/nturgbd_raw'
+tgt_dir = 'data/nturgbd_videos'
+os.makedirs(tgt_dir, exist_ok=True)
+files = [osp.join(src_dir, x) for x in os.listdir(src_dir) if '.avi' in x]
+pool = mp.Pool(32)
+pool.map(compress_nturgbd, files)
diff --git a/tools/data/skeleton/download_annotations.sh b/tools/data/skeleton/download_annotations.sh
deleted file mode 100644
index d57efbceac..0000000000
--- a/tools/data/skeleton/download_annotations.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/env bash
-
-DATASET=$1
-if [ "$DATASET" == "gym" ]; then
- echo "We are processing $DATASET"
-else
- echo "Bad Argument, we only support gym now."
- exit 0
-fi
-
-DATA_DIR="../../../data/posec3d/"
-
-if [[ ! -d "${DATA_DIR}" ]]; then
- echo "${DATA_DIR} does not exist. Creating";
- mkdir -p ${DATA_DIR}
-fi
-
-wget https://download.openmmlab.com/mmaction/posec3d/${DATASET}_train.pkl
-wget https://download.openmmlab.com/mmaction/posec3d/${DATASET}_val.pkl
-
-mv ${DATASET}_train.pkl ${DATA_DIR}
-mv ${DATASET}_val.pkl ${DATA_DIR}
diff --git a/tools/data/skeleton/ntu_pose_extraction.py b/tools/data/skeleton/ntu_pose_extraction.py
index 17af16e749..d60fefdd97 100644
--- a/tools/data/skeleton/ntu_pose_extraction.py
+++ b/tools/data/skeleton/ntu_pose_extraction.py
@@ -1,82 +1,24 @@
# Copyright (c) OpenMMLab. All rights reserved.
import abc
import argparse
-import os
import os.path as osp
-import random as rd
-import shutil
-import string
from collections import defaultdict
+from tempfile import TemporaryDirectory
-import cv2
-import mmcv
+import mmengine
import numpy as np
-try:
- from mmdet.apis import inference_detector, init_detector
-except (ImportError, ModuleNotFoundError):
- raise ImportError('Failed to import `inference_detector` and '
- '`init_detector` form `mmdet.apis`. These apis are '
- 'required in this script! ')
-
-try:
- from mmpose.apis import inference_top_down_pose_model, init_pose_model
-except (ImportError, ModuleNotFoundError):
- raise ImportError('Failed to import `inference_top_down_pose_model` and '
- '`init_pose_model` form `mmpose.apis`. These apis are '
- 'required in this script! ')
-
-mmdet_root = ''
-mmpose_root = ''
+from mmaction.apis import detection_inference, pose_inference
+from mmaction.utils import frame_extract
args = abc.abstractproperty()
-args.det_config = f'{mmdet_root}/configs/faster_rcnn/faster_rcnn_r50_caffe_fpn_mstrain_1x_coco-person.py' # noqa: E501
+args.det_config = 'demo/demo_configs/faster-rcnn_r50-caffe_fpn_ms-1x_coco-person.py' # noqa: E501
args.det_checkpoint = 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco-person/faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' # noqa: E501
args.det_score_thr = 0.5
-args.pose_config = f'{mmpose_root}/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrnet_w32_coco_256x192.py' # noqa: E501
+args.pose_config = 'demo/demo_configs/td-hm_hrnet-w32_8xb64-210e_coco-256x192_infer.py' # noqa: E501
args.pose_checkpoint = 'https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w32_coco_256x192-c78dce93_20200708.pth' # noqa: E501
-def gen_id(size=8):
- chars = string.ascii_uppercase + string.digits
- return ''.join(rd.choice(chars) for _ in range(size))
-
-
-def extract_frame(video_path):
- dname = gen_id()
- os.makedirs(dname, exist_ok=True)
- frame_tmpl = osp.join(dname, 'img_{:05d}.jpg')
- vid = cv2.VideoCapture(video_path)
- frame_paths = []
- flag, frame = vid.read()
- cnt = 0
- while flag:
- frame_path = frame_tmpl.format(cnt + 1)
- frame_paths.append(frame_path)
-
- cv2.imwrite(frame_path, frame)
- cnt += 1
- flag, frame = vid.read()
-
- return frame_paths
-
-
-def detection_inference(args, frame_paths):
- model = init_detector(args.det_config, args.det_checkpoint, args.device)
- assert model.CLASSES[0] == 'person', ('We require you to use a detector '
- 'trained on COCO')
- results = []
- print('Performing Human Detection for each frame')
- prog_bar = mmcv.ProgressBar(len(frame_paths))
- for frame_path in frame_paths:
- result = inference_detector(model, frame_path)
- # We only keep human detections with score larger than det_score_thr
- result = result[0][result[0][:, 4] >= args.det_score_thr]
- results.append(result)
- prog_bar.update()
- return results
-
-
def intersection(b0, b1):
l, r = max(b0[0], b1[0]), min(b0[2], b1[2])
u, d = max(b0[1], b1[1]), min(b0[3], b1[3])
@@ -227,7 +169,7 @@ def tracklets2bbox(tracklet, num_frame):
mind = np.abs(k - idx)
mink = k
bbox[idx] = bboxd[mink]
- return bad, bbox
+ return bad, bbox[:, None, :]
def bboxes2bbox(bbox, num_frame):
@@ -287,41 +229,34 @@ def ntu_det_postproc(vid, det_results):
return bboxes2bbox(det_results, len(det_results))
-def pose_inference(args, frame_paths, det_results):
- model = init_pose_model(args.pose_config, args.pose_checkpoint,
- args.device)
- print('Performing Human Pose Estimation for each frame')
- prog_bar = mmcv.ProgressBar(len(frame_paths))
-
- num_frame = len(det_results)
- num_person = max([len(x) for x in det_results])
- kp = np.zeros((num_person, num_frame, 17, 3), dtype=np.float32)
-
- for i, (f, d) in enumerate(zip(frame_paths, det_results)):
- # Align input format
- d = [dict(bbox=x) for x in list(d) if x[-1] > 0.5]
- pose = inference_top_down_pose_model(model, f, d, format='xyxy')[0]
- for j, item in enumerate(pose):
- kp[j, i] = item['keypoints']
- prog_bar.update()
- return kp
-
-
def ntu_pose_extraction(vid, skip_postproc=False):
- frame_paths = extract_frame(vid)
- det_results = detection_inference(args, frame_paths)
+ tmp_dir = TemporaryDirectory()
+ frame_paths, _ = frame_extract(vid, out_dir=tmp_dir.name)
+ det_results, _ = detection_inference(
+ args.det_config,
+ args.det_checkpoint,
+ frame_paths,
+ args.det_score_thr,
+ device=args.device,
+ with_score=True)
+
if not skip_postproc:
det_results = ntu_det_postproc(vid, det_results)
- pose_results = pose_inference(args, frame_paths, det_results)
+ pose_results, _ = pose_inference(args.pose_config, args.pose_checkpoint,
+ frame_paths, det_results, args.device)
+
anno = dict()
- anno['keypoint'] = pose_results[..., :2]
- anno['keypoint_score'] = pose_results[..., 2]
+ anno['keypoint'] = np.stack(
+ [pose['keypoints'].astype(np.float32) for pose in pose_results],
+ axis=1)
+ anno['keypoint_score'] = np.stack(
+ [pose['keypoint_scores'] for pose in pose_results], axis=1)
anno['frame_dir'] = osp.splitext(osp.basename(vid))[0]
anno['img_shape'] = (1080, 1920)
anno['original_shape'] = (1080, 1920)
- anno['total_frames'] = pose_results.shape[1]
+ anno['total_frames'] = len(pose_results)
anno['label'] = int(osp.basename(vid).split('A')[1][:3]) - 1
- shutil.rmtree(osp.dirname(frame_paths[0]))
+ tmp_dir.cleanup()
return anno
@@ -344,4 +279,4 @@ def parse_args():
args.output = global_args.output
args.skip_postproc = global_args.skip_postproc
anno = ntu_pose_extraction(args.video, args.skip_postproc)
- mmcv.dump(anno, args.output)
+ mmengine.dump(anno, args.output)
diff --git a/tools/deployment/export_onnx_stdet.py b/tools/deployment/export_onnx_stdet.py
index fc587dbff0..ba0cd2e388 100644
--- a/tools/deployment/export_onnx_stdet.py
+++ b/tools/deployment/export_onnx_stdet.py
@@ -155,9 +155,9 @@ def main():
args.output_file,
input_names=['input_tensor', 'rois'],
output_names=['cls_score'],
- export_params=False,
+ export_params=True,
do_constant_folding=True,
- verbose=True,
+ verbose=False,
opset_version=11,
dynamic_axes={
'input_tensor': {
diff --git a/tools/misc/clip_feature_extraction.py b/tools/misc/clip_feature_extraction.py
index 1829bf9b5c..a7a3e67635 100644
--- a/tools/misc/clip_feature_extraction.py
+++ b/tools/misc/clip_feature_extraction.py
@@ -59,7 +59,10 @@ def parse_args():
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
- parser.add_argument('--local_rank', type=int, default=0)
+ # When using PyTorch version >= 2.0.0, the `torch.distributed.launch`
+    # will pass the `--local-rank` parameter to this script instead
+    # of `--local_rank`.
+ parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
diff --git a/tools/test.py b/tools/test.py
index 0d0d4bd20f..4f310fa9e0 100644
--- a/tools/test.py
+++ b/tools/test.py
@@ -51,7 +51,7 @@ def parse_args():
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
- parser.add_argument('--local_rank', type=int, default=0)
+ parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
diff --git a/tools/train.py b/tools/train.py
index 2c51c50709..e43078ddb8 100644
--- a/tools/train.py
+++ b/tools/train.py
@@ -56,7 +56,7 @@ def parse_args():
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
- parser.add_argument('--local_rank', type=int, default=0)
+ parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
diff --git a/tools/visualizations/browse_dataset.py b/tools/visualizations/browse_dataset.py
index e6cf9b82c4..6fb720521e 100644
--- a/tools/visualizations/browse_dataset.py
+++ b/tools/visualizations/browse_dataset.py
@@ -21,13 +21,9 @@
def parse_args():
parser = argparse.ArgumentParser(description='Browse a dataset')
parser.add_argument('config', help='train config file path')
- parser.add_argument('--label', default=None, type=str, help='label file')
parser.add_argument(
- '--output-dir',
- '-o',
- default=None,
- type=str,
- help='If there is no display interface, you can save it.')
+ 'output_dir', default=None, type=str, help='output directory')
+ parser.add_argument('--label', default=None, type=str, help='label file')
parser.add_argument(
'--phase',
'-p',
diff --git a/tools/visualizations/vis_scheduler.py b/tools/visualizations/vis_scheduler.py
index 6e1b744862..17daa34e6b 100644
--- a/tools/visualizations/vis_scheduler.py
+++ b/tools/visualizations/vis_scheduler.py
@@ -16,58 +16,7 @@
from mmengine.runner import Runner
from mmengine.visualization import Visualizer
from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn
-
-
-class SimpleModel(BaseModel):
- """simple model that do nothing in train_step."""
-
- def __init__(self):
- super(SimpleModel, self).__init__()
- self.data_preprocessor = nn.Identity()
- self.conv = nn.Conv2d(1, 1, 1)
-
- def forward(self, inputs, data_samples, mode='tensor'):
- pass
-
- def train_step(self, data, optim_wrapper):
- pass
-
-
-class ParamRecordHook(Hook):
-
- def __init__(self, by_epoch):
- super().__init__()
- self.by_epoch = by_epoch
- self.lr_list = []
- self.momentum_list = []
- self.task_id = 0
- self.progress = Progress(BarColumn(), MofNCompleteColumn(),
- TextColumn('{task.description}'))
-
- def before_train(self, runner):
- if self.by_epoch:
- total = runner.train_loop.max_epochs
- self.task_id = self.progress.add_task(
- 'epochs', start=True, total=total)
- else:
- total = runner.train_loop.max_iters
- self.task_id = self.progress.add_task(
- 'iters', start=True, total=total)
- self.progress.start()
-
- def after_train_epoch(self, runner):
- if self.by_epoch:
- self.progress.update(self.task_id, advance=1)
-
- def after_train_iter(self, runner, batch_idx, data_batch, outputs):
- if not self.by_epoch:
- self.progress.update(self.task_id, advance=1)
- self.lr_list.append(runner.optim_wrapper.get_lr()['lr'][0])
- self.momentum_list.append(
- runner.optim_wrapper.get_momentum()['momentum'][0])
-
- def after_train(self, runner):
- self.progress.stop()
+from torch.utils.data import DataLoader
def parse_args():
@@ -130,6 +79,58 @@ def parse_args():
return args
+class SimpleModel(BaseModel):
+    """Simple model that does nothing in train_step."""
+
+ def __init__(self):
+ super(SimpleModel, self).__init__()
+ self.data_preprocessor = nn.Identity()
+ self.conv = nn.Conv2d(1, 1, 1)
+
+ def forward(self, inputs, data_samples, mode='tensor'):
+ pass
+
+ def train_step(self, data, optim_wrapper):
+ pass
+
+
+class ParamRecordHook(Hook):
+
+ def __init__(self, by_epoch):
+ super().__init__()
+ self.by_epoch = by_epoch
+ self.lr_list = []
+ self.momentum_list = []
+ self.task_id = 0
+ self.progress = Progress(BarColumn(), MofNCompleteColumn(),
+ TextColumn('{task.description}'))
+
+ def before_train(self, runner):
+ if self.by_epoch:
+ total = runner.train_loop.max_epochs
+ self.task_id = self.progress.add_task(
+ 'epochs', start=True, total=total)
+ else:
+ total = runner.train_loop.max_iters
+ self.task_id = self.progress.add_task(
+ 'iters', start=True, total=total)
+ self.progress.start()
+
+ def after_train_epoch(self, runner):
+ if self.by_epoch:
+ self.progress.update(self.task_id, advance=1)
+
+ def after_train_iter(self, runner, batch_idx, data_batch, outputs):
+ if not self.by_epoch:
+ self.progress.update(self.task_id, advance=1)
+ self.lr_list.append(runner.optim_wrapper.get_lr()['lr'][0])
+ self.momentum_list.append(
+ runner.optim_wrapper.get_momentum()['momentum'][0])
+
+ def after_train(self, runner):
+ self.progress.stop()
+
+
def plot_curve(lr_list, args, param_name, iters_per_epoch, by_epoch=True):
"""Plot learning rate vs iter graph."""
try:
@@ -186,6 +187,7 @@ def simulate_train(data_loader, cfg, by_epoch):
param_scheduler=cfg.param_scheduler,
default_scope=cfg.default_scope,
default_hooks=default_hooks,
+ auto_scale_lr=cfg.get('auto_scale_lr'),
visualizer=MagicMock(spec=Visualizer),
custom_hooks=cfg.get('custom_hooks', None))
@@ -231,14 +233,13 @@ def main():
from mmaction.registry import DATASETS
dataset_size = len(DATASETS.build(cfg.train_dataloader.dataset))
print(f'dataset is {dataset_size}')
- # dataset_size = len(build_dataset(cfg.train_dataloader.dataset))
else:
dataset_size = args.dataset_size or batch_size
- class FakeDataloader(list):
- dataset = MagicMock(metainfo=None)
-
- data_loader = FakeDataloader(range(dataset_size // batch_size))
+ data_loader = DataLoader(range(dataset_size), batch_size)
+ assert len(data_loader) > 0, \
+        'Please decrease the batch size to make sure that ' \
+        'an epoch has at least one iteration!'
dataset_info = (
f'\nDataset infos:'
f'\n - Dataset size: {dataset_size}'