facebookresearch · Zehui-Lin · May 28, 2021 · Jun 3, 2021 · Jun 8, 2021 · Jun 8, 2021
diff --git a/slowfast/datasets/transform.py b/slowfast/datasets/transform.py
@@ -47,7 +47,7 @@ def random_short_side_scale_jitter(
     corresponding boxes.
     Args:
         images (tensor): images to perform scale jitter. Dimension is
-            `num frames` x `channel` x `height` x `width`.
+            `channel` x `num frames` x `height` x `width`.
         min_size (int): the minimal size to scale the frames.
         max_size (int): the maximal size to scale the frames.
         boxes (ndarray): optional. Corresponding boxes to images.
@@ -120,7 +120,7 @@ def random_crop(images, size, boxes=None):
     Perform random spatial crop on the given images and corresponding boxes.
     Args:
         images (tensor): images to perform random crop. The dimension is
-            `num frames` x `channel` x `height` x `width`.
+            `channel` x `num frames` x `height` x `width`.
         size (int): the size of height and width to crop on the image.
         boxes (ndarray or None): optional. Corresponding boxes to images.
             Dimension is `num boxes` x 4.
@@ -157,12 +157,12 @@ def horizontal_flip(prob, images, boxes=None):
     Args:
-        prob (float): probility to flip the images.
+        prob (float): probability to flip the images.
         images (tensor): images to perform horizontal flip, the dimension is
-            `num frames` x `channel` x `height` x `width`.
+            `channel` x `num frames` x `height` x `width`.
         boxes (ndarray or None): optional. Corresponding boxes to images.
             Dimension is `num boxes` x 4.
     Returns:
         images (tensor): images with dimension of
-            `num frames` x `channel` x `height` x `width`.
+            `channel` x `num frames` x `height` x `width`.
         flipped_boxes (ndarray or None): the flipped boxes with dimension of
             `num boxes` x 4.
     """

diff --git a/slowfast/datasets/utils.py b/slowfast/datasets/utils.py
@@ -125,7 +125,7 @@ def spatial_sampling(
     with the given spatial_idx.
     Args:
         frames (tensor): frames of images sampled from the video. The
-            dimension is `num frames` x `height` x `width` x `channel`.
+            dimension is `channel` x `num frames` x `height` x `width`.
         spatial_idx (int): if -1, perform random spatial sampling. If 0, 1,
             or 2, perform left, center, right crop if width is larger than
-            height, and perform top, center, buttom crop if height is larger
+            height, and perform top, center, bottom crop if height is larger