Adding support for loading and visualizing masks

Summary: This diff brings support for saving, loading and visualizing amodal and modal masks from HOT3D-Clips. Reviewed By: prithvirb Differential Revision: D61715780 fbshipit-source-id: 28dc96237c1037c7c68fd656240544f8a43dcca8
facebookresearch · Aug 23, 2024 · bd632ef · bd632ef
1 parent 68b25bc
commit bd632ef
Show file tree

Hide file tree

Showing 3 changed files with 90 additions and 9 deletions.
diff --git a/hot3d/clips/README.md b/hot3d/clips/README.md
@@ -83,11 +83,10 @@ Files `<FRAME-ID>.objects.json` provide for each annotated object the following:
     - `translation_xyz`: Translation from world to the object.
     - `quaternion_wxyz`: Rotation from world to the object.
 - `boxes_amodal`: A map from a stream ID to an amodal 2D bounding box.
-- `masks_modal` [currently not available]: A map from a stream ID to an modal binary mask.
-- `visibilities_modeled`: A map from a stream ID to the fraction of the projected surface area that is visibile.
-        (reflecting only occlusions by modeled scene elements).
-- `visibilities_full` [currently not available]: A map from a stream ID to the fraction of the projected surface area that is visibile
-    (reflecting occlusions by modeled and unmodeled, such as arms, scene elements).
+- `masks_amodal`: A map from a stream ID to an amodal binary mask. These masks were obtained by rendering the 3D object models in the ground-truth poses and removing the parts of the masks that overlap with the vignette mask, which covers the heavily distorted corners of the fisheye images.
+- `masks_modal`: A map from a stream ID to a modal binary mask. These masks were obtained using the SAM2-based approach from Taeyeop Lee (https://github.com/taeyeopl/bop_toolkit_sam2).
+- `visibilities_modeled`: A map from a stream ID to the fraction of the projected surface area that is visible. These visibility scores reflect only occlusions by modeled scene elements and the vignette mask.
+- `visibilities_predicted`: A map from a stream ID to the fraction of the projected surface area that is visible. These visibility scores reflect occlusions by modeled and unmodeled (e.g., arms, furniture) scene elements and were obtained using the SAM2-based approach from Taeyeop Lee (https://github.com/taeyeopl/bop_toolkit_sam2).
 
 Files `<FRAME-ID>.hands.json` provide hand parameters:
 
@@ -100,11 +99,8 @@ Files `<FRAME-ID>.hands.json` provide hand parameters:
         - `joint_angles`: 20 floats.
         - `wrist_xform`: 4x4 3D rigid transformation matrix.
     - `boxes_amodal`: A map from a stream ID to an amodal 2D bounding box.
-    - `masks_modal` [currently not available]: A map from a stream ID to an modal binary mask.
     - `visibilities_modeled`: A map from a stream ID to the fraction of the projected surface area that is visible.
         (reflecting only occlusions by modeled scene elements).
-    - `visibilities_full` [currently not available]: A map from a stream ID to the fraction of the projected surface area that is visibile
-        (reflecting occlusions by modeled and unmodeled, such as arms, scene elements).
 - `right`: As for `left`.
 
 Files `<FRAME-ID>.hand_crops.json` provide hand crop parameters (used in [Hand Tracking Challenge](https://github.com/facebookresearch/hand_tracking_toolkit?tab=readme-ov-file#evaluation); a crop camera is saved only if the hand visibility > 0.1):
@@ -152,6 +148,8 @@ Optional arguments:
 - `--mano_model_dir` is a folder with the MANO hand model (needs to be specified if `--hand_type mano`).
 - `--clip_start` and `--clip_end` can be used to specify a range of clips to consider.
 - `--undistort` is a binary flag indicating whether the images should be undistorted (warped from the original fisheye cameras to pinhole cameras; disabled by default).
+- `--vis_amodal_masks` is a binary flag indicating whether to visualize amodal object masks.
+- `--vis_modal_masks` is a binary flag indicating whether to visualize modal object masks.
 
 An example command to visualize Quest3 training clips (`$HOT3DC` is assumed to be a path to [HOT3D-Clips](https://huggingface.co/datasets/bop-benchmark/datasets/tree/main/hot3d)):
 ```

diff --git a/hot3d/clips/clip_util.py b/hot3d/clips/clip_util.py
@@ -179,7 +179,7 @@ def get_hand_meshes(
     hand_shape: HandShapeCollection,
     hand_type: str = "umetrack",
     mano_model: Optional[MANOHandModel] = None,
-) -> Dict[str, trimesh.Trimesh]:
+) -> Dict[HandSide, trimesh.Trimesh]:
     """Provides hand meshes of specified shape and poses.
 
     Args:
@@ -199,6 +199,7 @@ def get_hand_meshes(
 
     meshes: Dict[HandSide, trimesh.Trimesh] = {}
     for hand_side, hand_pose in hand_poses.items():
+        assert mano_model is not None
         _, hand_verts, hand_faces = visualization.get_keypoints_and_mesh(
             hand_pose=hand_pose,
             hand_shape=hand_shape,
@@ -374,3 +375,40 @@ def vis_mask_contours(
     )[0]
 
     return cv2.drawContours(image, contours, -1, color, thickness, cv2.LINE_AA)
+
+
+def encode_binary_mask_rle(mask: np.ndarray) -> Dict[str, Any]:
+    """Run-length encodes a binary mask.
+
+    The mask is flattened in row-major order and every run of foreground
+    pixels is stored as a (1-based start index, run length) pair.
+
+    Args:
+        mask: An np.ndarray with the binary mask (height x width).
+    Returns:
+        A dictionary with the mask "height" and "width" and the "rle" runs.
+    """
+
+    mask_u8 = mask if mask.dtype == np.uint8 else mask.astype(np.uint8)
+
+    # Zero-pad both ends so that runs touching the mask borders are delimited.
+    padded = np.concatenate([[0], mask_u8.flatten(), [0]])
+    changes = np.where(padded[1:] != padded[:-1])[0] + 1
+    # Convert each (start, end) pair to (start, length).
+    changes[1::2] -= changes[::2]
+
+    height, width = mask_u8.shape[0], mask_u8.shape[1]
+    return {"height": height, "width": width, "rle": changes}
+
+
+def decode_binary_mask_rle(data: Dict[str, Any]) -> np.ndarray:
+    """Decodes a binary mask that was encoded using `encode_binary_mask_rle`.
+
+    Args:
+        data: RLE-encoded mask (output of `encode_binary_mask_rle`), i.e. a
+            dictionary with "height", "width" and "rle", where "rle" holds
+            alternating (1-based start index, run length) values.
+    Returns:
+        The decoded mask represented as a boolean np.ndarray of shape
+        (height, width).
+    """
+
+    # Convert once to an integer array so that an empty list of runs does not
+    # turn into a float array.
+    rle = np.asarray(data["rle"], dtype=np.int64)
+    starts = rle[0::2] - 1
+    ends = starts + rle[1::2]
+    # Use the builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20
+    # and is missing from NumPy 1.24-1.26.
+    mask = np.zeros(data["height"] * data["width"], dtype=bool)
+    for lo, hi in zip(starts, ends):
+        mask[lo:hi] = True
+
+    return mask.reshape((data["height"], data["width"]))
diff --git a/hot3d/clips/vis_clips.py b/hot3d/clips/vis_clips.py
@@ -35,6 +35,8 @@ def vis_clip(
     hand_type: str,
     mano_model: Optional[MANOHandModel],
     undistort: bool,
+    vis_amodal_masks: bool,
+    vis_modal_masks: bool,
     output_dir: str,
 ) -> None:
     """Visualizes hand and object models in GT poses for each frame of a clip.
@@ -115,6 +117,11 @@ def vis_clip(
 
             # Visualize object contours.
             if objects is not None:
+
+                masks_vis = None
+                if len(objects) and (vis_amodal_masks or vis_modal_masks):
+                    masks_vis = np.zeros_like(image)
+
                 for instance_list in objects.values():
                     for instance in instance_list:
                         bop_id = int(instance["object_bop_id"])
@@ -140,6 +147,27 @@ def vis_clip(
                         # Visualize the object contour on top of the image.
                         image = clip_util.vis_mask_contours(image, mask, (0, 255, 0))
 
+                        # Potentially load object mask.
+                        if vis_amodal_masks or vis_modal_masks:
+                            mask_key = (
+                                "masks_amodal" if vis_amodal_masks else "masks_modal"
+                            )
+                            if stream_key in instance[mask_key]:
+                                mask = clip_util.decode_binary_mask_rle(
+                                    instance[mask_key][stream_key]
+                                )
+                                masks_vis[mask] = 255
+
+                # Potentially visualize object masks.
+                if masks_vis is not None:
+                    image_weight = 0.5
+                    masks_vis = masks_vis.astype(np.float32)
+                    image = (  # pyre-ignore
+                        image_weight * image.astype(np.float32)
+                        + (1.0 - image_weight) * masks_vis
+                    )
+                    image = image.astype(np.uint8)
+
             # Visualize hand contours.
             for hand_mesh in hand_meshes.values():
 
@@ -195,6 +223,16 @@ def main() -> None:
         default="umetrack",
         help="Type of hand annotations to visualize ('umetrack' or 'mano').",
     )
+    parser.add_argument(
+        "--vis_amodal_masks",
+        action="store_true",
+        help="Whether to visualize amodal masks of objects.",
+    )
+    parser.add_argument(
+        "--vis_modal_masks",
+        action="store_true",
+        help="Whether to visualize modal masks of objects.",
+    )
     parser.add_argument(
         "--clip_start",
         type=int,
@@ -215,6 +253,11 @@ def main() -> None:
     )
     args = parser.parse_args()
 
+    if args.vis_amodal_masks and args.vis_modal_masks:
+        raise ValueError(
+            "Only either amodal or modal masks can be visualized at a time."
+        )
+
     # Make sure the output directory exists.
     os.makedirs(args.output_dir, exist_ok=True)
 
@@ -253,6 +296,8 @@ def main() -> None:
             hand_type=args.hand_type,
             mano_model=mano_model,
             undistort=args.undistort,
+            vis_amodal_masks=args.vis_amodal_masks,
+            vis_modal_masks=args.vis_modal_masks,
             output_dir=args.output_dir,
         )