Skip to content

Commit

Permalink
Adding support for loading and visualizing masks
Browse files Browse the repository at this point in the history
Summary: This diff brings support for saving, loading and visualizing amodal and modal masks from HOT3D-Clips.

Reviewed By: prithvirb

Differential Revision: D61715780

fbshipit-source-id: 28dc96237c1037c7c68fd656240544f8a43dcca8
  • Loading branch information
thodan authored and facebook-github-bot committed Aug 23, 2024
1 parent 68b25bc commit bd632ef
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 9 deletions.
14 changes: 6 additions & 8 deletions hot3d/clips/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,10 @@ Files `<FRAME-ID>.objects.json` provide for each annotated object the following:
- `translation_xyz`: Translation from world to the object.
- `quaternion_wxyz`: Rotation from world to the object.
- `boxes_amodal`: A map from a stream ID to an amodal 2D bounding box.
- `masks_modal` [currently not available]: A map from a stream ID to a modal binary mask.
- `visibilities_modeled`: A map from a stream ID to the fraction of the projected surface area that is visible
(reflecting only occlusions by modeled scene elements).
- `visibilities_full` [currently not available]: A map from a stream ID to the fraction of the projected surface area that is visible
(reflecting occlusions by both modeled and unmodeled scene elements, such as arms).
- `masks_amodal`: A map from a stream ID to an amodal binary mask. These masks were obtained by rendering the 3D object models in the ground-truth poses and removing parts of the masks that overlap with the vignette mask, which covers heavily distorted corners of the fisheye images.
- `masks_modal`: A map from a stream ID to a modal binary mask. These masks were obtained using the SAM2-based approach from Taeyeop Lee (https://github.com/taeyeopl/bop_toolkit_sam2).
- `visibilities_modeled`: A map from a stream ID to the fraction of the projected surface area that is visible. These visibility scores reflect only occlusions by modeled scene elements and the vignette mask.
- `visibilities_predicted`: A map from a stream ID to the fraction of the projected surface area that is visible. These visibility scores reflect occlusions by modeled and unmodeled (e.g., arms, furniture) scene elements and were obtained using the SAM2-based approach from Taeyeop Lee (https://github.com/taeyeopl/bop_toolkit_sam2).

Files `<FRAME-ID>.hands.json` provide hand parameters:

Expand All @@ -100,11 +99,8 @@ Files `<FRAME-ID>.hands.json` provide hand parameters:
- `joint_angles`: 20 floats.
- `wrist_xform`: 4x4 3D rigid transformation matrix.
- `boxes_amodal`: A map from a stream ID to an amodal 2D bounding box.
- `masks_modal` [currently not available]: A map from a stream ID to a modal binary mask.
- `visibilities_modeled`: A map from a stream ID to the fraction of the projected surface area that is visible
(reflecting only occlusions by modeled scene elements).
- `visibilities_full` [currently not available]: A map from a stream ID to the fraction of the projected surface area that is visible
(reflecting occlusions by both modeled and unmodeled scene elements, such as arms).
- `right`: As for `left`.

Files `<FRAME-ID>.hand_crops.json` provide hand crop parameters (used in [Hand Tracking Challenge](https://github.com/facebookresearch/hand_tracking_toolkit?tab=readme-ov-file#evaluation); a crop camera is saved only if the hand visibility > 0.1):
Expand Down Expand Up @@ -152,6 +148,8 @@ Optional arguments:
- `--mano_model_dir` is a folder with the MANO hand model (needs to be specified if `--hand_type mano`).
- `--clip_start` and `--clip_end` can be used to specify a range of clips to consider.
- `--undistort` is a binary flag indicating whether the images should be undistorted (warped from the original fisheye cameras to pinhole cameras; disabled by default).
- `--vis_amodal_masks` is a binary flag indicating whether to visualize amodal object masks.
- `--vis_modal_masks` is a binary flag indicating whether to visualize modal object masks.

An example command to visualize Quest3 training clips (`$HOT3DC` is assumed to be a path to [HOT3D-Clips](https://huggingface.co/datasets/bop-benchmark/datasets/tree/main/hot3d)):
```
Expand Down
40 changes: 39 additions & 1 deletion hot3d/clips/clip_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def get_hand_meshes(
hand_shape: HandShapeCollection,
hand_type: str = "umetrack",
mano_model: Optional[MANOHandModel] = None,
) -> Dict[str, trimesh.Trimesh]:
) -> Dict[HandSide, trimesh.Trimesh]:
"""Provides hand meshes of specified shape and poses.
Args:
Expand All @@ -199,6 +199,7 @@ def get_hand_meshes(

meshes: Dict[HandSide, trimesh.Trimesh] = {}
for hand_side, hand_pose in hand_poses.items():
assert mano_model is not None
_, hand_verts, hand_faces = visualization.get_keypoints_and_mesh(
hand_pose=hand_pose,
hand_shape=hand_shape,
Expand Down Expand Up @@ -374,3 +375,40 @@ def vis_mask_contours(
)[0]

return cv2.drawContours(image, contours, -1, color, thickness, cv2.LINE_AA)


def encode_binary_mask_rle(mask: np.ndarray) -> Dict[str, Any]:
    """Run-length encodes a binary mask.

    The mask is flattened and every run of foreground pixels is stored as a
    (1-based start index, run length) pair; the pairs are laid out back to
    back in one flat array.

    Args:
        mask: The binary mask as an np.ndarray, expected to be 2D
            (height x width). It is cast to uint8 if it has another dtype.
    Returns:
        A dict with the mask "height" and "width" and the encoded runs
        stored under "rle".
    """

    mask_u8 = mask if mask.dtype == np.uint8 else mask.astype(np.uint8)

    # Zero sentinels on both ends make runs that touch the first or the last
    # pixel show up as value transitions as well.
    padded = np.concatenate([[0], mask_u8.flatten(), [0]])

    # 1-based positions at which the pixel value changes. For a binary mask
    # these alternate between run starts and (exclusive) run ends.
    runs = np.flatnonzero(padded[1:] != padded[:-1]) + 1

    # Turn every run end into a run length.
    runs[1::2] -= runs[::2]

    return {
        "height": mask_u8.shape[0],
        "width": mask_u8.shape[1],
        "rle": runs,
    }


def decode_binary_mask_rle(data: Dict[str, Any]) -> np.ndarray:
    """Decodes a binary mask that was encoded using `encode_binary_mask_rle`.

    Args:
        data: RLE-encoded mask (output of `encode_binary_mask_rle`): a dict
            with "height", "width" and "rle", where "rle" is a flat sequence
            of (1-based start index, run length) pairs over the flattened
            mask.
    Returns:
        The decoded mask represented as a boolean np.ndarray of shape
        (height, width).
    """

    height, width = data["height"], data["width"]

    # Parse the runs once with an integer dtype so that an empty "rle" also
    # yields integer (not float) indices.
    rle = np.asarray(data["rle"], dtype=np.int64)
    starts = rle[0::2] - 1  # Stored starts are 1-based.
    ends = starts + rle[1::2]

    # Use the builtin `bool` as dtype: the `np.bool` alias was removed in
    # NumPy 1.24 and raises AttributeError on NumPy 1.24-1.26.
    mask = np.zeros(height * width, dtype=bool)
    for lo, hi in zip(starts, ends):
        mask[lo:hi] = True

    return mask.reshape((height, width))
45 changes: 45 additions & 0 deletions hot3d/clips/vis_clips.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ def vis_clip(
hand_type: str,
mano_model: Optional[MANOHandModel],
undistort: bool,
vis_amodal_masks: bool,
vis_modal_masks: bool,
output_dir: str,
) -> None:
"""Visualizes hand and object models in GT poses for each frame of a clip.
Expand Down Expand Up @@ -115,6 +117,11 @@ def vis_clip(

# Visualize object contours.
if objects is not None:

masks_vis = None
if len(objects) and (vis_amodal_masks or vis_modal_masks):
masks_vis = np.zeros_like(image)

for instance_list in objects.values():
for instance in instance_list:
bop_id = int(instance["object_bop_id"])
Expand All @@ -140,6 +147,27 @@ def vis_clip(
# Visualize the object contour on top of the image.
image = clip_util.vis_mask_contours(image, mask, (0, 255, 0))

# Potentially load object mask.
if vis_amodal_masks or vis_modal_masks:
mask_key = (
"masks_amodal" if vis_amodal_masks else "masks_modal"
)
if stream_key in instance[mask_key]:
mask = clip_util.decode_binary_mask_rle(
instance[mask_key][stream_key]
)
masks_vis[mask] = 255

# Potentially visualize object masks.
if masks_vis is not None:
image_weight = 0.5
masks_vis = masks_vis.astype(np.float32)
image = ( # pyre-ignore
image_weight * image.astype(np.float32)
+ (1.0 - image_weight) * masks_vis
)
image = image.astype(np.uint8)

# Visualize hand contours.
for hand_mesh in hand_meshes.values():

Expand Down Expand Up @@ -195,6 +223,16 @@ def main() -> None:
default="umetrack",
help="Type of hand annotations to visualize ('umetrack' or 'mano').",
)
parser.add_argument(
"--vis_amodal_masks",
action="store_true",
help="Whether to visualize amodal masks of objects.",
)
parser.add_argument(
"--vis_modal_masks",
action="store_true",
help="Whether to visualize modal masks of objects.",
)
parser.add_argument(
"--clip_start",
type=int,
Expand All @@ -215,6 +253,11 @@ def main() -> None:
)
args = parser.parse_args()

if args.vis_amodal_masks and args.vis_modal_masks:
raise ValueError(
"Only either amodal or modal masks can be visualized at a time."
)

# Make sure the output directory exists.
os.makedirs(args.output_dir, exist_ok=True)

Expand Down Expand Up @@ -253,6 +296,8 @@ def main() -> None:
hand_type=args.hand_type,
mano_model=mano_model,
undistort=args.undistort,
vis_amodal_masks=args.vis_amodal_masks,
vis_modal_masks=args.vis_modal_masks,
output_dir=args.output_dir,
)

Expand Down

0 comments on commit bd632ef

Please sign in to comment.