Update cvat format doc to bypass warning #1177

Closed
10 changes: 7 additions & 3 deletions .github/workflows/pr_check.yml
@@ -6,7 +6,11 @@ on:
- 'develop'
- 'releases/*'
pull_request:
types: [edited, ready_for_review, opened, synchronize, reopened]
types:
- opened
- synchronize
- reopened
- ready_for_review

# This is what will cancel the workflow concurrency
concurrency:
@@ -20,8 +24,8 @@ jobs:
pr_test:
if: |
github.event.pull_request.draft == false &&
!startsWith(github.event.pull_request.title, '[WIP]') &&
!startsWith(github.event.pull_request.title, '[Dependent]')
!(startsWith(github.event.pull_request.title, '[WIP]')) &&
!(startsWith(github.event.pull_request.title, '[Dependent]'))
strategy:
fail-fast: false
matrix:
9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -23,6 +23,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fix memory unbounded Arrow data format export/import
(<https://github.com/openvinotoolkit/datumaro/pull/1169>)

## 15/09/2023 - Release 1.5.0
### New features
- Add SAMAutomaticMaskGeneration transform
(<https://github.com/openvinotoolkit/datumaro/pull/1168>)

### Bug fixes
- Coco exporter can now export annotations even when no media is present; mask annotations, which require media info, are still skipped.
  (<https://github.com/openvinotoolkit/datumaro/issues/1147>)(<https://github.com/openvinotoolkit/datumaro/pull/1158>)

## 15/09/2023 - Release 1.5.0
### New features
- Add tabular data import/export
14 changes: 13 additions & 1 deletion docs/source/docs/data-formats/formats/cvat.md
@@ -129,8 +129,20 @@ Extra options for exporting to CVAT format:
(by default `False`)
- `--image-ext IMAGE_EXT` allow to specify image extension
for exporting dataset (by default - keep original or use `.jpg`, if none)
- `--save-dataset-meta` - allow to export dataset with saving dataset meta
- `--save-dataset-meta` allow to export dataset with saving dataset meta
file (by default `False`)
- `--reindex` assign new indices to frames
- `--allow-undeclared-attrs` write annotation attributes even if they are not present in the input dataset metainfo

When performing `convert` to CVAT format, you may encounter a warning message like the following:
```bash
skipping undeclared attribute 'is_crowd' for label '<label>' (allow with --allow-undeclared-attrs option)
```
In such cases, you can bypass the warning by passing the `--allow-undeclared-attrs` option through the `export` command:
```bash
datum project export -o <output/dir> -p <path/to/project> -f cvat -- --allow-undeclared-attrs
```
With this option, the undeclared attributes are written instead of being skipped.
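The behavior behind `--allow-undeclared-attrs` can be illustrated with a small, self-contained sketch (hypothetical helper name `filter_attrs`; not Datumaro's actual implementation): an exporter keeps an annotation attribute only if it is declared in the dataset meta info for that label, or if the flag is set.

```python
# Illustrative sketch (hypothetical names, not Datumaro's actual code):
# keep an attribute only if it is declared for the label,
# or if allow_undeclared_attrs is enabled.
from typing import Dict, Set


def filter_attrs(
    attrs: Dict[str, str],
    declared: Set[str],
    allow_undeclared_attrs: bool,
) -> Dict[str, str]:
    kept = {}
    for name, value in attrs.items():
        if name in declared or allow_undeclared_attrs:
            kept[name] = value
        else:
            # This is the situation that produces the warning above
            print(f"skipping undeclared attribute '{name}'")
    return kept
```

With `allow_undeclared_attrs=False`, an attribute such as `is_crowd` that is absent from the meta info is skipped with a warning; with the flag enabled it is written as-is.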

## Examples

1 change: 1 addition & 0 deletions docs/source/docs/data-formats/formats/index.rst
@@ -101,6 +101,7 @@ Supported Data Formats
* CVAT (`for images`, `for video` (import-only))
* `Format specification <https://opencv.github.io/cvat/docs/manual/advanced/xml_format>`_
* `Dataset example <https://github.com/openvinotoolkit/datumaro/tree/develop/tests/assets/cvat_dataset>`_
* `Format documentation <cvat.md>`_
* ICDAR13/15 (``word recognition``, ``text localization``, ``text segmentation``)
* `Format specification <https://rrc.cvc.uab.es/?ch=2>`_
* `Dataset example <https://github.com/openvinotoolkit/datumaro/tree/develop/tests/assets/icdar_dataset>`_
10 changes: 10 additions & 0 deletions docs/source/docs/jupyter_notebook_examples/transform.rst
@@ -15,6 +15,7 @@ Please refer `here <https://github.com/openvinotoolkit/datumaro/blob/develop/src
notebooks/05_transform
notebooks/06_tiling
notebooks/18_bbox_to_instance_mask_using_sam
notebooks/19_automatic_instance_mask_gen_using_sam

.. grid:: 1 2 2 2
:gutter: 2
@@ -44,6 +45,15 @@ Please refer `here <https://github.com/openvinotoolkit/datumaro/blob/develop/src

This transform uses Segment Anything Model [2]_ to transform bounding box annotations to instance mask annotations.

.. grid-item-card::

.. button-ref:: notebooks/19_automatic_instance_mask_gen_using_sam
:color: primary
:outline:
:expand:

This transform uses Segment Anything Model [2]_ to generate instance mask annotations automatically.

References
^^^^^^^^^^

349 changes: 349 additions & 0 deletions notebooks/19_automatic_instance_mask_gen_using_sam.ipynb

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions src/datumaro/plugins/data_formats/coco/exporter.py
@@ -381,7 +381,7 @@ def find_instance_parts(self, group, img_width, img_height):
)

if use_masks:
if polygons:
if polygons and img_width > 0 and img_height > 0:
mask = mask_tools.rles_to_mask(polygons, img_width, img_height)

if masks:
@@ -415,14 +415,16 @@ def save_annotations(self, item):
return

if not item.media or not item.media.size:
h, w = 0, 0
log.warning(
"Item '%s': skipping writing instances " "since no image info available" % item.id
"Item '%s': Mask annotations will be skipped since no image info is available" % item.id
)
return
h, w = item.media.size
else:
h, w = item.media.size

instances = [self.find_instance_parts(i, w, h) for i in instances]

if self._context._crop_covered:
if self._context._crop_covered and w > 0 and h > 0:
instances = self.crop_segments(instances, w, h)

for instance in instances:
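The exporter change in this file boils down to: when an item has no media size, drop only the mask annotations instead of skipping the item entirely. A minimal standalone sketch of that decision (hypothetical names, not the actual Datumaro exporter code):

```python
# Sketch of the new skip logic (hypothetical helper, not Datumaro's code):
# with no image size, masks cannot be rasterized, so only they are dropped
# while bbox/polygon annotations are still exported.
from typing import List, Optional, Tuple


def exportable_anns(
    anns: List[str],  # annotation kinds, e.g. "bbox", "polygon", "mask"
    media_size: Optional[Tuple[int, int]],
) -> List[str]:
    h, w = media_size if media_size else (0, 0)
    if h > 0 and w > 0:
        return anns  # full image info available: export everything
    # No image info: keep everything except masks
    return [a for a in anns if a != "mask"]
```

Applied to the diff above, this is why `h, w` now falls back to `(0, 0)` with a warning rather than returning early.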
3 changes: 2 additions & 1 deletion src/datumaro/plugins/sam_transforms/__init__.py
@@ -2,6 +2,7 @@
#
# SPDX-License-Identifier: MIT
"""Transforms using Segment Anything Model"""
from .automatic_mask_gen import SAMAutomaticMaskGeneration
from .bbox_to_inst_mask import SAMBboxToInstanceMask

__all__ = ["SAMBboxToInstanceMask"]
__all__ = ["SAMBboxToInstanceMask", "SAMAutomaticMaskGeneration"]
181 changes: 181 additions & 0 deletions src/datumaro/plugins/sam_transforms/automatic_mask_gen.py
@@ -0,0 +1,181 @@
# Copyright (C) 2023 Intel Corporation
#
# SPDX-License-Identifier: MIT
"""Automatic mask generation using Segment Anything Model"""

import os.path as osp
from typing import List, Optional

import numpy as np

import datumaro.plugins.sam_transforms.interpreters.sam_decoder_for_amg as sam_decoder_for_amg
import datumaro.plugins.sam_transforms.interpreters.sam_encoder as sam_encoder_interp
from datumaro.components.cli_plugin import CliPlugin
from datumaro.components.dataset_base import DatasetItem, IDataset
from datumaro.components.transformer import ModelTransform
from datumaro.plugins.inference_server_plugin import OVMSLauncher, TritonLauncher
from datumaro.plugins.inference_server_plugin.base import (
InferenceServerType,
ProtocolType,
TLSConfig,
)
from datumaro.plugins.sam_transforms.interpreters.sam_decoder_for_amg import AMGMasks, AMGPoints

__all__ = ["SAMAutomaticMaskGeneration"]


class SAMAutomaticMaskGeneration(ModelTransform, CliPlugin):
    """Produce instance segmentation masks automatically using Segment Anything Model (SAM).

    This transform can produce instance segmentation mask annotations for each given image.
    It samples single-point input prompts on a uniform 2D grid over the image.
    For each prompt, SAM can predict multiple masks. After obtaining the mask candidates,
    it post-processes them using the given parameters to improve quality and remove duplicates.

    It uses the Segment Anything Model deployed in the OpenVINO™ Model Server
    or NVIDIA Triton™ Inference Server instance. To launch the server instance,
    please see the guide in this link:
    https://github.com/openvinotoolkit/datumaro/tree/develop/docker/segment-anything/README.md

    Parameters:
        extractor: Dataset to transform
        inference_server_type: Inference server type:
            `InferenceServerType.ovms` or `InferenceServerType.triton`
        host: Host address of the server instance
        port: Port number of the server instance
        timeout: Timeout limit during communication between the client and the server instance
        tls_config: Configuration required if the server instance is in the secure mode
        protocol_type: Communication protocol type with the server instance
        num_workers: The number of worker threads to use for parallel inference.
            Set to 0 for single-process mode. Default is 0.
        points_per_side (int): The number of points to be sampled
            along one side of the image. The total number of points is
            points_per_side**2 on a uniform 2d grid.
        points_per_batch (int): Sets the number of points run simultaneously
            by the model. Higher numbers may be faster but use more GPU memory.
        pred_iou_thresh (float): A filtering threshold in [0,1], using the
            model's predicted mask quality.
        stability_score_thresh (float): A filtering threshold in [0,1], using
            the stability of the mask under changes to the cutoff used to binarize
            the model's mask predictions.
        stability_score_offset (float): The amount to shift the cutoff when
            calculating the stability score.
        box_nms_thresh (float): The box IoU cutoff used by non-maximal
            suppression to filter duplicate masks.
        min_mask_region_area (int): If >0, postprocessing removes binary masks
            with fewer than min_mask_region_area foreground pixels.
    """

    def __init__(
        self,
        extractor: IDataset,
        inference_server_type: InferenceServerType = InferenceServerType.ovms,
        host: str = "localhost",
        port: int = 9000,
        timeout: float = 10.0,
        tls_config: Optional[TLSConfig] = None,
        protocol_type: ProtocolType = ProtocolType.grpc,
        num_workers: int = 0,
        points_per_side: int = 32,
        points_per_batch: int = 128,
        mask_threshold: float = 0.0,
        pred_iou_thresh: float = 0.88,
        stability_score_thresh: float = 0.95,
        stability_score_offset: float = 1.0,
        box_nms_thresh: float = 0.7,
        min_mask_region_area: int = 0,
    ):
        if inference_server_type == InferenceServerType.ovms:
            launcher_cls = OVMSLauncher
        elif inference_server_type == InferenceServerType.triton:
            launcher_cls = TritonLauncher
        else:
            raise ValueError(inference_server_type)

        self._sam_encoder_launcher = launcher_cls(
            model_name="sam_encoder",
            model_interpreter_path=osp.abspath(sam_encoder_interp.__file__),
            model_version=1,
            host=host,
            port=port,
            timeout=timeout,
            tls_config=tls_config,
            protocol_type=protocol_type,
        )
        self._sam_decoder_launcher = launcher_cls(
            model_name="sam_decoder",
            model_interpreter_path=osp.abspath(sam_decoder_for_amg.__file__),
            model_version=1,
            host=host,
            port=port,
            timeout=timeout,
            tls_config=tls_config,
            protocol_type=protocol_type,
        )

        self.points_per_side = points_per_side
        self.points_per_batch = points_per_batch
        self.mask_threshold = mask_threshold
        self.pred_iou_thresh = pred_iou_thresh
        self.stability_score_offset = stability_score_offset
        self.stability_score_thresh = stability_score_thresh
        self.box_nms_thresh = box_nms_thresh
        self.min_mask_region_area = min_mask_region_area

        super().__init__(
            extractor,
            launcher=self._sam_encoder_launcher,
            batch_size=1,
            append_annotation=False,
            num_workers=num_workers,
        )

    @property
    def points_per_side(self) -> int:
        return self._points_per_side

    @points_per_side.setter
    def points_per_side(self, points_per_side: int) -> None:
        points_y = (np.arange(points_per_side) + 0.5) / points_per_side
        points_x = (np.arange(points_per_side) + 0.5) / points_per_side

        points_x = np.tile(points_x[None, :], (points_per_side, 1))
        points_y = np.tile(points_y[:, None], (1, points_per_side))
        self._points_grid = np.stack([points_x, points_y], axis=-1).reshape(-1, 2)
        self._points_per_side = points_per_side

    def _process_batch(
        self,
        batch: List[DatasetItem],
    ) -> List[DatasetItem]:
        img_embeds = self._sam_encoder_launcher.launch(
            batch=[item for item in batch if self._sam_encoder_launcher.type_check(item)]
        )

        items = []
        for item, img_embed in zip(batch, img_embeds):
            amg_masks: List[AMGMasks] = []

            for i in range(0, len(self._points_grid), self.points_per_batch):
                amg_points = [AMGPoints(points=self._points_grid[i : i + self.points_per_batch])]
                item_to_decode = item.wrap(annotations=amg_points + img_embed)

                # Nested list of masks: [[mask_0, ...]]
                nested_masks: List[List[AMGMasks]] = self._sam_decoder_launcher.launch(
                    [item_to_decode],
                    stack=False,
                )
                amg_masks += nested_masks[0]

            mask_anns = AMGMasks.cat(amg_masks).postprocess(
                mask_threshold=self.mask_threshold,
                pred_iou_thresh=self.pred_iou_thresh,
                stability_score_offset=self.stability_score_offset,
                stability_score_thresh=self.stability_score_thresh,
                box_nms_thresh=self.box_nms_thresh,
                min_mask_region_area=self.min_mask_region_area,
            )

            items.append(item.wrap(annotations=mask_anns))

        return items
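The `points_per_side` setter above builds a uniform grid of normalized single-point prompts. Restating that grid construction as a standalone function (the name `build_point_grid` is illustrative) makes the sampling easy to inspect:

```python
import numpy as np


def build_point_grid(points_per_side: int) -> np.ndarray:
    """Uniform 2D grid of normalized (x, y) prompt coordinates in (0, 1)."""
    # Cell-centered coordinates along one axis, e.g. 0.25, 0.75 for n=2
    coords = (np.arange(points_per_side) + 0.5) / points_per_side
    # Broadcast to a full grid: xs vary along columns, ys along rows
    xs = np.tile(coords[None, :], (points_per_side, 1))
    ys = np.tile(coords[:, None], (1, points_per_side))
    # Flatten to (points_per_side**2, 2) rows of (x, y)
    return np.stack([xs, ys], axis=-1).reshape(-1, 2)
```

Each of the `points_per_side**2` rows is later scaled to the image size and sent to the SAM decoder in chunks of `points_per_batch`.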