From ad83fa3aab8e21bb7d9d5ba18b51a6b91836df12 Mon Sep 17 00:00:00 2001
From: sooahleex <sooah.lee@intel.com>
Date: Sun, 1 Sep 2024 13:22:13 +0900
Subject: [PATCH] Update comments and docs

---
 .../context_free/transform.md                 | 31 +++++++++++++++++++
 src/datumaro/plugins/transforms.py            | 21 +++++++++++++
 tests/unit/test_transforms.py                 |  7 ++---
 3 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/docs/source/docs/command-reference/context_free/transform.md b/docs/source/docs/command-reference/context_free/transform.md
index c33bdc8359..6ecbdbc384 100644
--- a/docs/source/docs/command-reference/context_free/transform.md
+++ b/docs/source/docs/command-reference/context_free/transform.md
@@ -102,6 +102,7 @@ Basic dataset item manipulations:
 - [`remove_annotations`](#remove_annotations) - Removes annotations
 - [`remove_attributes`](#remove_attributes) - Removes attributes
 - [`astype_annotations`](#astype_annotations) - Convert annotation type
+- [`pseudo_labeling`](#pseudo_labeling) - Generates pseudo-labels for unlabeled data
 
 Subset manipulations:
 - [`random_split`](#random_split) - Splits dataset into subsets
@@ -838,3 +839,33 @@ correct [-h] [-r REPORT_PATH]
 Optional arguments:
 - `-h`, `--help` (flag) - Show this help message and exit
 - `-r`, `--reports` (str) - A validation report from a 'validate' CLI (default=validation_reports.json)
+
+#### `pseudo_labeling`
+
+Assigns pseudo-labels to items in a dataset based on their similarity to predefined labels. This transform is useful for semi-supervised learning when dealing with missing or uncertain labels.
+
+The process includes:
+
+- Similarity Computation: Uses hashing techniques to compute the similarity between items and predefined labels.
+- Pseudo-Label Assignment: Assigns the most similar label as a pseudo-label to each item.
+
+Attributes:
+
+- `extractor` (IDataset) - Provides access to dataset items and their annotations.
+- `labels` (Optional[List[str]]) - List of predefined labels for pseudo-labeling. Defaults to all available labels if not provided.
+- `explorer` (Optional[Explorer]) - Computes hash keys for items and labels. If not provided, a new Explorer is created.
+
+Usage:
+```console
+pseudo_labeling [-h] [--labels LABELS]
+```
+
+Optional arguments:
+- `-h`, `--help` (flag) - Show this help message and exit
+- `--labels` (str) - Comma-separated list of label names for pseudo-labeling
+
+Examples:
+- Assign pseudo-labels based on predefined labels
+  ```console
+  datum transform -t pseudo_labeling -- --labels 'label1,label2'
+  ```
diff --git a/src/datumaro/plugins/transforms.py b/src/datumaro/plugins/transforms.py
index b95f941284..62b17288fb 100644
--- a/src/datumaro/plugins/transforms.py
+++ b/src/datumaro/plugins/transforms.py
@@ -2010,6 +2010,27 @@ def transform_item(self, item):
 
 
 class PseudoLabeling(ItemTransform):
+    """
+    A class used to assign pseudo-labels to items in a dataset based on
+    their similarity to predefined labels.|n
+    |n
+    This class leverages hashing techniques to compute the similarity
+    between dataset items and a set of predefined labels.|n
+    It assigns the most similar label as a pseudo-label to each item.
+    This is particularly useful in semi-supervised
+    learning scenarios where some labels are missing or uncertain.|n
+    |n
+    Attributes:|n
+        - extractor : IDataset|n
+        The dataset extractor that provides access to dataset items and their annotations.|n
+        - labels : Optional[List[str]]|n
+        A list of label names to be used for pseudo-labeling.
+        If not provided, all available labels in the dataset will be used.|n
+        - explorer : Optional[Explorer]|n
+        An optional Explorer object used to compute hash keys for items and labels.
+        If not provided, a new Explorer will be created.|n
+    """
+
     def __init__(
         self,
         extractor: IDataset,
diff --git a/tests/unit/test_transforms.py b/tests/unit/test_transforms.py
index ce538c9a00..25f01caff8 100644
--- a/tests/unit/test_transforms.py
+++ b/tests/unit/test_transforms.py
@@ -2,6 +2,7 @@
 
 import argparse
 import logging as log
+import os
 import os.path as osp
 import random
 from unittest import TestCase
@@ -14,6 +15,7 @@
 
 import datumaro.plugins.transforms as transforms
 import datumaro.util.mask_tools as mask_tools
+from datumaro.components.algorithms.hash_key_inference.explorer import Explorer
 from datumaro.components.annotation import (
     AnnotationType,
     Bbox,
@@ -1675,11 +1677,6 @@ def test_transform_clean_after_astype_ann(self):
             self.assertEqual(expected_item.media, result_item.media)
 
 
-import os
-
-from datumaro.components.algorithms.hash_key_inference.explorer import Explorer
-
-
 class PseudoLabelingTest(TestCase):
     def setUp(self):
         self.data_path = get_test_asset_path("explore_dataset")