From ad83fa3aab8e21bb7d9d5ba18b51a6b91836df12 Mon Sep 17 00:00:00 2001 From: sooahleex Date: Sun, 1 Sep 2024 13:22:13 +0900 Subject: [PATCH] Update comments and docs --- .../context_free/transform.md | 31 +++++++++++++++++++ src/datumaro/plugins/transforms.py | 21 +++++++++++++ tests/unit/test_transforms.py | 7 ++--- 3 files changed, 54 insertions(+), 5 deletions(-) diff --git a/docs/source/docs/command-reference/context_free/transform.md b/docs/source/docs/command-reference/context_free/transform.md index c33bdc8359..6ecbdbc384 100644 --- a/docs/source/docs/command-reference/context_free/transform.md +++ b/docs/source/docs/command-reference/context_free/transform.md @@ -102,6 +102,7 @@ Basic dataset item manipulations: - [`remove_annotations`](#remove_annotations) - Removes annotations - [`remove_attributes`](#remove_attributes) - Removes attributes - [`astype_annotations`](#astype_annotations) - Convert annotation type +- [`pseudo_labeling`](#pseudo_labeling) - Generate pseudo-labels for unlabeled data Subset manipulations: - [`random_split`](#random_split) - Splits dataset into subsets @@ -838,3 +839,33 @@ correct [-h] [-r REPORT_PATH] Optional arguments: - `-h`, `--help` (flag) - Show this help message and exit - `-r`, `--reports` (str) - A validation report from a 'validate' CLI (default=validation_reports.json) + +#### `pseudo_labeling` + +Assigns pseudo-labels to items in a dataset based on their similarity to predefined labels. This class is useful for semi-supervised learning when dealing with missing or uncertain labels. + +The process includes: + +- Similarity Computation: Uses hashing techniques to compute the similarity between items and predefined labels. +- Pseudo-Label Assignment: Assigns the most similar label as a pseudo-label to each item. + +Attributes: + +- `extractor` (IDataset) - Provides access to dataset items and their annotations. +- `labels` (Optional[List[str]]) - List of predefined labels for pseudo-labeling. 
Defaults to all available labels if not provided. +- `explorer` (Optional[Explorer]) - Computes hash keys for items and labels. If not provided, a new Explorer is created. + +Usage: +```console +pseudo_labeling [-h] [--labels LABELS] +``` + +Optional arguments: +- `-h`, `--help` (flag) - Show this help message and exit +- `--labels` (str) - Comma-separated list of label names for pseudo-labeling + +Examples: +- Assign pseudo-labels based on predefined labels + ```console + datum transform -t pseudo_labeling -- --labels 'label1,label2' + ``` diff --git a/src/datumaro/plugins/transforms.py b/src/datumaro/plugins/transforms.py index b95f941284..62b17288fb 100644 --- a/src/datumaro/plugins/transforms.py +++ b/src/datumaro/plugins/transforms.py @@ -2010,6 +2010,27 @@ def transform_item(self, item): class PseudoLabeling(ItemTransform): + """ + A class used to assign pseudo-labels to items in a dataset based on + their similarity to predefined labels.|n + |n + This class leverages hashing techniques to compute the similarity + between dataset items and a set of predefined labels.|n + It assigns the most similar label as a pseudo-label to each item. + This is particularly useful in semi-supervised + learning scenarios where some labels are missing or uncertain.|n + |n + Attributes:|n + - extractor : IDataset|n + The dataset extractor that provides access to dataset items and their annotations.|n + - labels : Optional[List[str]]|n + A list of label names to be used for pseudo-labeling. + If not provided, all available labels in the dataset will be used.|n + - explorer : Optional[Explorer]|n + An optional Explorer object used to compute hash keys for items and labels. 
+ If not provided, a new Explorer will be created.|n + """ + def __init__( self, extractor: IDataset, diff --git a/tests/unit/test_transforms.py b/tests/unit/test_transforms.py index ce538c9a00..25f01caff8 100644 --- a/tests/unit/test_transforms.py +++ b/tests/unit/test_transforms.py @@ -2,6 +2,7 @@ import argparse import logging as log +import os import os.path as osp import random from unittest import TestCase @@ -14,6 +15,7 @@ import datumaro.plugins.transforms as transforms import datumaro.util.mask_tools as mask_tools +from datumaro.components.algorithms.hash_key_inference.explorer import Explorer from datumaro.components.annotation import ( AnnotationType, Bbox, @@ -1675,11 +1677,6 @@ def test_transform_clean_after_astype_ann(self): self.assertEqual(expected_item.media, result_item.media) -import os - -from datumaro.components.algorithms.hash_key_inference.explorer import Explorer - - class PseudoLabelingTest(TestCase): def setUp(self): self.data_path = get_test_asset_path("explore_dataset")