Skip to content

Commit

Permalink
Merge pull request #22 from Relifest/main
Browse files Browse the repository at this point in the history
Modification and testing of stac_converter functionality and resolution of issues in the previous pull request
  • Loading branch information
Relifest authored Nov 23, 2024
2 parents e808920 + ea5d650 commit f01f98c
Show file tree
Hide file tree
Showing 18 changed files with 835 additions and 183 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
${{ runner.os }}-pip-
- name: Install dependencies
run: pip install .[dev] && pip install .[all]
run: pip install pytest && pip install .[all]

- name: Run tests
run: pytest
59 changes: 35 additions & 24 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,42 +46,53 @@ The training dataset can also be encoded to TrainingDML-AI JSON format with Pyth

```python
from pytdml.io import write_to_json
from pytdml.type import EOTrainingDataset, EOTrainingData, EOTask, EODataSource, SceneLabel
from pytdml.type import EOTrainingDataset, AI_EOTrainingData, AI_EOTask, AI_SceneLabel

# generate EO training dataset
dataset = EOTrainingDataset(
id='...',
name='...',
description='...',
license='...',
tasks=[
AI_EOTask(
id='...',
task_type='...'),
...
],
data=[
EOTrainingData(
AI_EOTrainingData(
id='...',
data_URL='...',
labels=[
SceneLabel(
label_class='...',
data_url='...',
date_time='...'),
AI_SceneLabel(
label_class='...'
),
...
]),
...
],
version="...",

amount_of_training_data=...,
created_time="...",
updated_time="...",
providers=["..."],
keywords=["...", "..."],
tasks=[EOTask(task_type="...",
description="...")],
data_sources=[EODataSource(
id="...",
data_type="...",
resolution="..."
)],
classes=["...", "...", "..."],
classification_scheme='...',
created_time="...",
data_sources=['...'],
doi='...',
keywords=['...', '...'],
number_of_classes=...,
bands=["...", "...", "..."],
image_size="..."
providers=['...'],
scope=...,
statistics_info=[...],
updated_time='...',
version='...',
labeling=[...],
metrics_in_LIT=[...],
quality=[...],
changesets=[...],
bands=[...],
extent=[...],
image_size='...'
)
# write to json
write_to_json(dataset, "dataset.json")
Expand Down Expand Up @@ -362,14 +373,14 @@ for e in range(100):

### Convert other EO dataset formats to TrainingDML-AI format

- convert coco format to TrainingDML-AI format:
- convert stac format to TrainingDML-AI format:

```python
from pytdml.convert_utils import convert_coco_to_tdml,convert_stac_to_tdml
from pytdml.io.stac_converter import convert_stac_to_tdml

coco_path = "/mnt/example/coco_file.json"
stac_path = "/mnt/example/stac_file.json"
output_path = "convert_result.json"

convert_coco_to_tdml(coco_path, output_path)
dataset = convert_stac_to_tdml(stac_path)
```

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ dependencies = [
"Pillow~=10.4.0",
"PyYAML~=6.0",
"numpy~=1.26.4",
"pandas~=2.2.2",
"opencv-python~=4.10.0.84",
"tensorflow~=2.17.0",
"jsonschema~=4.23.0",
Expand All @@ -21,7 +20,8 @@ dependencies = [
"matplotlib~=3.9.1",
"minio~=7.2.7",
"tqdm~=4.66.4",
"s3fs~=2024.9.0"
"s3fs~=2024.9.0",
"pystac~=1.10.1"
]

[project.optional-dependencies]
Expand Down
93 changes: 0 additions & 93 deletions pytdml/convert_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,96 +152,3 @@ def convert_coco_to_tdml(coco_dataset_path, output_json_path):
write_to_json(dataset, output_json_path)


def convert_stac_to_tdml(stac_dataset_path, output_json_path):
    """Convert a STAC collection JSON file to a TrainingDML-AI JSON file.

    Reads a STAC collection document, follows each of its ``rel == "item"``
    links to a local STAC item file, maps segmentation items onto
    ``AI_EOTrainingData`` records, and writes the resulting
    ``EOTrainingDataset`` to *output_json_path*.

    Parameters
    ----------
    stac_dataset_path : str
        Path to a STAC collection JSON document whose item links reference
        local STAC item files.
    output_json_path : str
        Path the converted TrainingDML-AI JSON document is written to.
    """
    with open(stac_dataset_path, 'r') as stac_file:
        stac_collection_dataset = json.load(stac_file)

    # Start of timer (used only for the throughput report below).
    start_time = time.time()

    dataset_id = stac_collection_dataset.get("id")
    dataset_description = stac_collection_dataset.get("description")
    dataset_name = stac_collection_dataset.get("title")
    dataset_version = stac_collection_dataset.get("version")

    keywords = stac_collection_dataset.get("keywords")
    license_str = stac_collection_dataset.get("license")
    extents = stac_collection_dataset.get("extent")

    # First (overall) spatial bounding box of the collection.
    extent = extents.get("spatial").get("bbox")[0]

    providers = [item["name"] for item in stac_collection_dataset.get("providers")]

    # Temporal interval endpoints.  Strip a trailing 'Z' (UTC marker) only
    # when present, instead of blindly dropping the last character, and
    # tolerate None endpoints of open intervals.
    interval = extents.get("temporal").get("interval")[0]
    created_time = interval[0].rstrip("Z") if interval[0] else interval[0]
    updated_time = interval[1].rstrip("Z") if interval[1] else interval[1]

    datas = [item for item in stac_collection_dataset.get("links") if item["rel"] == "item"]
    amount_of_training_data = len(datas)
    td_list = []

    task_name = ""
    # Defined up front so the post-loop rename below is safe even when the
    # collection has no items.
    label_classes = []

    for data in datas:
        item_path = data.get("href")
        with open(item_path, 'r') as itemfile:
            stac_item = json.load(itemfile)
        properties = stac_item.get("properties")
        assets = stac_item.get("assets")
        task_type = properties.get("label:tasks")[0]  # first entry of the task list
        label_classes = properties.get("label:classes")  # list of dicts
        item_extent = stac_item.get("bbox")
        label_path = assets["labels"].get("href")
        label_type = assets["labels"].get("type")
        item_id = stac_item.get("id")
        img_path = assets["raster"].get("href")
        data_url = []

        # Only segmentation items are converted; other task types are skipped.
        if task_type == "segmentation":
            task_name = "semantic segmentation"
            data_url.append(img_path)
            labels = [AI_PixelLabel(confidence=1.0, type="AI_PixelLabel",
                                    image_URL=[label_path],
                                    image_format=[label_type])]
            td_list.append(
                AI_EOTrainingData(id=item_id, type="AI_EOTrainingData",
                                  training_type="Train", dataset_id=dataset_id,
                                  number_of_labels=1, labels=labels,
                                  extent=item_extent, data_URL=data_url))

    # STAC names the class field 'classes'; TrainingDML-AI expects 'value'.
    for class_dict in label_classes:
        class_dict['value'] = class_dict.pop('classes')

    tasks = [AI_EOTask(task_type=task_name,
                       id=str(dataset_id) + "_task",
                       dataset_id=str(dataset_id),
                       type='AI_EOTask')]

    # End of timer; report total and per-item conversion time.
    end_time = time.time()
    total_time = end_time - start_time
    print(f"Total time for {amount_of_training_data} training instances: {total_time:.5f} seconds")
    if amount_of_training_data:  # avoid ZeroDivisionError on an empty collection
        average_time = total_time / amount_of_training_data
        # Seconds -> milliseconds is * 1000 (the previous * 60 mislabelled the unit).
        print(f"Average time per training instance: {average_time * 1000:.5f} ms")

    dataset = EOTrainingDataset(
        id=str(dataset_id),
        name=dataset_name,
        type="AI_EOTrainingDataset",
        description=dataset_description,
        tasks=tasks,
        version=dataset_version,
        amount_of_training_data=amount_of_training_data,
        created_time=created_time,
        updated_time=updated_time,
        providers=providers,
        keywords=keywords,
        classes=label_classes,
        number_of_classes=len(label_classes),
        license=license_str,
        data=td_list,
        extent=extent
    )
    # Write the converted dataset to disk.
    write_to_json(dataset, output_json_path)

120 changes: 120 additions & 0 deletions pytdml/io/stac_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
#
# Project: pytdml
# Authors: Boyi Shangguan, Kaixuan Wang, Zhaoyan Wu
# Created: 2022-05-04
# Modified: 2023-10-27
# Email: [email protected]
#
# ------------------------------------------------------------------------------
#
# Copyright (c) 2022 OGC Training Data Markup Language for AI Standard Working Group
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# ------------------------------------------------------------------------------

import json
import re
from datetime import datetime
from geojson import Feature
from pystac import Collection
from pytdml.type import EOTrainingDataset, AI_EOTrainingData, AI_ObjectLabel, AI_EOTask


def convert_stac_to_tdml(stac_dataset_path):
    """Convert a STAC collection JSON file to an ``EOTrainingDataset``.

    Reads a STAC collection document, validates it through pystac, follows
    each ``rel == "item"`` link to a local STAC item file, and maps the
    items onto ``AI_EOTrainingData`` records.

    Parameters
    ----------
    stac_dataset_path : str
        Path to a STAC collection JSON document whose item links reference
        local STAC item files.

    Returns
    -------
    EOTrainingDataset
        The converted TrainingDML-AI dataset (not written to disk).
    """
    # Round-trip the document through pystac so it is validated and hrefs
    # are normalised.
    with open(stac_dataset_path, 'r') as file:
        collection_data = json.load(file)
    collection_object = Collection.from_dict(collection_data)
    stac_collection_dataset = collection_object.to_dict(include_self_link=False, transform_hrefs=True)

    # Required collection attributes mapped onto EOTrainingDataset fields.
    collection_version = stac_collection_dataset.get("stac_version")
    collection_id = stac_collection_dataset.get("id")
    collection_description = stac_collection_dataset.get("description")
    collection_license = stac_collection_dataset.get("license")
    collection_bbox = stac_collection_dataset.get("extent").get("spatial").get("bbox")
    collection_interval = stac_collection_dataset.get("extent").get("temporal").get("interval")

    # Normalise every temporal endpoint to 'YYYY-MM-DDTHH:MM:SS'.
    data_time = []
    for interval in collection_interval:
        for timestamp in interval:
            if timestamp is None:
                # Open-ended interval endpoint; nothing to record.
                continue
            # Strip a UTC suffix ('+00:00' or 'Z').  The previous pattern
            # r"(\\+00:00|Z)$" matched a literal backslash before '+00:00',
            # so that branch never fired.
            cleaned_date_time_str = re.sub(r"(\+00:00|Z)$", "", timestamp)
            # fromisoformat accepts timestamps with or without fractional
            # seconds, unlike a single strptime format string.
            date_time_obj = datetime.fromisoformat(cleaned_date_time_str)
            data_time.append(date_time_obj.strftime("%Y-%m-%dT%H:%M:%S"))

    # A single bbox is used as-is; multiple bboxes are flattened into one list.
    if len(collection_bbox) == 1:
        collection_extent = collection_bbox[0]
    else:
        collection_extent = [item for bbox in collection_bbox for item in bbox]

    # Each 'item' link points at a local STAC item file holding one sample.
    collection_links = stac_collection_dataset.get("links")
    collection_filtered_links = [link for link in collection_links if link.get("rel") == "item"]

    datalist = []
    for link in collection_filtered_links:
        item_path = link.get("href")
        with open(item_path, 'r') as item_file:
            stac_item = json.load(item_file)
        link_id = stac_item.get("id")
        link_rel = link.get("rel")
        # Wrap the whole item as a GeoJSON feature for the label geometry.
        feature = Feature(**stac_item)
        link_href = [asset['href'] for asset in stac_item.get("assets").values()]

        label = AI_ObjectLabel(
            type="AI_ObjectLabel",
            object=feature,
            label_class=link_rel
        )

        data = AI_EOTrainingData(
            type="AI_EOTrainingData",
            id=link_id,
            labels=[label],
            data_URL=link_href,
            data_time=data_time
        )
        datalist.append(data)

    # Optional collection attributes.
    collection_name = stac_collection_dataset.get("title")

    tasks = [AI_EOTask(task_type="STAC",
                       id=str(collection_id) + "_task",
                       dataset_id=str(collection_id),
                       type='AI_EOTask')]

    dataset = EOTrainingDataset(
        # required attributes
        id=str(collection_id),
        name=collection_name,
        description=collection_description,
        license=collection_license,
        tasks=tasks,
        data=datalist,
        type="AI_EOTrainingDataset",
        # optional attributes
        version=collection_version,
        extent=collection_extent
    )

    return dataset
2 changes: 1 addition & 1 deletion pytdml/type/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
from .basic_types import TrainingDataset
from .extended_types import AI_PixelLabel
from .extended_types import AI_ObjectLabel
from .extended_types import AI_ObjectLabel
from .extended_types import AI_SceneLabel
from .extended_types import AI_EOTask
from .extended_types import AI_EOTrainingData
from .extended_types import EOTrainingDataset
6 changes: 3 additions & 3 deletions pytdml/type/basic_types.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
#
# Project: pytdml
# Authors: Boyi Shangguan, Kaixuan Wang, Zhaoyan Wu
Expand Down Expand Up @@ -1289,8 +1290,8 @@ class AI_Label(BaseCamelModel):

type: Literal["AI_AbstractLabel"]

is_negative: Optional[bool] = None
confidence: Optional[float] = Field(None, ge=0.0, le=1.0)
is_negative: Optional[bool] = Field(False)
confidence: Optional[float] = Field(1.0, ge=0.0, le=1.0)

def to_dict(self):
return self.model_dump(by_alias=True, exclude_none=True)
Expand Down Expand Up @@ -1329,7 +1330,6 @@ def to_dict(self):
def from_dict(json_dict):
from pytdml.type.extended_types import AI_PixelLabel, AI_ObjectLabel, AI_SceneLabel
new_dict = copy.deepcopy(json_dict)
new_dict = copy.deepcopy(json_dict)
if new_dict.__contains__('labels'):
labels = new_dict['labels']
for i in range(len(labels)):
Expand Down
4 changes: 1 addition & 3 deletions pytdml/yaml_to_tdml.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,5 @@ def main():
if training_datasets:
write_to_json(training_datasets, json_path)


if __name__ == '__main__':
result = yaml_to_eo_tdml("D:\\Project\\pyTDML3\\pytdml\\pytdml\\type\\UiT_HCD_California_2017.yml")
print(result.to_dict())
main()
Loading

0 comments on commit f01f98c

Please sign in to comment.