Skip to content

Commit

Permalink
Merge pull request #22 from Relifest/main
Browse files Browse the repository at this point in the history
Modification and testing of stac_converter functionality and resolution of issues in the previous pull request
  • Loading branch information
Relifest authored Nov 23, 2024
2 parents e808920 + ea5d650 commit f01f98c
Show file tree
Hide file tree
Showing 18 changed files with 835 additions and 183 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
${{ runner.os }}-pip-
- name: Install dependencies
run: pip install .[dev] && pip install .[all]
run: pip install pytest && pip install .[all]

- name: Run tests
run: pytest
59 changes: 35 additions & 24 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,42 +46,53 @@ The training dataset can also be encoded to TrainingDML-AI JSON format with Pyth

```python
from pytdml.io import write_to_json
from pytdml.type import EOTrainingDataset, EOTrainingData, EOTask, EODataSource, SceneLabel
from pytdml.type import EOTrainingDataset, AI_EOTrainingData, AI_EOTask, AI_SceneLabel

# generate EO training dataset
dataset = EOTrainingDataset(
id='...',
name='...',
description='...',
license='...',
tasks=[
AI_EOTask(
id='...',
task_type='...'),
...
],
data=[
EOTrainingData(
AI_EOTrainingData(
id='...',
data_URL='...',
labels=[
SceneLabel(
label_class='...',
data_url='...',
date_time='...'),
AI_SceneLabel(
label_class='...'
),
...
]),
...
],
version="...",

amount_of_training_data=...,
created_time="...",
updated_time="...",
providers=["..."],
keywords=["...", "..."],
tasks=[EOTask(task_type="...",
description="...")],
data_sources=[EODataSource(
id="...",
data_type="...",
resolution="..."
)],
classes=["...", "...", "..."],
classification_scheme='...',
created_time="...",
data_sources=['...'],
doi='...',
keywords=['...', '...'],
number_of_classes=...,
bands=["...", "...", "..."],
image_size="..."
providers=['...'],
scope=...,
statistics_info=[...],
updated_time='...',
version='...',
labeling=[...],
metrics_in_LIT=[...],
quality=[...],
changesets=[...],
bands=[...],
extent=[...],
image_size='...'
)
# write to json
write_to_json(dataset, "dataset.json")
Expand Down Expand Up @@ -362,14 +373,14 @@ for e in range(100):

### Convert other EO dataset formats to TrainingDML-AI format

- convert coco format to TrainingDML-AI format:
- convert stac format to TrainingDML-AI format:

```python
from pytdml.convert_utils import convert_coco_to_tdml,convert_stac_to_tdml
from pytdml.io.stac_converter import convert_stac_to_tdml

coco_path = "/mnt/example/coco_file.json"
stac_path = "/mnt/example/stac_file.json"
output_path = "convert_result.json"

convert_coco_to_tdml(coco_path, output_path)
dataset = convert_stac_to_tdml(stac_path)
```

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ dependencies = [
"Pillow~=10.4.0",
"PyYAML~=6.0",
"numpy~=1.26.4",
"pandas~=2.2.2",
"opencv-python~=4.10.0.84",
"tensorflow~=2.17.0",
"jsonschema~=4.23.0",
Expand All @@ -21,7 +20,8 @@ dependencies = [
"matplotlib~=3.9.1",
"minio~=7.2.7",
"tqdm~=4.66.4",
"s3fs~=2024.9.0"
"s3fs~=2024.9.0",
"pystac~=1.10.1"
]

[project.optional-dependencies]
Expand Down
93 changes: 0 additions & 93 deletions pytdml/convert_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,96 +152,3 @@ def convert_coco_to_tdml(coco_dataset_path, output_json_path):
write_to_json(dataset, output_json_path)


def convert_stac_to_tdml(stac_dataset_path, output_json_path):
    """Convert a STAC collection JSON file to a TrainingDML-AI JSON file.

    Reads a STAC collection document, follows each of its ``rel == "item"``
    links to a local STAC item file, maps segmentation items onto
    ``AI_EOTrainingData`` records, and writes the resulting
    ``EOTrainingDataset`` to *output_json_path*.

    Parameters
    ----------
    stac_dataset_path : str
        Path to a STAC collection JSON document whose item links reference
        local STAC item files.
    output_json_path : str
        Path the converted TrainingDML-AI JSON document is written to.
    """
    with open(stac_dataset_path, 'r') as stac_file:
        stac_collection_dataset = json.load(stac_file)

    # Start of timer (used only for the throughput report below).
    start_time = time.time()

    dataset_id = stac_collection_dataset.get("id")
    dataset_description = stac_collection_dataset.get("description")
    dataset_name = stac_collection_dataset.get("title")
    dataset_version = stac_collection_dataset.get("version")

    keywords = stac_collection_dataset.get("keywords")
    license_str = stac_collection_dataset.get("license")
    extents = stac_collection_dataset.get("extent")

    # First (overall) spatial bounding box of the collection.
    extent = extents.get("spatial").get("bbox")[0]

    providers = [item["name"] for item in stac_collection_dataset.get("providers")]

    # Temporal interval endpoints.  Strip a trailing 'Z' (UTC marker) only
    # when present, instead of blindly dropping the last character, and
    # tolerate None endpoints of open intervals.
    interval = extents.get("temporal").get("interval")[0]
    created_time = interval[0].rstrip("Z") if interval[0] else interval[0]
    updated_time = interval[1].rstrip("Z") if interval[1] else interval[1]

    datas = [item for item in stac_collection_dataset.get("links") if item["rel"] == "item"]
    amount_of_training_data = len(datas)
    td_list = []

    task_name = ""
    # Defined up front so the post-loop rename below is safe even when the
    # collection has no items.
    label_classes = []

    for data in datas:
        item_path = data.get("href")
        with open(item_path, 'r') as itemfile:
            stac_item = json.load(itemfile)
        properties = stac_item.get("properties")
        assets = stac_item.get("assets")
        task_type = properties.get("label:tasks")[0]  # first entry of the task list
        label_classes = properties.get("label:classes")  # list of dicts
        item_extent = stac_item.get("bbox")
        label_path = assets["labels"].get("href")
        label_type = assets["labels"].get("type")
        item_id = stac_item.get("id")
        img_path = assets["raster"].get("href")
        data_url = []

        # Only segmentation items are converted; other task types are skipped.
        if task_type == "segmentation":
            task_name = "semantic segmentation"
            data_url.append(img_path)
            labels = [AI_PixelLabel(confidence=1.0, type="AI_PixelLabel",
                                    image_URL=[label_path],
                                    image_format=[label_type])]
            td_list.append(
                AI_EOTrainingData(id=item_id, type="AI_EOTrainingData",
                                  training_type="Train", dataset_id=dataset_id,
                                  number_of_labels=1, labels=labels,
                                  extent=item_extent, data_URL=data_url))

    # STAC names the class field 'classes'; TrainingDML-AI expects 'value'.
    for class_dict in label_classes:
        class_dict['value'] = class_dict.pop('classes')

    tasks = [AI_EOTask(task_type=task_name,
                       id=str(dataset_id) + "_task",
                       dataset_id=str(dataset_id),
                       type='AI_EOTask')]

    # End of timer; report total and per-item conversion time.
    end_time = time.time()
    total_time = end_time - start_time
    print(f"Total time for {amount_of_training_data} training instances: {total_time:.5f} seconds")
    if amount_of_training_data:  # avoid ZeroDivisionError on an empty collection
        average_time = total_time / amount_of_training_data
        # Seconds -> milliseconds is * 1000 (the previous * 60 mislabelled the unit).
        print(f"Average time per training instance: {average_time * 1000:.5f} ms")

    dataset = EOTrainingDataset(
        id=str(dataset_id),
        name=dataset_name,
        type="AI_EOTrainingDataset",
        description=dataset_description,
        tasks=tasks,
        version=dataset_version,
        amount_of_training_data=amount_of_training_data,
        created_time=created_time,
        updated_time=updated_time,
        providers=providers,
        keywords=keywords,
        classes=label_classes,
        number_of_classes=len(label_classes),
        license=license_str,
        data=td_list,
        extent=extent
    )
    # Write the converted dataset to disk.
    write_to_json(dataset, output_json_path)

120 changes: 120 additions & 0 deletions pytdml/io/stac_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
#
# Project: pytdml
# Authors: Boyi Shangguan, Kaixuan Wang, Zhaoyan Wu
# Created: 2022-05-04
# Modified: 2023-10-27
# Email: [email protected]
#
# ------------------------------------------------------------------------------
#
# Copyright (c) 2022 OGC Training Data Markup Language for AI Standard Working Group
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# ------------------------------------------------------------------------------

import json
import re
from datetime import datetime
from geojson import Feature
from pystac import Collection
from pytdml.type import EOTrainingDataset, AI_EOTrainingData, AI_ObjectLabel, AI_EOTask


def convert_stac_to_tdml(stac_dataset_path):
    """Convert a STAC collection JSON file to an ``EOTrainingDataset``.

    Reads a STAC collection document, validates it through pystac, follows
    each ``rel == "item"`` link to a local STAC item file, and maps the
    items onto ``AI_EOTrainingData`` records.

    Parameters
    ----------
    stac_dataset_path : str
        Path to a STAC collection JSON document whose item links reference
        local STAC item files.

    Returns
    -------
    EOTrainingDataset
        The converted TrainingDML-AI dataset (not written to disk).
    """
    # Round-trip the document through pystac so it is validated and hrefs
    # are normalised.
    with open(stac_dataset_path, 'r') as file:
        collection_data = json.load(file)
    collection_object = Collection.from_dict(collection_data)
    stac_collection_dataset = collection_object.to_dict(include_self_link=False, transform_hrefs=True)

    # Required collection attributes mapped onto EOTrainingDataset fields.
    collection_version = stac_collection_dataset.get("stac_version")
    collection_id = stac_collection_dataset.get("id")
    collection_description = stac_collection_dataset.get("description")
    collection_license = stac_collection_dataset.get("license")
    collection_bbox = stac_collection_dataset.get("extent").get("spatial").get("bbox")
    collection_interval = stac_collection_dataset.get("extent").get("temporal").get("interval")

    # Normalise every temporal endpoint to 'YYYY-MM-DDTHH:MM:SS'.
    data_time = []
    for interval in collection_interval:
        for timestamp in interval:
            if timestamp is None:
                # Open-ended interval endpoint; nothing to record.
                continue
            # Strip a UTC suffix ('+00:00' or 'Z').  The previous pattern
            # r"(\\+00:00|Z)$" matched a literal backslash before '+00:00',
            # so that branch never fired.
            cleaned_date_time_str = re.sub(r"(\+00:00|Z)$", "", timestamp)
            # fromisoformat accepts timestamps with or without fractional
            # seconds, unlike a single strptime format string.
            date_time_obj = datetime.fromisoformat(cleaned_date_time_str)
            data_time.append(date_time_obj.strftime("%Y-%m-%dT%H:%M:%S"))

    # A single bbox is used as-is; multiple bboxes are flattened into one list.
    if len(collection_bbox) == 1:
        collection_extent = collection_bbox[0]
    else:
        collection_extent = [item for bbox in collection_bbox for item in bbox]

    # Each 'item' link points at a local STAC item file holding one sample.
    collection_links = stac_collection_dataset.get("links")
    collection_filtered_links = [link for link in collection_links if link.get("rel") == "item"]

    datalist = []
    for link in collection_filtered_links:
        item_path = link.get("href")
        with open(item_path, 'r') as item_file:
            stac_item = json.load(item_file)
        link_id = stac_item.get("id")
        link_rel = link.get("rel")
        # Wrap the whole item as a GeoJSON feature for the label geometry.
        feature = Feature(**stac_item)
        link_href = [asset['href'] for asset in stac_item.get("assets").values()]

        label = AI_ObjectLabel(
            type="AI_ObjectLabel",
            object=feature,
            label_class=link_rel
        )

        data = AI_EOTrainingData(
            type="AI_EOTrainingData",
            id=link_id,
            labels=[label],
            data_URL=link_href,
            data_time=data_time
        )
        datalist.append(data)

    # Optional collection attributes.
    collection_name = stac_collection_dataset.get("title")

    tasks = [AI_EOTask(task_type="STAC",
                       id=str(collection_id) + "_task",
                       dataset_id=str(collection_id),
                       type='AI_EOTask')]

    dataset = EOTrainingDataset(
        # required attributes
        id=str(collection_id),
        name=collection_name,
        description=collection_description,
        license=collection_license,
        tasks=tasks,
        data=datalist,
        type="AI_EOTrainingDataset",
        # optional attributes
        version=collection_version,
        extent=collection_extent
    )

    return dataset
2 changes: 1 addition & 1 deletion pytdml/type/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
from .basic_types import TrainingDataset
from .extended_types import AI_PixelLabel
from .extended_types import AI_ObjectLabel
from .extended_types import AI_ObjectLabel
from .extended_types import AI_SceneLabel
from .extended_types import AI_EOTask
from .extended_types import AI_EOTrainingData
from .extended_types import EOTrainingDataset
6 changes: 3 additions & 3 deletions pytdml/type/basic_types.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
#
# Project: pytdml
# Authors: Boyi Shangguan, Kaixuan Wang, Zhaoyan Wu
Expand Down Expand Up @@ -1289,8 +1290,8 @@ class AI_Label(BaseCamelModel):

type: Literal["AI_AbstractLabel"]

is_negative: Optional[bool] = None
confidence: Optional[float] = Field(None, ge=0.0, le=1.0)
is_negative: Optional[bool] = Field(False)
confidence: Optional[float] = Field(1.0, ge=0.0, le=1.0)

def to_dict(self):
return self.model_dump(by_alias=True, exclude_none=True)
Expand Down Expand Up @@ -1329,7 +1330,6 @@ def to_dict(self):
def from_dict(json_dict):
from pytdml.type.extended_types import AI_PixelLabel, AI_ObjectLabel, AI_SceneLabel
new_dict = copy.deepcopy(json_dict)
new_dict = copy.deepcopy(json_dict)
if new_dict.__contains__('labels'):
labels = new_dict['labels']
for i in range(len(labels)):
Expand Down
4 changes: 1 addition & 3 deletions pytdml/yaml_to_tdml.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,5 @@ def main():
if training_datasets:
write_to_json(training_datasets, json_path)


if __name__ == '__main__':
result = yaml_to_eo_tdml("D:\\Project\\pyTDML3\\pytdml\\pytdml\\type\\UiT_HCD_California_2017.yml")
print(result.to_dict())
main()
Loading

0 comments on commit f01f98c

Please sign in to comment.