Switch to row instead of trajectory (#376)
* Use `rows` to name the legacy `traj` dimension: `traj` is mostly relevant to oceanographic datasets, while `rows` is more general
* Remove `coord_dim_map` and list coordinate names separately from the dimension aliases
* Map each dimension alias to the dimension name required by the library (see the sketch below)

---------

Co-authored-by: Philippe Miron <[email protected]>
Co-authored-by: Shane Elipot <[email protected]>
Co-authored-by: Kevin Santana <[email protected]>
4 people authored Mar 12, 2024
1 parent a370d68 commit 749520b
Showing 18 changed files with 348 additions and 215 deletions.
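In practice, this commit replaces the single `coord_dim_map` argument of `RaggedArray.from_files` with separate `name_coords` and `name_dims` arguments. A before/after sketch drawn from the `to_raggedarray` diffs below (`ids`, `preprocess`, `GDP_DATA`, and `tmp_path` are assumed to be defined as in the adapters):

from clouddrift.adapters import gdp
from clouddrift.raggedarray import RaggedArray

# Before: each coordinate carried its dimension with it, e.g.
#   GDP_COORDS: list[tuple[str, DimNames]] = [("id", "traj"), ("time", "obs")]
#   ra = RaggedArray.from_files(..., coord_dim_map=gdp.GDP_COORDS, ...)

# After: coordinate names and dimension aliases are passed separately.
ra = RaggedArray.from_files(
    indices=ids,                 # drifter IDs, assumed defined by the caller
    preprocess_func=preprocess,  # per-file preprocessing, assumed defined
    name_coords=gdp.GDP_COORDS,  # now plain names: ["id", "time"]
    name_meta=gdp.GDP_METADATA,
    name_data=GDP_DATA,          # per-adapter list, assumed defined
    name_dims=gdp.GDP_DIMS,      # {"traj": "rows", "obs": "obs"}
    rowsize_func=gdp.rowsize,
    filename_pattern="drifter_6h_{id}.nc",
    tmp_path=tmp_path,           # assumed defined
)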
8 changes: 4 additions & 4 deletions clouddrift/adapters/andro.py
@@ -1,6 +1,6 @@
 """
-This module defines functions used to adapt the ANDRO: An Argo-based
-deep displacement dataset as a ragged-arrays dataset.
+This module defines functions used to adapt the ANDRO: An Argo-based
+deep displacement dataset as a ragged-arrays dataset.
 
 The dataset is hosted at https://www.seanoe.org/data/00360/47077/ and the user manual
 is available at https://archimer.ifremer.fr/doc/00360/47126/.
@@ -12,8 +12,8 @@
 
 Reference
 ---------
-Ollitrault Michel, Rannou Philippe, Brion Emilie, Cabanes Cecile, Piron Anne, Reverdin Gilles,
-Kolodziejczyk Nicolas (2022). ANDRO: An Argo-based deep displacement dataset.
+Ollitrault Michel, Rannou Philippe, Brion Emilie, Cabanes Cecile, Piron Anne, Reverdin Gilles,
+Kolodziejczyk Nicolas (2022). ANDRO: An Argo-based deep displacement dataset.
 SEANOE. https://doi.org/10.17882/47077
 """
 
8 changes: 5 additions & 3 deletions clouddrift/adapters/gdp.py
@@ -14,9 +14,11 @@
 from clouddrift.adapters.utils import download_with_progress
 from clouddrift.raggedarray import DimNames
 
-GDP_COORDS: list[tuple[str, DimNames]] = [
-    ("id", "traj"),
-    ("time", "obs"),
+GDP_DIMS: dict[str, DimNames] = {"traj": "rows", "obs": "obs"}
+
+GDP_COORDS = [
+    "id",
+    "time",
 ]
 
 GDP_METADATA = [
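What the new `GDP_DIMS` mapping expresses, as a minimal sketch: keys are dimension names as found in the upstream files, values are the dimension names the library requires. The `apply_dim_aliases` helper below is illustrative only, not the library's actual mechanism:

import xarray as xr

GDP_DIMS = {"traj": "rows", "obs": "obs"}

def apply_dim_aliases(ds: xr.Dataset, name_dims: dict[str, str]) -> xr.Dataset:
    # Rename only the dimensions whose alias differs from the upstream name.
    return ds.rename_dims(
        {old: new for old, new in name_dims.items() if old != new and old in ds.dims}
    )

ds = xr.Dataset({"id": (("traj",), [44566])})
print(apply_dim_aliases(ds, GDP_DIMS).sizes)  # shows 'rows' instead of 'traj'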
69 changes: 44 additions & 25 deletions clouddrift/adapters/gdp1h.py
@@ -23,9 +23,9 @@
 GDP_VERSION = "2.01"
 
 
-GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/buoydata/hourly_product/v2.01/"
+GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/buoydata/hourly_product/v2.01"
 GDP_DATA_URL_EXPERIMENTAL = (
-    "https://www.aoml.noaa.gov/ftp/pub/phod/lumpkin/hourly/experimental/"
+    "https://www.aoml.noaa.gov/ftp/pub/phod/lumpkin/hourly/experimental"
 )
 
 
@@ -113,7 +113,7 @@ def download(
     gdp_metadata = gdp.get_gdp_metadata()
 
     return gdp.order_by_date(
-        gdp_metadata, [int(f.split("_")[-1][:-3]) for f in filelist]
+        gdp_metadata, [int(f.split("_")[-1].removesuffix(".nc")) for f in filelist]
     )
 
@@ -215,35 +215,47 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
         [False if ds.get("location_type") == "Argos" else True],
     )  # 0 for Argos, 1 for GPS
     ds["DeployingShip"] = (("traj"), gdp.cut_str(ds.DeployingShip, 20))
-    ds["DeploymentStatus"] = (("traj"), gdp.cut_str(ds.DeploymentStatus, 20))
-    ds["BuoyTypeManufacturer"] = (("traj"), gdp.cut_str(ds.BuoyTypeManufacturer, 20))
-    ds["BuoyTypeSensorArray"] = (("traj"), gdp.cut_str(ds.BuoyTypeSensorArray, 20))
+    ds["DeploymentStatus"] = (
+        ("traj"),
+        gdp.cut_str(ds.DeploymentStatus, 20),
+    )
+    ds["BuoyTypeManufacturer"] = (
+        ("traj"),
+        gdp.cut_str(ds.BuoyTypeManufacturer, 20),
+    )
+    ds["BuoyTypeSensorArray"] = (
+        ("traj"),
+        gdp.cut_str(ds.BuoyTypeSensorArray, 20),
+    )
     ds["CurrentProgram"] = (
         ("traj"),
         np.array([gdp.str_to_float(ds.CurrentProgram, -1)], dtype=np.int32),
     )
-    ds["PurchaserFunding"] = (("traj"), gdp.cut_str(ds.PurchaserFunding, 20))
+    ds["PurchaserFunding"] = (
+        ("traj"),
+        gdp.cut_str(ds.PurchaserFunding, 20),
+    )
     ds["SensorUpgrade"] = (("traj"), gdp.cut_str(ds.SensorUpgrade, 20))
     ds["Transmissions"] = (("traj"), gdp.cut_str(ds.Transmissions, 20))
-    ds["DeployingCountry"] = (("traj"), gdp.cut_str(ds.DeployingCountry, 20))
+    ds["DeployingCountry"] = (
+        ("traj"),
+        gdp.cut_str(ds.DeployingCountry, 20),
+    )
     ds["DeploymentComments"] = (
         ("traj"),
         gdp.cut_str(
             ds.DeploymentComments.encode("ascii", "ignore").decode("ascii"), 20
         ),
     )  # remove non ascii char
     ds["ManufactureYear"] = (
         ("traj"),
         np.array([gdp.str_to_float(ds.ManufactureYear, -1)], dtype=np.int16),
     )
     ds["ManufactureMonth"] = (
         ("traj"),
         np.array([gdp.str_to_float(ds.ManufactureMonth, -1)], dtype=np.int16),
     )
-    ds["ManufactureSensorType"] = (("traj"), gdp.cut_str(ds.ManufactureSensorType, 20))
+    ds["ManufactureSensorType"] = (
+        ("traj"),
+        gdp.cut_str(ds.ManufactureSensorType, 20),
+    )
     ds["ManufactureVoltage"] = (
         ("traj"),
-        np.array([gdp.str_to_float(ds.ManufactureVoltage[:-6], -1)], dtype=np.int16),
+        np.array([gdp.str_to_float(ds.ManufactureVoltage[:-2], -1)], dtype=np.int16),
     )  # e.g. 56 V
     ds["FloatDiameter"] = (
         ("traj"),
@@ -270,12 +282,18 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
         ("traj"),
         [gdp.str_to_float(ds.DragAreaOfDrogue[:-4])],
     )  # e.g. 416.6 m^2
-    ds["DragAreaRatio"] = (("traj"), [gdp.str_to_float(ds.DragAreaRatio)])  # e.g. 39.08
+    ds["DragAreaRatio"] = (
+        ("traj"),
+        [gdp.str_to_float(ds.DragAreaRatio)],
+    )  # e.g. 39.08
     ds["DrogueCenterDepth"] = (
         ("traj"),
         [gdp.str_to_float(ds.DrogueCenterDepth[:-2])],
     )  # e.g. 20.0 m
-    ds["DrogueDetectSensor"] = (("traj"), gdp.cut_str(ds.DrogueDetectSensor, 20))
+    ds["DrogueDetectSensor"] = (
+        ("traj"),
+        gdp.cut_str(ds.DrogueDetectSensor, 20),
+    )
 
     # vars attributes
     vars_attrs = {
@@ -581,21 +599,22 @@ def to_raggedarray(
     ra = RaggedArray.from_files(
         indices=ids,
         preprocess_func=preprocess,
-        coord_dim_map=gdp.GDP_COORDS,
+        name_coords=gdp.GDP_COORDS,
         name_meta=gdp.GDP_METADATA,
         name_data=GDP_DATA,
+        name_dims=gdp.GDP_DIMS,
         rowsize_func=gdp.rowsize,
         filename_pattern=filename_pattern,
         tmp_path=tmp_path,
     )
 
     # set dynamic global attributes
     if ra.attrs_global:
-        ra.attrs_global[
-            "time_coverage_start"
-        ] = f"{datetime(1970,1,1) + timedelta(seconds=int(np.min(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
-        ra.attrs_global[
-            "time_coverage_end"
-        ] = f"{datetime(1970,1,1) + timedelta(seconds=int(np.max(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
+        ra.attrs_global["time_coverage_start"] = (
+            f"{datetime(1970,1,1) + timedelta(seconds=int(np.min(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
+        )
+        ra.attrs_global["time_coverage_end"] = (
+            f"{datetime(1970,1,1) + timedelta(seconds=int(np.max(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
+        )
 
     return ra
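The reformatting of the two assignments above does not change behavior: the minimum and maximum of the `time` coordinate (seconds since 1970-01-01) are rendered into the `time_coverage_*` attributes. A standalone sketch of that conversion, with illustrative values:

from datetime import datetime, timedelta

import numpy as np

time = np.array([0, 86_400])  # epoch seconds, illustrative values only
start = f"{datetime(1970, 1, 1) + timedelta(seconds=int(np.min(time))):%Y-%m-%d:%H:%M:%SZ}"
end = f"{datetime(1970, 1, 1) + timedelta(seconds=int(np.max(time))):%Y-%m-%d:%H:%M:%SZ}"
print(start, end)  # 1970-01-01:00:00:00Z 1970-01-02:00:00:00Z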
66 changes: 47 additions & 19 deletions clouddrift/adapters/gdp6h.py
@@ -21,7 +21,7 @@
 
 GDP_VERSION = "September 2023"
 
-GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/buoydata/6h/"
+GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/buoydata/6h"
 GDP_TMP_PATH = os.path.join(tempfile.gettempdir(), "clouddrift", "gdp6h")
 GDP_DATA = [
     "lon",
@@ -82,7 +82,7 @@ def download(
             string = urlpath.read().decode("utf-8")
             filelist = list(set(re.compile(pattern).findall(string)))
             for f in filelist:
-                did = int(f[:-3].split("_")[2])
+                did = int(f.split("_")[2].removesuffix(".nc"))
                 if (drifter_ids is None or did in drifter_ids) and did not in added:
                     drifter_urls.append(f"{url}/{dir}/{f}")
                     added.add(did)
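Here, as in gdp1h.py above and yomaha.py below, a fixed slice is replaced with `str.removesuffix` (Python 3.9+): `[:-3]` assumes the `.nc` suffix is present and silently corrupts the name otherwise, while `removesuffix` states its intent and is a no-op when the suffix is absent. A small illustration with a hypothetical filename:

f = "drifter_6h_44566.nc"  # hypothetical filename, for illustration only
int(f.split("_")[2][:-3])                  # 44566, but only if ".nc" is there
int(f.split("_")[2].removesuffix(".nc"))   # 44566, and safe either way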
@@ -187,7 +187,10 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
             warnings.warn(f"Variable {var} not found in upstream data; skipping.")
 
     # new variables
-    ds["ids"] = (["traj", "obs"], [np.repeat(ds.ID.values, ds.sizes["obs"])])
+    ds["ids"] = (
+        ["traj", "obs"],
+        [np.repeat(ds.ID.values, ds.sizes["obs"])],
+    )
     ds["drogue_status"] = (
         ["traj", "obs"],
         [gdp.drogue_presence(ds.drogue_lost_date.data, ds.time.data[0])],
@@ -199,17 +202,32 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
         [False if ds.get("location_type") == "Argos" else True],
     )  # 0 for Argos, 1 for GPS
     ds["DeployingShip"] = (("traj"), gdp.cut_str(ds.DeployingShip, 20))
-    ds["DeploymentStatus"] = (("traj"), gdp.cut_str(ds.DeploymentStatus, 20))
-    ds["BuoyTypeManufacturer"] = (("traj"), gdp.cut_str(ds.BuoyTypeManufacturer, 20))
-    ds["BuoyTypeSensorArray"] = (("traj"), gdp.cut_str(ds.BuoyTypeSensorArray, 20))
+    ds["DeploymentStatus"] = (
+        ("traj"),
+        gdp.cut_str(ds.DeploymentStatus, 20),
+    )
+    ds["BuoyTypeManufacturer"] = (
+        ("traj"),
+        gdp.cut_str(ds.BuoyTypeManufacturer, 20),
+    )
+    ds["BuoyTypeSensorArray"] = (
+        ("traj"),
+        gdp.cut_str(ds.BuoyTypeSensorArray, 20),
+    )
     ds["CurrentProgram"] = (
         ("traj"),
         [np.int32(gdp.str_to_float(ds.CurrentProgram, -1))],
     )
-    ds["PurchaserFunding"] = (("traj"), gdp.cut_str(ds.PurchaserFunding, 20))
+    ds["PurchaserFunding"] = (
+        ("traj"),
+        gdp.cut_str(ds.PurchaserFunding, 20),
+    )
     ds["SensorUpgrade"] = (("traj"), gdp.cut_str(ds.SensorUpgrade, 20))
     ds["Transmissions"] = (("traj"), gdp.cut_str(ds.Transmissions, 20))
-    ds["DeployingCountry"] = (("traj"), gdp.cut_str(ds.DeployingCountry, 20))
+    ds["DeployingCountry"] = (
+        ("traj"),
+        gdp.cut_str(ds.DeployingCountry, 20),
+    )
     ds["DeploymentComments"] = (
         ("traj"),
         gdp.cut_str(
@@ -224,10 +242,13 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
         ("traj"),
         [np.int16(gdp.str_to_float(ds.ManufactureMonth, -1))],
     )
-    ds["ManufactureSensorType"] = (("traj"), gdp.cut_str(ds.ManufactureSensorType, 20))
+    ds["ManufactureSensorType"] = (
+        ("traj"),
+        gdp.cut_str(ds.ManufactureSensorType, 20),
+    )
     ds["ManufactureVoltage"] = (
         ("traj"),
-        [np.int16(gdp.str_to_float(ds.ManufactureVoltage[:-6], -1))],
+        [np.int16(gdp.str_to_float(ds.ManufactureVoltage[:-2], -1))],
     )  # e.g. 56 V
     ds["FloatDiameter"] = (
         ("traj"),
@@ -254,12 +275,18 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
         ("traj"),
         [gdp.str_to_float(ds.DragAreaOfDrogue[:-4])],
     )  # e.g. 416.6 m^2
-    ds["DragAreaRatio"] = (("traj"), [gdp.str_to_float(ds.DragAreaRatio)])  # e.g. 39.08
+    ds["DragAreaRatio"] = (
+        ("traj"),
+        [gdp.str_to_float(ds.DragAreaRatio)],
+    )  # e.g. 39.08
     ds["DrogueCenterDepth"] = (
         ("traj"),
         [gdp.str_to_float(ds.DrogueCenterDepth[:-2])],
     )  # e.g. 20.0 m
-    ds["DrogueDetectSensor"] = (("traj"), gdp.cut_str(ds.DrogueDetectSensor, 20))
+    ds["DrogueDetectSensor"] = (
+        ("traj"),
+        gdp.cut_str(ds.DrogueDetectSensor, 20),
+    )
 
     # vars attributes
     vars_attrs = {
@@ -481,20 +508,21 @@ def to_raggedarray(
     ra = RaggedArray.from_files(
         indices=ids,
         preprocess_func=preprocess,
-        coord_dim_map=gdp.GDP_COORDS,
+        name_coords=gdp.GDP_COORDS,
         name_meta=gdp.GDP_METADATA,
         name_data=GDP_DATA,
+        name_dims=gdp.GDP_DIMS,
         rowsize_func=gdp.rowsize,
         filename_pattern="drifter_6h_{id}.nc",
         tmp_path=tmp_path,
     )
 
     # update dynamic global attributes
-    ra.attrs_global[
-        "time_coverage_start"
-    ] = f"{datetime.datetime(1970,1,1) + datetime.timedelta(seconds=int(np.min(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
-    ra.attrs_global[
-        "time_coverage_end"
-    ] = f"{datetime.datetime(1970,1,1) + datetime.timedelta(seconds=int(np.max(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
+    ra.attrs_global["time_coverage_start"] = (
+        f"{datetime.datetime(1970,1,1) + datetime.timedelta(seconds=int(np.min(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
+    )
+    ra.attrs_global["time_coverage_end"] = (
+        f"{datetime.datetime(1970,1,1) + datetime.timedelta(seconds=int(np.max(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
+    )
 
     return ra
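One small behavioral fix above, present in both GDP adapters, is the switch from `ManufactureVoltage[:-6]` to `ManufactureVoltage[:-2]`: with a value shaped like the inline comment's `56 V`, only the last two characters (the unit) should be stripped. A sketch, assuming that value shape:

v = "56 V"   # value shape taken from the inline comment "# e.g. 56 V"
v[:-2]       # '56' -> gdp.str_to_float parses it as 56
v[:-6]       # ''   -> unparseable, so the -1 default would be used instead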
1 change: 1 addition & 0 deletions clouddrift/adapters/glad.py
@@ -13,6 +13,7 @@
 ---------
 Özgökmen, Tamay. 2013. GLAD experiment CODE-style drifter trajectories (low-pass filtered, 15 minute interval records), northern Gulf of Mexico near DeSoto Canyon, July-October 2012. Distributed by: Gulf of Mexico Research Initiative Information and Data Cooperative (GRIIDC), Harte Research Institute, Texas A&M University–Corpus Christi. doi:10.7266/N7VD6WC8
 """
+
 from io import BytesIO
 
 import numpy as np
1 change: 1 addition & 0 deletions clouddrift/adapters/mosaic.py
@@ -18,6 +18,7 @@
 >>> from clouddrift.adapters import mosaic
 >>> ds = mosaic.to_xarray()
 """
+
 import xml.etree.ElementTree as ET
 from datetime import datetime
 from io import BytesIO
4 changes: 2 additions & 2 deletions clouddrift/adapters/subsurface_floats.py
@@ -1,6 +1,6 @@
 """
-This module defines functions to adapt as a ragged-array dataset a collection of data
-from 2193 trajectories of SOFAR, APEX, and RAFOS subsurface floats from 52 experiments
+This module defines functions to adapt as a ragged-array dataset a collection of data
+from 2193 trajectories of SOFAR, APEX, and RAFOS subsurface floats from 52 experiments
 across the world between 1989 and 2015.
 
 The dataset is hosted at https://www.aoml.noaa.gov/phod/float_traj/index.php
6 changes: 3 additions & 3 deletions clouddrift/adapters/utils.py
@@ -45,9 +45,9 @@ def download_with_progress(
         retry_protocol = custom_retry_protocol  # type: ignore
 
     executor = concurrent.futures.ThreadPoolExecutor()
-    futures: dict[
-        concurrent.futures.Future, Tuple[str, Union[BufferedIOBase, str]]
-    ] = dict()
+    futures: dict[concurrent.futures.Future, Tuple[str, Union[BufferedIOBase, str]]] = (
+        dict()
+    )
     bar = None
 
     for src, dst, exp_size in download_map:
11 changes: 6 additions & 5 deletions clouddrift/adapters/yomaha.py
@@ -1,7 +1,7 @@
 """
-This module defines functions used to adapt the YoMaHa'07: Velocity data assessed
-from trajectories of Argo floats at parking level and at the sea surface as
-a ragged-arrays dataset.
+This module defines functions used to adapt the YoMaHa'07: Velocity data assessed
+from trajectories of Argo floats at parking level and at the sea surface as
+a ragged-arrays dataset.
 
 The dataset is hosted at http://apdrc.soest.hawaii.edu/projects/yomaha/ and the user manual
 is available at http://apdrc.soest.hawaii.edu/projects/yomaha/yomaha07/YoMaHa070612.pdf.
@@ -52,7 +52,7 @@ def download(tmp_path: str):
     download_with_progress(download_requests)
 
     filename_gz = f"{tmp_path}/{YOMAHA_URLS[-1].split('/')[-1]}"
-    filename = filename_gz[:-3]
+    filename = filename_gz.removesuffix(".gz")
 
     buffer = BytesIO()
     download_with_progress([(YOMAHA_URLS[-1], buffer, None)])
@@ -153,7 +153,8 @@ def to_xarray(tmp_path: Union[str, None] = None):
     )
 
     # open with pandas
-    filename = f"{tmp_path}/{YOMAHA_URLS[-1].split('/')[-1][:-3]}"
+    filename_gz = f"{tmp_path}/{YOMAHA_URLS[-1].split('/')[-1]}"
+    filename = filename_gz.removesuffix(".gz")
     df = pd.read_csv(
         filename, names=col_names, sep=r"\s+", header=None, na_values=na_col
     )
5 changes: 3 additions & 2 deletions clouddrift/datasets.py
@@ -1,9 +1,10 @@
 """
-This module provides functions to easily access ragged array datasets. If the datasets are
+This module provides functions to easily access ragged array datasets. If the datasets are
 not accessed via cloud storage platforms or are not found on the local filesystem,
-they will be downloaded from their upstream repositories and stored for later access
+they will be downloaded from their upstream repositories and stored for later access
 (~/.clouddrift for UNIX-based systems).
 """
+
 import os
 import platform
 from io import BytesIO
1 change: 1 addition & 0 deletions clouddrift/pairs.py
@@ -1,6 +1,7 @@
 """
 Functions to analyze pairs of contiguous data segments.
 """
+
 import itertools
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import List, Optional, Tuple, Union