Switch to row instead of trajectory (#376)
* Use `rows` to name the legacy `traj` dimension: `traj` is mostly relevant to oceanographic datasets, while `rows` is more general
* Remove `coord_dim_map` and list coordinate names separately from the dimension aliases
* Map each dimension alias to the dimension name required by the library (see the sketch below)

---------

Co-authored-by: Philippe Miron <[email protected]>
Co-authored-by: Shane Elipot <[email protected]>
Co-authored-by: Kevin Santana <[email protected]>
4 people authored Mar 12, 2024
1 parent a370d68 commit 749520b
Showing 18 changed files with 348 additions and 215 deletions.
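In practice, this commit replaces the single `coord_dim_map` argument of `RaggedArray.from_files` with separate `name_coords` and `name_dims` arguments. A before/after sketch drawn from the `to_raggedarray` diffs below (`ids`, `preprocess`, `GDP_DATA`, and `tmp_path` are assumed to be defined as in the adapters):

from clouddrift.adapters import gdp
from clouddrift.raggedarray import RaggedArray

# Before: each coordinate carried its dimension with it, e.g.
#   GDP_COORDS: list[tuple[str, DimNames]] = [("id", "traj"), ("time", "obs")]
#   ra = RaggedArray.from_files(..., coord_dim_map=gdp.GDP_COORDS, ...)

# After: coordinate names and dimension aliases are passed separately.
ra = RaggedArray.from_files(
    indices=ids,                 # drifter IDs, assumed defined by the caller
    preprocess_func=preprocess,  # per-file preprocessing, assumed defined
    name_coords=gdp.GDP_COORDS,  # now plain names: ["id", "time"]
    name_meta=gdp.GDP_METADATA,
    name_data=GDP_DATA,          # per-adapter list, assumed defined
    name_dims=gdp.GDP_DIMS,      # {"traj": "rows", "obs": "obs"}
    rowsize_func=gdp.rowsize,
    filename_pattern="drifter_6h_{id}.nc",
    tmp_path=tmp_path,           # assumed defined
)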
8 changes: 4 additions & 4 deletions clouddrift/adapters/andro.py
@@ -1,6 +1,6 @@
 """
-This module defines functions used to adapt the ANDRO: An Argo-based
-deep displacement dataset as a ragged-arrays dataset.
+This module defines functions used to adapt the ANDRO: An Argo-based
+deep displacement dataset as a ragged-arrays dataset.
 
 The dataset is hosted at https://www.seanoe.org/data/00360/47077/ and the user manual
 is available at https://archimer.ifremer.fr/doc/00360/47126/.
@@ -12,8 +12,8 @@
 
 Reference
 ---------
-Ollitrault Michel, Rannou Philippe, Brion Emilie, Cabanes Cecile, Piron Anne, Reverdin Gilles,
-Kolodziejczyk Nicolas (2022). ANDRO: An Argo-based deep displacement dataset.
+Ollitrault Michel, Rannou Philippe, Brion Emilie, Cabanes Cecile, Piron Anne, Reverdin Gilles,
+Kolodziejczyk Nicolas (2022). ANDRO: An Argo-based deep displacement dataset.
 SEANOE. https://doi.org/10.17882/47077
 """
 
8 changes: 5 additions & 3 deletions clouddrift/adapters/gdp.py
@@ -14,9 +14,11 @@
 from clouddrift.adapters.utils import download_with_progress
 from clouddrift.raggedarray import DimNames
 
-GDP_COORDS: list[tuple[str, DimNames]] = [
-    ("id", "traj"),
-    ("time", "obs"),
+GDP_DIMS: dict[str, DimNames] = {"traj": "rows", "obs": "obs"}
+
+GDP_COORDS = [
+    "id",
+    "time",
 ]
 
 GDP_METADATA = [
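What the new `GDP_DIMS` mapping expresses, as a minimal sketch: keys are dimension names as found in the upstream files, values are the dimension names the library requires. The `apply_dim_aliases` helper below is illustrative only, not the library's actual mechanism:

import xarray as xr

GDP_DIMS = {"traj": "rows", "obs": "obs"}

def apply_dim_aliases(ds: xr.Dataset, name_dims: dict[str, str]) -> xr.Dataset:
    # Rename only the dimensions whose alias differs from the upstream name.
    return ds.rename_dims(
        {old: new for old, new in name_dims.items() if old != new and old in ds.dims}
    )

ds = xr.Dataset({"id": (("traj",), [44566])})
print(apply_dim_aliases(ds, GDP_DIMS).sizes)  # shows 'rows' instead of 'traj'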
69 changes: 44 additions & 25 deletions clouddrift/adapters/gdp1h.py
@@ -23,9 +23,9 @@
 GDP_VERSION = "2.01"
 
 
-GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/buoydata/hourly_product/v2.01/"
+GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/buoydata/hourly_product/v2.01"
 GDP_DATA_URL_EXPERIMENTAL = (
-    "https://www.aoml.noaa.gov/ftp/pub/phod/lumpkin/hourly/experimental/"
+    "https://www.aoml.noaa.gov/ftp/pub/phod/lumpkin/hourly/experimental"
 )
 
 
@@ -113,7 +113,7 @@ def download(
     gdp_metadata = gdp.get_gdp_metadata()
 
     return gdp.order_by_date(
-        gdp_metadata, [int(f.split("_")[-1][:-3]) for f in filelist]
+        gdp_metadata, [int(f.split("_")[-1].removesuffix(".nc")) for f in filelist]
     )
 
@@ -215,35 +215,47 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
         [False if ds.get("location_type") == "Argos" else True],
     )  # 0 for Argos, 1 for GPS
     ds["DeployingShip"] = (("traj"), gdp.cut_str(ds.DeployingShip, 20))
-    ds["DeploymentStatus"] = (("traj"), gdp.cut_str(ds.DeploymentStatus, 20))
-    ds["BuoyTypeManufacturer"] = (("traj"), gdp.cut_str(ds.BuoyTypeManufacturer, 20))
-    ds["BuoyTypeSensorArray"] = (("traj"), gdp.cut_str(ds.BuoyTypeSensorArray, 20))
+    ds["DeploymentStatus"] = (
+        ("traj"),
+        gdp.cut_str(ds.DeploymentStatus, 20),
+    )
+    ds["BuoyTypeManufacturer"] = (
+        ("traj"),
+        gdp.cut_str(ds.BuoyTypeManufacturer, 20),
+    )
+    ds["BuoyTypeSensorArray"] = (
+        ("traj"),
+        gdp.cut_str(ds.BuoyTypeSensorArray, 20),
+    )
     ds["CurrentProgram"] = (
         ("traj"),
         np.array([gdp.str_to_float(ds.CurrentProgram, -1)], dtype=np.int32),
     )
-    ds["PurchaserFunding"] = (("traj"), gdp.cut_str(ds.PurchaserFunding, 20))
+    ds["PurchaserFunding"] = (
+        ("traj"),
+        gdp.cut_str(ds.PurchaserFunding, 20),
+    )
     ds["SensorUpgrade"] = (("traj"), gdp.cut_str(ds.SensorUpgrade, 20))
     ds["Transmissions"] = (("traj"), gdp.cut_str(ds.Transmissions, 20))
-    ds["DeployingCountry"] = (("traj"), gdp.cut_str(ds.DeployingCountry, 20))
+    ds["DeployingCountry"] = (
+        ("traj"),
+        gdp.cut_str(ds.DeployingCountry, 20),
+    )
     ds["DeploymentComments"] = (
         ("traj"),
         gdp.cut_str(
             ds.DeploymentComments.encode("ascii", "ignore").decode("ascii"), 20
         ),
     )  # remove non ascii char
     ds["ManufactureYear"] = (
         ("traj"),
         np.array([gdp.str_to_float(ds.ManufactureYear, -1)], dtype=np.int16),
     )
     ds["ManufactureMonth"] = (
         ("traj"),
         np.array([gdp.str_to_float(ds.ManufactureMonth, -1)], dtype=np.int16),
     )
-    ds["ManufactureSensorType"] = (("traj"), gdp.cut_str(ds.ManufactureSensorType, 20))
+    ds["ManufactureSensorType"] = (
+        ("traj"),
+        gdp.cut_str(ds.ManufactureSensorType, 20),
+    )
     ds["ManufactureVoltage"] = (
         ("traj"),
-        np.array([gdp.str_to_float(ds.ManufactureVoltage[:-6], -1)], dtype=np.int16),
+        np.array([gdp.str_to_float(ds.ManufactureVoltage[:-2], -1)], dtype=np.int16),
     )  # e.g. 56 V
     ds["FloatDiameter"] = (
         ("traj"),
@@ -270,12 +282,18 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
         ("traj"),
         [gdp.str_to_float(ds.DragAreaOfDrogue[:-4])],
     )  # e.g. 416.6 m^2
-    ds["DragAreaRatio"] = (("traj"), [gdp.str_to_float(ds.DragAreaRatio)])  # e.g. 39.08
+    ds["DragAreaRatio"] = (
+        ("traj"),
+        [gdp.str_to_float(ds.DragAreaRatio)],
+    )  # e.g. 39.08
     ds["DrogueCenterDepth"] = (
         ("traj"),
         [gdp.str_to_float(ds.DrogueCenterDepth[:-2])],
     )  # e.g. 20.0 m
-    ds["DrogueDetectSensor"] = (("traj"), gdp.cut_str(ds.DrogueDetectSensor, 20))
+    ds["DrogueDetectSensor"] = (
+        ("traj"),
+        gdp.cut_str(ds.DrogueDetectSensor, 20),
+    )
 
     # vars attributes
     vars_attrs = {
@@ -581,21 +599,22 @@ def to_raggedarray(
     ra = RaggedArray.from_files(
         indices=ids,
         preprocess_func=preprocess,
-        coord_dim_map=gdp.GDP_COORDS,
+        name_coords=gdp.GDP_COORDS,
         name_meta=gdp.GDP_METADATA,
         name_data=GDP_DATA,
+        name_dims=gdp.GDP_DIMS,
         rowsize_func=gdp.rowsize,
         filename_pattern=filename_pattern,
         tmp_path=tmp_path,
     )
 
     # set dynamic global attributes
     if ra.attrs_global:
-        ra.attrs_global[
-            "time_coverage_start"
-        ] = f"{datetime(1970,1,1) + timedelta(seconds=int(np.min(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
-        ra.attrs_global[
-            "time_coverage_end"
-        ] = f"{datetime(1970,1,1) + timedelta(seconds=int(np.max(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
+        ra.attrs_global["time_coverage_start"] = (
+            f"{datetime(1970,1,1) + timedelta(seconds=int(np.min(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
+        )
+        ra.attrs_global["time_coverage_end"] = (
+            f"{datetime(1970,1,1) + timedelta(seconds=int(np.max(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
+        )
 
     return ra
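The reformatting of the two assignments above does not change behavior: the minimum and maximum of the `time` coordinate (seconds since 1970-01-01) are rendered into the `time_coverage_*` attributes. A standalone sketch of that conversion, with illustrative values:

from datetime import datetime, timedelta

import numpy as np

time = np.array([0, 86_400])  # epoch seconds, illustrative values only
start = f"{datetime(1970, 1, 1) + timedelta(seconds=int(np.min(time))):%Y-%m-%d:%H:%M:%SZ}"
end = f"{datetime(1970, 1, 1) + timedelta(seconds=int(np.max(time))):%Y-%m-%d:%H:%M:%SZ}"
print(start, end)  # 1970-01-01:00:00:00Z 1970-01-02:00:00:00Z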
66 changes: 47 additions & 19 deletions clouddrift/adapters/gdp6h.py
@@ -21,7 +21,7 @@
 
 GDP_VERSION = "September 2023"
 
-GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/buoydata/6h/"
+GDP_DATA_URL = "https://www.aoml.noaa.gov/ftp/pub/phod/buoydata/6h"
 GDP_TMP_PATH = os.path.join(tempfile.gettempdir(), "clouddrift", "gdp6h")
 GDP_DATA = [
     "lon",
@@ -82,7 +82,7 @@ def download(
             string = urlpath.read().decode("utf-8")
             filelist = list(set(re.compile(pattern).findall(string)))
             for f in filelist:
-                did = int(f[:-3].split("_")[2])
+                did = int(f.split("_")[2].removesuffix(".nc"))
                 if (drifter_ids is None or did in drifter_ids) and did not in added:
                     drifter_urls.append(f"{url}/{dir}/{f}")
                     added.add(did)
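Here, as in gdp1h.py above and yomaha.py below, a fixed slice is replaced with `str.removesuffix` (Python 3.9+): `[:-3]` assumes the `.nc` suffix is present and silently corrupts the name otherwise, while `removesuffix` states its intent and is a no-op when the suffix is absent. A small illustration with a hypothetical filename:

f = "drifter_6h_44566.nc"  # hypothetical filename, for illustration only
int(f.split("_")[2][:-3])                  # 44566, but only if ".nc" is there
int(f.split("_")[2].removesuffix(".nc"))   # 44566, and safe either way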
@@ -187,7 +187,10 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
             warnings.warn(f"Variable {var} not found in upstream data; skipping.")
 
     # new variables
-    ds["ids"] = (["traj", "obs"], [np.repeat(ds.ID.values, ds.sizes["obs"])])
+    ds["ids"] = (
+        ["traj", "obs"],
+        [np.repeat(ds.ID.values, ds.sizes["obs"])],
+    )
     ds["drogue_status"] = (
         ["traj", "obs"],
         [gdp.drogue_presence(ds.drogue_lost_date.data, ds.time.data[0])],
@@ -199,17 +202,32 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
         [False if ds.get("location_type") == "Argos" else True],
     )  # 0 for Argos, 1 for GPS
     ds["DeployingShip"] = (("traj"), gdp.cut_str(ds.DeployingShip, 20))
-    ds["DeploymentStatus"] = (("traj"), gdp.cut_str(ds.DeploymentStatus, 20))
-    ds["BuoyTypeManufacturer"] = (("traj"), gdp.cut_str(ds.BuoyTypeManufacturer, 20))
-    ds["BuoyTypeSensorArray"] = (("traj"), gdp.cut_str(ds.BuoyTypeSensorArray, 20))
+    ds["DeploymentStatus"] = (
+        ("traj"),
+        gdp.cut_str(ds.DeploymentStatus, 20),
+    )
+    ds["BuoyTypeManufacturer"] = (
+        ("traj"),
+        gdp.cut_str(ds.BuoyTypeManufacturer, 20),
+    )
+    ds["BuoyTypeSensorArray"] = (
+        ("traj"),
+        gdp.cut_str(ds.BuoyTypeSensorArray, 20),
+    )
     ds["CurrentProgram"] = (
         ("traj"),
         [np.int32(gdp.str_to_float(ds.CurrentProgram, -1))],
     )
-    ds["PurchaserFunding"] = (("traj"), gdp.cut_str(ds.PurchaserFunding, 20))
+    ds["PurchaserFunding"] = (
+        ("traj"),
+        gdp.cut_str(ds.PurchaserFunding, 20),
+    )
     ds["SensorUpgrade"] = (("traj"), gdp.cut_str(ds.SensorUpgrade, 20))
     ds["Transmissions"] = (("traj"), gdp.cut_str(ds.Transmissions, 20))
-    ds["DeployingCountry"] = (("traj"), gdp.cut_str(ds.DeployingCountry, 20))
+    ds["DeployingCountry"] = (
+        ("traj"),
+        gdp.cut_str(ds.DeployingCountry, 20),
+    )
     ds["DeploymentComments"] = (
         ("traj"),
         gdp.cut_str(
@@ -224,10 +242,13 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
         ("traj"),
         [np.int16(gdp.str_to_float(ds.ManufactureMonth, -1))],
     )
-    ds["ManufactureSensorType"] = (("traj"), gdp.cut_str(ds.ManufactureSensorType, 20))
+    ds["ManufactureSensorType"] = (
+        ("traj"),
+        gdp.cut_str(ds.ManufactureSensorType, 20),
+    )
     ds["ManufactureVoltage"] = (
         ("traj"),
-        [np.int16(gdp.str_to_float(ds.ManufactureVoltage[:-6], -1))],
+        [np.int16(gdp.str_to_float(ds.ManufactureVoltage[:-2], -1))],
     )  # e.g. 56 V
     ds["FloatDiameter"] = (
         ("traj"),
@@ -254,12 +275,18 @@ def preprocess(index: int, **kwargs) -> xr.Dataset:
         ("traj"),
         [gdp.str_to_float(ds.DragAreaOfDrogue[:-4])],
     )  # e.g. 416.6 m^2
-    ds["DragAreaRatio"] = (("traj"), [gdp.str_to_float(ds.DragAreaRatio)])  # e.g. 39.08
+    ds["DragAreaRatio"] = (
+        ("traj"),
+        [gdp.str_to_float(ds.DragAreaRatio)],
+    )  # e.g. 39.08
     ds["DrogueCenterDepth"] = (
         ("traj"),
         [gdp.str_to_float(ds.DrogueCenterDepth[:-2])],
     )  # e.g. 20.0 m
-    ds["DrogueDetectSensor"] = (("traj"), gdp.cut_str(ds.DrogueDetectSensor, 20))
+    ds["DrogueDetectSensor"] = (
+        ("traj"),
+        gdp.cut_str(ds.DrogueDetectSensor, 20),
+    )
 
     # vars attributes
     vars_attrs = {
@@ -481,20 +508,21 @@ def to_raggedarray(
     ra = RaggedArray.from_files(
         indices=ids,
         preprocess_func=preprocess,
-        coord_dim_map=gdp.GDP_COORDS,
+        name_coords=gdp.GDP_COORDS,
         name_meta=gdp.GDP_METADATA,
         name_data=GDP_DATA,
+        name_dims=gdp.GDP_DIMS,
         rowsize_func=gdp.rowsize,
         filename_pattern="drifter_6h_{id}.nc",
         tmp_path=tmp_path,
     )
 
     # update dynamic global attributes
-    ra.attrs_global[
-        "time_coverage_start"
-    ] = f"{datetime.datetime(1970,1,1) + datetime.timedelta(seconds=int(np.min(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
-    ra.attrs_global[
-        "time_coverage_end"
-    ] = f"{datetime.datetime(1970,1,1) + datetime.timedelta(seconds=int(np.max(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
+    ra.attrs_global["time_coverage_start"] = (
+        f"{datetime.datetime(1970,1,1) + datetime.timedelta(seconds=int(np.min(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
+    )
+    ra.attrs_global["time_coverage_end"] = (
+        f"{datetime.datetime(1970,1,1) + datetime.timedelta(seconds=int(np.max(ra.coords['time']))):%Y-%m-%d:%H:%M:%SZ}"
+    )
 
     return ra
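One small behavioral fix above, present in both GDP adapters, is the switch from `ManufactureVoltage[:-6]` to `ManufactureVoltage[:-2]`: with a value shaped like the inline comment's `56 V`, only the last two characters (the unit) should be stripped. A sketch, assuming that value shape:

v = "56 V"   # value shape taken from the inline comment "# e.g. 56 V"
v[:-2]       # '56' -> gdp.str_to_float parses it as 56
v[:-6]       # ''   -> unparseable, so the -1 default would be used instead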
1 change: 1 addition & 0 deletions clouddrift/adapters/glad.py
@@ -13,6 +13,7 @@
 ---------
 Özgökmen, Tamay. 2013. GLAD experiment CODE-style drifter trajectories (low-pass filtered, 15 minute interval records), northern Gulf of Mexico near DeSoto Canyon, July-October 2012. Distributed by: Gulf of Mexico Research Initiative Information and Data Cooperative (GRIIDC), Harte Research Institute, Texas A&M University–Corpus Christi. doi:10.7266/N7VD6WC8
 """
+
 from io import BytesIO
 
 import numpy as np
1 change: 1 addition & 0 deletions clouddrift/adapters/mosaic.py
@@ -18,6 +18,7 @@
 >>> from clouddrift.adapters import mosaic
 >>> ds = mosaic.to_xarray()
 """
+
 import xml.etree.ElementTree as ET
 from datetime import datetime
 from io import BytesIO
4 changes: 2 additions & 2 deletions clouddrift/adapters/subsurface_floats.py
@@ -1,6 +1,6 @@
 """
-This module defines functions to adapt as a ragged-array dataset a collection of data
-from 2193 trajectories of SOFAR, APEX, and RAFOS subsurface floats from 52 experiments
+This module defines functions to adapt as a ragged-array dataset a collection of data
+from 2193 trajectories of SOFAR, APEX, and RAFOS subsurface floats from 52 experiments
 across the world between 1989 and 2015.
 
 The dataset is hosted at https://www.aoml.noaa.gov/phod/float_traj/index.php
6 changes: 3 additions & 3 deletions clouddrift/adapters/utils.py
@@ -45,9 +45,9 @@ def download_with_progress(
         retry_protocol = custom_retry_protocol  # type: ignore
 
     executor = concurrent.futures.ThreadPoolExecutor()
-    futures: dict[
-        concurrent.futures.Future, Tuple[str, Union[BufferedIOBase, str]]
-    ] = dict()
+    futures: dict[concurrent.futures.Future, Tuple[str, Union[BufferedIOBase, str]]] = (
+        dict()
+    )
     bar = None
 
     for src, dst, exp_size in download_map:
11 changes: 6 additions & 5 deletions clouddrift/adapters/yomaha.py
@@ -1,7 +1,7 @@
 """
-This module defines functions used to adapt the YoMaHa'07: Velocity data assessed
-from trajectories of Argo floats at parking level and at the sea surface as
-a ragged-arrays dataset.
+This module defines functions used to adapt the YoMaHa'07: Velocity data assessed
+from trajectories of Argo floats at parking level and at the sea surface as
+a ragged-arrays dataset.
 
 The dataset is hosted at http://apdrc.soest.hawaii.edu/projects/yomaha/ and the user manual
 is available at http://apdrc.soest.hawaii.edu/projects/yomaha/yomaha07/YoMaHa070612.pdf.
@@ -52,7 +52,7 @@ def download(tmp_path: str):
     download_with_progress(download_requests)
 
     filename_gz = f"{tmp_path}/{YOMAHA_URLS[-1].split('/')[-1]}"
-    filename = filename_gz[:-3]
+    filename = filename_gz.removesuffix(".gz")
 
     buffer = BytesIO()
     download_with_progress([(YOMAHA_URLS[-1], buffer, None)])
@@ -153,7 +153,8 @@ def to_xarray(tmp_path: Union[str, None] = None):
     )
 
     # open with pandas
-    filename = f"{tmp_path}/{YOMAHA_URLS[-1].split('/')[-1][:-3]}"
+    filename_gz = f"{tmp_path}/{YOMAHA_URLS[-1].split('/')[-1]}"
+    filename = filename_gz.removesuffix(".gz")
     df = pd.read_csv(
         filename, names=col_names, sep=r"\s+", header=None, na_values=na_col
     )
5 changes: 3 additions & 2 deletions clouddrift/datasets.py
@@ -1,9 +1,10 @@
 """
-This module provides functions to easily access ragged array datasets. If the datasets are
+This module provides functions to easily access ragged array datasets. If the datasets are
 not accessed via cloud storage platforms or are not found on the local filesystem,
-they will be downloaded from their upstream repositories and stored for later access
+they will be downloaded from their upstream repositories and stored for later access
 (~/.clouddrift for UNIX-based systems).
 """
+
 import os
 import platform
 from io import BytesIO
1 change: 1 addition & 0 deletions clouddrift/pairs.py
@@ -1,6 +1,7 @@
 """
 Functions to analyze pairs of contiguous data segments.
 """
+
 import itertools
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import List, Optional, Tuple, Union