Fix chunking bug with compound dtypes #1146

Open · wants to merge 15 commits into main
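This PR fixes the computation of default chunk and buffer shapes for datasets with compound dtypes (such as `TimeSeriesReferenceVectorData` columns). For such datasets, `get_data_shape` could report an extra dimension for what is logically a one-dimensional dataset. The fix builds the HDMF builder tree for the in-memory `NWBFile`, threads it through `DatasetIOConfiguration.from_neurodata_object`, and treats any dataset whose builder dtype is compound as one-dimensional, with full shape `(len(dataset),)`.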
5 changes: 5 additions & 0 deletions .github/workflows/dev-testing.yml
@@ -1,6 +1,11 @@
 name: Dev Branch Testing
 on:
   workflow_dispatch:
+    inputs:
+      python-versions:
+        description: 'List of Python versions to use in matrix, as JSON string'
+        required: true
+        type: string
   workflow_call:
     inputs:
       python-versions:
@@ -9,6 +9,10 @@
 import numpy as np
 import zarr
 from hdmf import Container
+from hdmf.build.builders import (
+    BaseBuilder,
+    LinkBuilder,
+)
 from hdmf.utils import get_data_shape
 from pydantic import (
     BaseModel,
@@ -244,7 +248,9 @@ def model_json_schema(cls, **kwargs) -> dict[str, Any]:
         return super().model_json_schema(mode="validation", schema_generator=PureJSONSchemaGenerator, **kwargs)

     @classmethod
-    def from_neurodata_object(cls, neurodata_object: Container, dataset_name: Literal["data", "timestamps"]) -> Self:
+    def from_neurodata_object(
+        cls, neurodata_object: Container, dataset_name: Literal["data", "timestamps"], builder: BaseBuilder
+    ) -> Self:
         """
         Construct an instance of a DatasetIOConfiguration for a dataset in a neurodata object in an NWBFile.

@@ -256,11 +262,17 @@ def from_neurodata_object(cls, neurodata_object: Container, dataset_name: Literal["data", "timestamps"]) -> Self:
             The name of the field that will become a dataset when written to disk.
             Some neurodata objects can have multiple such fields, such as `pynwb.TimeSeries` which can have both `data`
             and `timestamps`, each of which can be configured separately.
+        builder : hdmf.build.builders.BaseBuilder
+            The builder object that would be used to construct the NWBFile object.
         """
         location_in_file = _find_location_in_memory_nwbfile(neurodata_object=neurodata_object, field_name=dataset_name)

         candidate_dataset = getattr(neurodata_object, dataset_name)
-        full_shape = get_data_shape(data=candidate_dataset)
+
+        if has_compound_dtype(builder, location_in_file):
+            full_shape = (len(candidate_dataset),)
+        else:
+            full_shape = get_data_shape(data=candidate_dataset)

         dtype = _infer_dtype(dataset=candidate_dataset)

         if isinstance(candidate_dataset, GenericDataChunkIterator):
@@ -312,3 +324,123 @@ def from_neurodata_object(cls, neurodata_object: Container, dataset_name: Literal["data", "timestamps"]) -> Self:
             buffer_shape=buffer_shape,
             compression_method=compression_method,
         )


+def has_compound_dtype(builder: BaseBuilder, location_in_file: str) -> bool:
+    """
+    Determine if the dataset at the given location in the file has a compound dtype.
+
+    Parameters
+    ----------
+    builder : hdmf.build.builders.BaseBuilder
+        The builder object that would be used to construct the NWBFile object.
+    location_in_file : str
+        The location of the dataset within the NWBFile, e.g. 'acquisition/ElectricalSeries/data'.
+
+    Returns
+    -------
+    bool
+        Whether the dataset has a compound dtype.
+    """
+    dataset_builder = get_dataset_builder(builder, location_in_file)
+    return isinstance(dataset_builder.dtype, list)


+def get_dataset_builder(builder: BaseBuilder, location_in_file: str) -> BaseBuilder:
+    """Find the appropriate sub-builder for the dataset at the given location in the file.
+
+    This function will traverse the groups in the location_in_file until it reaches a DatasetBuilder,
+    and then return that builder.
+
+    Parameters
+    ----------
+    builder : hdmf.build.builders.BaseBuilder
+        The builder object that would be used to construct the NWBFile object.
+    location_in_file : str
+        The location of the dataset within the NWBFile, e.g. 'acquisition/ElectricalSeries/data'.
+
+    Returns
+    -------
+    hdmf.build.builders.BaseBuilder
+        The builder object for the dataset at the given location.
+
+    Raises
+    ------
+    ValueError
+        If the location_in_file is not found in the builder.
+
+    Notes
+    -----
+    Items in defined top-level places like electrodes may not be in the groups of the nwbfile-level builder,
+    but rather in hidden locations like general/extracellular_ephys/electrodes.
+    Also, some items in these top-level locations may interrupt the order of the location_in_file.
+    For example, when location_in_file is 'stimulus/AcousticWaveformSeries/data', the builder for that dataset is
+    located at 'stimulus/presentation/AcousticWaveformSeries/data'.
+    For this reason, we recursively search for the appropriate sub-builder for each name in the location_in_file.
+    Also, the first name in location_in_file is inherently suspect due to the way that the location is determined
+    in _find_location_in_memory_nwbfile(), and may not be present in the builder. For example, when location_in_file is
+    'lab_meta_data/fiber_photometry/fiber_photometry_table/location/data', the builder for that dataset is located at
+    'general/fiber_photometry/fiber_photometry_table/location/data'.
+    """
+    split_location = iter(location_in_file.split("/"))
+    name = next(split_location)
+
+    if _find_sub_builder(builder, name) is None:
+        name = next(split_location)
+
+    while name not in builder.datasets and name not in builder.links:
+        builder = _find_sub_builder(builder, name)
+        if builder is None:
+            raise ValueError(f"Could not find location '{location_in_file}' in builder ({name} is missing).")
+        try:
+            name = next(split_location)
+        except StopIteration:
+            raise ValueError(f"Could not find location '{location_in_file}' in builder ({name} is not a dataset).")
+    builder = builder[name]
+    if isinstance(builder, LinkBuilder):
+        builder = builder.builder
+    return builder


+def _find_sub_builder(builder: BaseBuilder, name: str) -> BaseBuilder | None:
+    """Search breadth-first for a sub-builder by name in a builder object.
+
+    Parameters
+    ----------
+    builder : hdmf.build.builders.BaseBuilder
+        The builder object to search for the sub-builder in.
+    name : str
+        The name of the sub-builder to search for.
+
+    Returns
+    -------
+    hdmf.build.builders.BaseBuilder or None
+        The sub-builder with the given name, or None if it could not be found.
+    """
+    sub_builders = list(builder.groups.values())
+    return _recursively_search_sub_builders(sub_builders=sub_builders, name=name)


+def _recursively_search_sub_builders(sub_builders: list[BaseBuilder], name: str) -> BaseBuilder | None:
+    """Recursively search for a sub-builder by name in a list of sub-builders.
+
+    Parameters
+    ----------
+    sub_builders : list[hdmf.build.builders.BaseBuilder]
+        The list of sub-builders to search for the sub-builder in.
+    name : str
+        The name of the sub-builder to search for.
+
+    Returns
+    -------
+    hdmf.build.builders.BaseBuilder or None
+        The sub-builder with the given name, or None if it could not be found.
+    """
+    sub_sub_builders = []
+    for sub_builder in sub_builders:
+        if sub_builder.name == name:
+            return sub_builder
+        sub_sub_builders.extend(list(sub_builder.groups.values()))
+    if len(sub_sub_builders) == 0:
+        return None
+    return _recursively_search_sub_builders(sub_builders=sub_sub_builders, name=name)
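
To see how these helpers fit together, here is a minimal sketch (not part of the diff) that mirrors the call sites added in `_dataset_configuration.py` below. The trials example and the location string are illustrative assumptions, not taken from the PR:

    from pynwb import get_manager
    from pynwb.testing.mock.file import mock_NWBFile

    # Build the HDMF builder tree for an in-memory NWBFile, exactly as
    # get_default_dataset_io_configurations() now does before iterating objects.
    nwbfile = mock_NWBFile()
    nwbfile.add_trial(start_time=0.0, stop_time=1.0)
    builder = get_manager().build(nwbfile)

    # has_compound_dtype() and get_dataset_builder() are the helpers defined above.
    # The location string is a hypothetical example of what
    # _find_location_in_memory_nwbfile() might return for a trials column.
    location = "intervals/trials/start_time/data"
    dataset_builder = get_dataset_builder(builder, location)  # DatasetBuilder for start_time
    print(has_compound_dtype(builder, location))  # False: a plain float column, not compound

For a compound-dtype column such as a `TimeSeriesReferenceVectorData`, the builder's dtype is a list of per-field specs rather than a scalar dtype, which is exactly what `has_compound_dtype` checks.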
8 changes: 5 additions & 3 deletions src/neuroconv/tools/nwb_helpers/_dataset_configuration.py
@@ -9,7 +9,7 @@
 from hdmf.data_utils import DataIO
 from hdmf.utils import get_data_shape
 from hdmf_zarr import NWBZarrIO
-from pynwb import NWBHDF5IO, NWBFile
+from pynwb import NWBHDF5IO, NWBFile, get_manager
 from pynwb.base import DynamicTable, TimeSeriesReferenceVectorData
 from pynwb.file import NWBContainer

@@ -102,6 +102,8 @@ def get_default_dataset_io_configurations(
         )

     known_dataset_fields = ("data", "timestamps")
+    manager = get_manager()
+    builder = manager.build(nwbfile)
     for neurodata_object in nwbfile.objects.values():
         if isinstance(neurodata_object, DynamicTable):
             dynamic_table = neurodata_object  # For readability
@@ -134,7 +136,7 @@
                     continue

                 dataset_io_configuration = DatasetIOConfigurationClass.from_neurodata_object(
-                    neurodata_object=column, dataset_name=dataset_name
+                    neurodata_object=column, dataset_name=dataset_name, builder=builder
                 )

                 yield dataset_io_configuration
@@ -168,7 +170,7 @@
                 continue

             dataset_io_configuration = DatasetIOConfigurationClass.from_neurodata_object(
-                neurodata_object=neurodata_object, dataset_name=known_dataset_field
+                neurodata_object=neurodata_object, dataset_name=known_dataset_field, builder=builder
             )

             yield dataset_io_configuration
@@ -2,6 +2,7 @@

 import numpy as np
 import pytest
+from pynwb import get_manager
 from pynwb.testing.mock.file import mock_NWBFile

 from neuroconv.tools.nwb_helpers import DatasetIOConfiguration
@@ -89,8 +90,12 @@ def get_data_io_kwargs(self):
     data = np.array(["test", "string", "abc"], dtype=object)
     nwbfile.add_trial_column(name="test", description="test column with object dtype but all strings", data=data)
     neurodata_object = nwbfile.trials.columns[2]
+    manager = get_manager()
+    builder = manager.build(nwbfile)

-    dataset_io_configuration = TestDatasetIOConfiguration.from_neurodata_object(neurodata_object, dataset_name="data")
+    dataset_io_configuration = TestDatasetIOConfiguration.from_neurodata_object(
+        neurodata_object, dataset_name="data", builder=builder
+    )

     assert dataset_io_configuration.chunk_shape == (3,)
     assert dataset_io_configuration.buffer_shape == (3,)