Fix chunking bug with compound dtypes #1146

Open · wants to merge 15 commits into main
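This PR fixes the computation of default chunk and buffer shapes for datasets with compound dtypes (such as `TimeSeriesReferenceVectorData` columns). For such datasets, `get_data_shape` could report an extra dimension for what is logically a one-dimensional dataset. The fix builds the HDMF builder tree for the in-memory `NWBFile`, threads it through `DatasetIOConfiguration.from_neurodata_object`, and treats any dataset whose builder dtype is compound as one-dimensional, with full shape `(len(dataset),)`.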
5 changes: 5 additions & 0 deletions .github/workflows/dev-testing.yml
@@ -1,6 +1,11 @@
 name: Dev Branch Testing
 on:
   workflow_dispatch:
+    inputs:
+      python-versions:
+        description: 'List of Python versions to use in matrix, as JSON string'
+        required: true
+        type: string
   workflow_call:
     inputs:
       python-versions:
@@ -9,6 +9,10 @@
 import numpy as np
 import zarr
 from hdmf import Container
+from hdmf.build.builders import (
+    BaseBuilder,
+    LinkBuilder,
+)
 from hdmf.utils import get_data_shape
 from pydantic import (
     BaseModel,
@@ -244,7 +248,9 @@ def model_json_schema(cls, **kwargs) -> dict[str, Any]:
         return super().model_json_schema(mode="validation", schema_generator=PureJSONSchemaGenerator, **kwargs)

     @classmethod
-    def from_neurodata_object(cls, neurodata_object: Container, dataset_name: Literal["data", "timestamps"]) -> Self:
+    def from_neurodata_object(
+        cls, neurodata_object: Container, dataset_name: Literal["data", "timestamps"], builder: BaseBuilder
+    ) -> Self:
         """
         Construct an instance of a DatasetIOConfiguration for a dataset in a neurodata object in an NWBFile.

@@ -256,11 +262,17 @@ def from_neurodata_object(cls, neurodata_object: Container, dataset_name: Literal["data", "timestamps"]) -> Self:
             The name of the field that will become a dataset when written to disk.
             Some neurodata objects can have multiple such fields, such as `pynwb.TimeSeries` which can have both `data`
             and `timestamps`, each of which can be configured separately.
+        builder : hdmf.build.builders.BaseBuilder
+            The builder object that would be used to construct the NWBFile object.
         """
         location_in_file = _find_location_in_memory_nwbfile(neurodata_object=neurodata_object, field_name=dataset_name)

         candidate_dataset = getattr(neurodata_object, dataset_name)
-        full_shape = get_data_shape(data=candidate_dataset)
+
+        if has_compound_dtype(builder, location_in_file):
+            full_shape = (len(candidate_dataset),)
+        else:
+            full_shape = get_data_shape(data=candidate_dataset)

         dtype = _infer_dtype(dataset=candidate_dataset)

         if isinstance(candidate_dataset, GenericDataChunkIterator):
@@ -312,3 +324,123 @@ def from_neurodata_object(cls, neurodata_object: Container, dataset_name: Literal["data", "timestamps"]) -> Self:
             buffer_shape=buffer_shape,
             compression_method=compression_method,
         )


+def has_compound_dtype(builder: BaseBuilder, location_in_file: str) -> bool:
+    """
+    Determine if the dataset at the given location in the file has a compound dtype.
+
+    Parameters
+    ----------
+    builder : hdmf.build.builders.BaseBuilder
+        The builder object that would be used to construct the NWBFile object.
+    location_in_file : str
+        The location of the dataset within the NWBFile, e.g. 'acquisition/ElectricalSeries/data'.
+
+    Returns
+    -------
+    bool
+        Whether the dataset has a compound dtype.
+    """
+    dataset_builder = get_dataset_builder(builder, location_in_file)
+    return isinstance(dataset_builder.dtype, list)


+def get_dataset_builder(builder: BaseBuilder, location_in_file: str) -> BaseBuilder:
+    """Find the appropriate sub-builder for the dataset at the given location in the file.
+
+    This function will traverse the groups in the location_in_file until it reaches a DatasetBuilder,
+    and then return that builder.
+
+    Parameters
+    ----------
+    builder : hdmf.build.builders.BaseBuilder
+        The builder object that would be used to construct the NWBFile object.
+    location_in_file : str
+        The location of the dataset within the NWBFile, e.g. 'acquisition/ElectricalSeries/data'.
+
+    Returns
+    -------
+    hdmf.build.builders.BaseBuilder
+        The builder object for the dataset at the given location.
+
+    Raises
+    ------
+    ValueError
+        If the location_in_file is not found in the builder.
+
+    Notes
+    -----
+    Items in defined top-level places like electrodes may not be in the groups of the nwbfile-level builder,
+    but rather in hidden locations like general/extracellular_ephys/electrodes.
+    Also, some items in these top-level locations may interrupt the order of the location_in_file.
+    For example, when location_in_file is 'stimulus/AcousticWaveformSeries/data', the builder for that dataset is
+    located at 'stimulus/presentation/AcousticWaveformSeries/data'.
+    For this reason, we recursively search for the appropriate sub-builder for each name in the location_in_file.
+    Also, the first name in location_in_file is inherently suspect due to the way that the location is determined
+    in _find_location_in_memory_nwbfile(), and may not be present in the builder. For example, when location_in_file is
+    'lab_meta_data/fiber_photometry/fiber_photometry_table/location/data', the builder for that dataset is located at
+    'general/fiber_photometry/fiber_photometry_table/location/data'.
+    """
+    split_location = iter(location_in_file.split("/"))
+    name = next(split_location)
+
+    if _find_sub_builder(builder, name) is None:
+        name = next(split_location)
+
+    while name not in builder.datasets and name not in builder.links:
+        builder = _find_sub_builder(builder, name)
+        if builder is None:
+            raise ValueError(f"Could not find location '{location_in_file}' in builder ({name} is missing).")
+        try:
+            name = next(split_location)
+        except StopIteration:
+            raise ValueError(f"Could not find location '{location_in_file}' in builder ({name} is not a dataset).")
+    builder = builder[name]
+    if isinstance(builder, LinkBuilder):
+        builder = builder.builder
+    return builder


+def _find_sub_builder(builder: BaseBuilder, name: str) -> BaseBuilder | None:
+    """Search breadth-first for a sub-builder by name in a builder object.
+
+    Parameters
+    ----------
+    builder : hdmf.build.builders.BaseBuilder
+        The builder object to search for the sub-builder in.
+    name : str
+        The name of the sub-builder to search for.
+
+    Returns
+    -------
+    hdmf.build.builders.BaseBuilder or None
+        The sub-builder with the given name, or None if it could not be found.
+    """
+    sub_builders = list(builder.groups.values())
+    return _recursively_search_sub_builders(sub_builders=sub_builders, name=name)


+def _recursively_search_sub_builders(sub_builders: list[BaseBuilder], name: str) -> BaseBuilder | None:
+    """Recursively search for a sub-builder by name in a list of sub-builders.
+
+    Parameters
+    ----------
+    sub_builders : list[hdmf.build.builders.BaseBuilder]
+        The list of sub-builders to search for the sub-builder in.
+    name : str
+        The name of the sub-builder to search for.
+
+    Returns
+    -------
+    hdmf.build.builders.BaseBuilder or None
+        The sub-builder with the given name, or None if it could not be found.
+    """
+    sub_sub_builders = []
+    for sub_builder in sub_builders:
+        if sub_builder.name == name:
+            return sub_builder
+        sub_sub_builders.extend(list(sub_builder.groups.values()))
+    if len(sub_sub_builders) == 0:
+        return None
+    return _recursively_search_sub_builders(sub_builders=sub_sub_builders, name=name)
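
To see how these helpers fit together, here is a minimal sketch (not part of the diff) that mirrors the call sites added in `_dataset_configuration.py` below. The trials example and the location string are illustrative assumptions, not taken from the PR:

    from pynwb import get_manager
    from pynwb.testing.mock.file import mock_NWBFile

    # Build the HDMF builder tree for an in-memory NWBFile, exactly as
    # get_default_dataset_io_configurations() now does before iterating objects.
    nwbfile = mock_NWBFile()
    nwbfile.add_trial(start_time=0.0, stop_time=1.0)
    builder = get_manager().build(nwbfile)

    # has_compound_dtype() and get_dataset_builder() are the helpers defined above.
    # The location string is a hypothetical example of what
    # _find_location_in_memory_nwbfile() might return for a trials column.
    location = "intervals/trials/start_time/data"
    dataset_builder = get_dataset_builder(builder, location)  # DatasetBuilder for start_time
    print(has_compound_dtype(builder, location))  # False: a plain float column, not compound

For a compound-dtype column such as a `TimeSeriesReferenceVectorData`, the builder's dtype is a list of per-field specs rather than a scalar dtype, which is exactly what `has_compound_dtype` checks.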
8 changes: 5 additions & 3 deletions src/neuroconv/tools/nwb_helpers/_dataset_configuration.py
@@ -9,7 +9,7 @@
 from hdmf.data_utils import DataIO
 from hdmf.utils import get_data_shape
 from hdmf_zarr import NWBZarrIO
-from pynwb import NWBHDF5IO, NWBFile
+from pynwb import NWBHDF5IO, NWBFile, get_manager
 from pynwb.base import DynamicTable, TimeSeriesReferenceVectorData
 from pynwb.file import NWBContainer

@@ -102,6 +102,8 @@ def get_default_dataset_io_configurations(
         )

     known_dataset_fields = ("data", "timestamps")
+    manager = get_manager()
+    builder = manager.build(nwbfile)
     for neurodata_object in nwbfile.objects.values():
         if isinstance(neurodata_object, DynamicTable):
             dynamic_table = neurodata_object  # For readability
@@ -134,7 +136,7 @@
                     continue

                 dataset_io_configuration = DatasetIOConfigurationClass.from_neurodata_object(
-                    neurodata_object=column, dataset_name=dataset_name
+                    neurodata_object=column, dataset_name=dataset_name, builder=builder
                 )

                 yield dataset_io_configuration
@@ -168,7 +170,7 @@
                 continue

             dataset_io_configuration = DatasetIOConfigurationClass.from_neurodata_object(
-                neurodata_object=neurodata_object, dataset_name=known_dataset_field
+                neurodata_object=neurodata_object, dataset_name=known_dataset_field, builder=builder
             )

             yield dataset_io_configuration
@@ -2,6 +2,7 @@

 import numpy as np
 import pytest
+from pynwb import get_manager
 from pynwb.testing.mock.file import mock_NWBFile

 from neuroconv.tools.nwb_helpers import DatasetIOConfiguration
@@ -89,8 +90,12 @@ def get_data_io_kwargs(self):
     data = np.array(["test", "string", "abc"], dtype=object)
     nwbfile.add_trial_column(name="test", description="test column with object dtype but all strings", data=data)
     neurodata_object = nwbfile.trials.columns[2]
+    manager = get_manager()
+    builder = manager.build(nwbfile)

-    dataset_io_configuration = TestDatasetIOConfiguration.from_neurodata_object(neurodata_object, dataset_name="data")
+    dataset_io_configuration = TestDatasetIOConfiguration.from_neurodata_object(
+        neurodata_object, dataset_name="data", builder=builder
+    )

     assert dataset_io_configuration.chunk_shape == (3,)
     assert dataset_io_configuration.buffer_shape == (3,)