Skip to content

Commit

Permalink
dask and hdf5 array interfaces
Browse files Browse the repository at this point in the history
  • Loading branch information
sneakers-the-rat committed Apr 9, 2024
1 parent a6391c0 commit 46060c1
Show file tree
Hide file tree
Showing 18 changed files with 330 additions and 37 deletions.
2 changes: 2 additions & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
"linkml": ("https://linkml.io/linkml/", None),
"linkml_runtime": ("https://linkml.io/linkml/", None),
"linkml-runtime": ("https://linkml.io/linkml/", None),
"dask": ("https://docs.dask.org/en/stable/", None),
"h5py": ("https://docs.h5py.org/en/stable/", None),
}

# -- Options for HTML output -------------------------------------------------
Expand Down
26 changes: 5 additions & 21 deletions pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 5 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ dependencies = [
"pydantic>=2.3.0",
"nptyping>=2.5.0",
"blosc2<3.0.0,>=2.5.1",
"numpy>=1.24.0",
]
requires-python = "<4.0,>=3.9"
readme = "README.md"
Expand All @@ -17,7 +18,7 @@ license = {text = "MIT"}

[project.optional-dependencies]
dask = [
"dask[array]>=2024.1.1"
"dask>=2024.4.0",
]
hdf5 = [
"h5py>=3.10.0"
Expand Down Expand Up @@ -99,9 +100,11 @@ select = [

]
ignore = [
"ANN101", "ANN102", "ANN401",
"ANN101", "ANN102", "ANN401", "ANN204",
# builtin type annotations
"UP006", "UP035",
# | for Union types (only supported on Python >=3.10)
"UP007", "UP038",
# docstrings for __init__
"D107",
]
Expand Down
5 changes: 4 additions & 1 deletion src/numpydantic/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
apply_patches()

from numpydantic.ndarray import NDArray

from numpydantic.meta import update_ndarray_stub

from nptyping import Shape

update_ndarray_stub()

__all__ = ["NDArray", "Shape"]
4 changes: 3 additions & 1 deletion src/numpydantic/interface/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from numpydantic.interface.dask import DaskInterface
from numpydantic.interface.hdf5 import H5Interface
from numpydantic.interface.interface import Interface
from numpydantic.interface.numpy import NumpyInterface

__all__ = ["Interface", "NumpyInterface"]
__all__ = ["Interface", "DaskInterface", "H5Interface", "NumpyInterface"]
30 changes: 30 additions & 0 deletions src/numpydantic/interface/dask.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from typing import Any
from numpydantic.interface.interface import Interface

try:
from dask.array.core import Array as DaskArray
except ImportError:
DaskArray = None


class DaskInterface(Interface):
    """
    Array interface for Dask :class:`~dask.array.core.Array` objects
    """

    input_types = (DaskArray,)
    return_type = DaskArray

    @classmethod
    def enabled(cls) -> bool:
        """Report whether the dask array class is available (is not ``None``)"""
        return DaskArray is not None

    @classmethod
    def check(cls, array: Any) -> bool:
        """
        Report whether ``array`` is an instance of the dask array class.

        Always ``False`` when the dask array class is unavailable.
        """
        # Test for ``None`` first: ``isinstance(array, None)`` would raise TypeError.
        return DaskArray is not None and isinstance(array, DaskArray)
143 changes: 143 additions & 0 deletions src/numpydantic/interface/hdf5.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
from pathlib import Path
from typing import Any, NamedTuple, Tuple, Union, TypeAlias

import numpy as np

from numpydantic.interface.interface import Interface
from numpydantic.types import NDArrayType

try:
import h5py
except ImportError:
h5py = None

H5Arraylike: TypeAlias = Tuple[Union[Path, str], str]


class H5Array(NamedTuple):
    """Pointer to one array stored in an HDF5 file: which file, and where inside it"""

    file: Union[Path, str]
    """Filesystem location of the HDF5 file"""
    path: str
    """Location of the array inside the HDF5 file"""


class H5Proxy:
"""
Proxy class to mimic numpy-like array behavior with an HDF5 array
The attribute and item access methods only open the file for the duration of the method,
making it less perilous to share this object between threads and processes.
This class attempts to be a passthrough class to a :class:`h5py.Dataset` object,
including its attributes and item getters/setters.
When using read-only methods, no locking is attempted (beyond the HDF5 defaults),
but when using the write methods (setting an array value), try and use the ``locking``
methods of :class:`h5py.File` .
Args:
file (pathlib.Path | str): Location of hdf5 file on filesystem
path (str): Path to array within hdf5 file
"""

def __init__(self, file: Union[Path, str], path: str):
self.file = Path(file)
self.path = path

def array_exists(self) -> bool:
"""Check that there is in fact an array at :attr:`.path` within :attr:`.file`"""
with h5py.File(self.file, "r") as h5f:
obj = h5f.get(self.path)
return obj is not None

@classmethod
def from_h5array(cls, h5array: H5Array) -> "H5Proxy":
"""Instantiate using :class:`.H5Array`"""
return H5Proxy(file=h5array.file, path=h5array.path)

def __getattr__(self, item: str):
with h5py.File(self.file, "r") as h5f:
obj = h5f.get(self.path)
return getattr(obj, item)

def __getitem__(self, item: Union[int, slice]) -> np.ndarray:
with h5py.File(self.file, "r") as h5f:
obj = h5f.get(self.path)
return obj[item]

def __setitem__(self, key: Union[int, slice], value: Union[int, float, np.ndarray]):
with h5py.File(self.file, "r+", locking=True) as h5f:
obj = h5f.get(self.path)
obj[key] = value


class H5Interface(Interface):
    """
    Interface for arrays kept as datasets inside an HDF5 file.

    Accepts a :class:`.H5Array` specifier (or a two-item ``(file, path)``
    tuple/list standing in for one) selecting a :class:`h5py.Dataset` within a
    :class:`h5py.File`, and validates against a :class:`.H5Proxy` that behaves
    as a passthrough numpy-like view of that dataset.
    """

    input_types = (
        H5Array,
        H5Arraylike,
    )
    return_type = H5Proxy

    @classmethod
    def enabled(cls) -> bool:
        """Report whether ``h5py`` is available (is not ``None``)"""
        return h5py is not None

    @classmethod
    def check(cls, array: Union[H5Array, Tuple[Union[Path, str], str]]) -> bool:
        """
        Decide whether this interface applies to ``array``.

        True for a :class:`.H5Array`, or for a two-item tuple/list whose first
        item is an existing file that ``h5py`` can open. Whether the dataset
        itself exists is deliberately not checked here.
        """
        if isinstance(array, H5Array):
            return True

        is_pair = isinstance(array, (tuple, list)) and len(array) == 2
        if not is_pair:
            return False

        # The first item has to be usable as a filesystem path...
        try:
            candidate = Path(array[0])
        except TypeError:
            # not a path, we don't apply.
            return False

        # ...and that path has to exist.
        if not candidate.exists():
            return False

        # hdf5 files are commonly given odd suffixes,
        # so we just try and open it and see what happens
        try:
            with h5py.File(candidate, "r"):
                # don't check that the array exists and raise here,
                # this check is just for whether the validator applies or not.
                pass
        except OSError:
            # FileNotFoundError is a subclass of OSError, so it lands here too.
            return False
        else:
            return True

    def before_validation(self, array: Any) -> NDArrayType:
        """
        Build the :class:`.H5Proxy` used for the rest of validation.

        Raises:
            ValueError: if ``array`` is neither a :class:`.H5Array` nor a
                two-item tuple/list, or if the proxy reports no array at the
                requested path.
        """
        from_spec = isinstance(array, H5Array)
        if not from_spec and not (
            isinstance(array, (tuple, list)) and len(array) == 2
        ):
            raise ValueError(
                "Need to specify a file and a path within an HDF5 file to use the HDF5 Interface"
            )

        if from_spec:
            array = H5Proxy.from_h5array(h5array=array)
        else:
            array = H5Proxy(file=array[0], path=array[1])

        if array.array_exists():
            return array

        raise ValueError(
            f"HDF5 file located at {array.file}, "
            f"but no array found at {array.path}"
        )
15 changes: 14 additions & 1 deletion src/numpydantic/interface/interface.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from abc import ABC, abstractmethod
from operator import attrgetter
from typing import Any, Generic, List, Type, TypeVar, Tuple
from typing import Any, Generic, Tuple, Type, TypeVar

from nptyping.shape_expression import check_shape

Expand All @@ -15,6 +15,7 @@ class Interface(ABC, Generic[T]):
Abstract parent class for interfaces to different array formats
"""

input_types: Tuple[Any, ...]
return_type: Type[T]
priority: int = 0

Expand Down Expand Up @@ -109,6 +110,18 @@ def array_types(cls) -> Tuple[NDArrayType, ...]:
"""Return types for all enabled interfaces"""
return tuple([i.return_type for i in cls.interfaces()])

@classmethod
def input_types(cls) -> Tuple[Any, ...]:
    """
    Input types for all enabled interfaces

    Walks ``cls.interfaces()`` and flattens each interface's ``input_types``
    attribute into one tuple: tuple/list values are spliced in, anything else
    is added as a single entry.

    Returns:
        tuple: every collected input type, in interface order.
    """
    in_types = []
    for iface in cls.interfaces():
        declared = iface.input_types
        # A tuple of classes works on every supported Python; the ``tuple | list``
        # union form is only accepted by ``isinstance`` from 3.10 onwards and
        # raises TypeError on 3.9.
        if isinstance(declared, (tuple, list)):
            in_types.extend(declared)
        else:
            in_types.append(declared)

    return tuple(in_types)

@classmethod
def match(cls, array: Any) -> Type["Interface"]:
"""
Expand Down
1 change: 1 addition & 0 deletions src/numpydantic/interface/numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class NumpyInterface(Interface):
Numpy :class:`~numpy.ndarray` s!
"""

input_types = (ndarray, list)
return_type = ndarray

@classmethod
Expand Down
23 changes: 17 additions & 6 deletions src/numpydantic/meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@
"""

from pathlib import Path
from warnings import warn

from numpydantic.interface import Interface

_BUILTIN_IMPORTS = ("import typing", "import pathlib")


def generate_ndarray_stub() -> str:
"""
Expand All @@ -14,11 +17,16 @@ def generate_ndarray_stub() -> str:

import_strings = [
f"from {arr.__module__} import {arr.__name__}"
for arr in Interface.array_types()
for arr in Interface.input_types()
if arr.__module__ != "builtins"
]
import_strings.extend(_BUILTIN_IMPORTS)
import_string = "\n".join(import_strings)

class_names = [arr.__name__ for arr in Interface.array_types()]
class_names = [
arr.__name__ if arr.__module__ != "typing" else str(arr)
for arr in Interface.input_types()
]
class_union = " | ".join(class_names)
ndarray_type = "NDArray = " + class_union

Expand All @@ -32,8 +40,11 @@ def update_ndarray_stub() -> None:
"""
from numpydantic import ndarray

stub_string = generate_ndarray_stub()
try:
stub_string = generate_ndarray_stub()

pyi_file = Path(ndarray.__file__).with_suffix(".pyi")
with open(pyi_file, "w") as pyi:
pyi.write(stub_string)
pyi_file = Path(ndarray.__file__).with_suffix(".pyi")
with open(pyi_file, "w") as pyi:
pyi.write(stub_string)
except Exception as e:
warn(f"ndarray.pyi stub file could not be generated: {e}", stacklevel=1)
3 changes: 0 additions & 3 deletions src/numpydantic/ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,6 @@ class NDArray(NPTypingType, metaclass=NDArrayMeta):
- https://docs.pydantic.dev/latest/usage/types/custom/#handling-third-party-types
"""

def __init__(self: T):
pass

__args__: Tuple[ShapeType, DtypeType] = (Any, Any)

@classmethod
Expand Down
Loading

0 comments on commit 46060c1

Please sign in to comment.