Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: update to anndata 0.11 and memory efficient reads + writes #1152

Draft
wants to merge 16 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 44 additions & 13 deletions cellxgene_schema_cli/cellxgene_schema/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
from typing import Dict, List, Union

import anndata as ad
import h5py
import numpy as np
from anndata.compat import DaskArray
from anndata.experimental import read_dispatched, read_elem_as_dask
from cellxgene_ontology_guide.ontology_parser import OntologyParser
from scipy import sparse
from xxhash import xxh3_64_intdigest
Expand Down Expand Up @@ -68,7 +71,7 @@
return adata


def get_matrix_format(adata: ad.AnnData, matrix: Union[np.ndarray, sparse.spmatrix]) -> str:
def get_matrix_format(matrix: DaskArray) -> str:
"""
Given a matrix, returns the format as one of: csc, csr, coo, dense
or unknown.
Expand All @@ -84,15 +87,11 @@
# >>> return getattr(matrix, "format_str", "dense")
#
matrix_format = "unknown"
if adata.n_obs == 0 or adata.n_vars == 0:
matrix_slice = matrix[0:1, 0:1].compute()
if isinstance(matrix_slice, sparse.spmatrix):
matrix_format = matrix_slice.format
elif isinstance(matrix_slice, np.ndarray):
matrix_format = "dense"
else:
matrix_slice = matrix[0:1, 0:1]
if isinstance(matrix_slice, sparse.spmatrix):
matrix_format = matrix_slice.format
elif isinstance(matrix_slice, np.ndarray):
matrix_format = "dense"

assert matrix_format in ["unknown", "csr", "csc", "coo", "dense"]
return matrix_format

Expand All @@ -116,21 +115,53 @@
return getattr(adata, attr)


def read_h5ad(h5ad_path: Union[str, bytes, os.PathLike]) -> ad.AnnData:
def read_backed(f: h5py.File, chunk_size: int) -> ad.AnnData:
    """
    Build an AnnData object from an open h5py.File, loading matrix elements lazily.

    Elements named "/X" or "/raw/X", or whose name contains "/layers", are returned as
    dask arrays when they are encoded as a CSR/CSC sparse matrix or as a 2-dimensional
    dense array. Every other element is read with the default reader supplied by
    ``read_dispatched``.

    :param f: h5py.File object
    :param chunk_size: first-axis chunk length used for the dask arrays; the second
        axis is always a single chunk spanning all columns
    :return: ad.AnnData object
    """

    def callback(func, elem_name: str, elem, iospec):
        is_matrix_elem = elem_name in ("/X", "/raw/X") or "/layers" in elem_name
        if not is_matrix_elem:
            return func(elem)

        encoding = iospec.encoding_type
        if encoding in ("csr_matrix", "csc_matrix"):
            # Sparse matrices carry their dimensions in the "shape" attribute.
            column_count = elem.attrs.get("shape")[1]
        elif encoding == "array" and len(elem.shape) == 2:
            column_count = elem.shape[1]
        else:
            # Not a 2-D matrix encoding; fall back to the default reader.
            return func(elem)

        return read_elem_as_dask(elem, chunks=(chunk_size, column_count))

    return read_dispatched(f, callback=callback)


def read_h5ad(h5ad_path: Union[str, bytes, os.PathLike], chunk_size: int = 10_000) -> ad.AnnData:
"""
Reads h5ad into adata
:param Union[str, bytes, os.PathLike] h5ad_path: path to h5ad to read
:param int chunk_size: number of rows per chunk when reading matrices as dask arrays

:rtype: ad.AnnData
"""
try:
adata = ad.read_h5ad(h5ad_path, backed="r")
f = h5py.File(h5ad_path)
adata = read_backed(f, chunk_size)

# This code, and AnnData in general, is optimized for row access.
# Running backed, with CSC, is prohibitively slow. Read the entire
# AnnData into memory if it is CSC.
if (get_matrix_format(adata, adata.X) == "csc") or (
(adata.raw is not None) and (get_matrix_format(adata, adata.raw.X) == "csc")
if (get_matrix_format(adata.X) == "csc") or (
(adata.raw is not None) and (get_matrix_format(adata.raw.X) == "csc")
):
logger.warning("Matrices are in CSC format; loading entire dataset into memory.")
adata = adata.to_memory()
Expand Down
Loading
Loading