Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(4.0): Add check for conditions that would cause anndata.write to fail. #643

Merged
merged 17 commits into from
Sep 27, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions cellxgene_schema_cli/cellxgene_schema/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,16 @@
)
@click.option("-i", "--ignore-labels", help="Ignore ontology labels when validating", is_flag=True)
@click.option("-v", "--verbose", help="When present will set logging level to debug", is_flag=True)
def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose):
@click.option(

Check warning on line 35 in cellxgene_schema_cli/cellxgene_schema/cli.py

View check run for this annotation

Codecov / codecov/patch

cellxgene_schema_cli/cellxgene_schema/cli.py#L35

Added line #L35 was not covered by tests
"-w",
"--write-check",
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jahilton I'd like to get your thoughts on this new flag before I merge.

help="When present will run write_check on of the data. This check if the dataset "
Bento007 marked this conversation as resolved.
Show resolved Hide resolved
"can be written back out with added labels. This is a slower, more memory and "
"disk intensive operation. It is not recommended to run this on dataset that will "
Bento007 marked this conversation as resolved.
Show resolved Hide resolved
"not fit in memory. This check will no longer be required with anndata 0.10.0.",
is_flag=True,
)
def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose, write_check):

Check warning on line 44 in cellxgene_schema_cli/cellxgene_schema/cli.py

View check run for this annotation

Codecov / codecov/patch

cellxgene_schema_cli/cellxgene_schema/cli.py#L44

Added line #L44 was not covered by tests
# Imports are very slow so we defer loading until Click arg validation has passed

print("Loading dependencies")
Expand All @@ -44,7 +53,9 @@
print("Loading validator modules")
from .validate import validate

is_valid, _, _ = validate(h5ad_file, add_labels_file, ignore_labels=ignore_labels, verbose=verbose)
is_valid, _, _ = validate(

Check warning on line 56 in cellxgene_schema_cli/cellxgene_schema/cli.py

View check run for this annotation

Codecov / codecov/patch

cellxgene_schema_cli/cellxgene_schema/cli.py#L56

Added line #L56 was not covered by tests
h5ad_file, add_labels_file, ignore_labels=ignore_labels, verbose=verbose, write_check=write_check
)
if is_valid:
sys.exit(0)
else:
Expand Down
27 changes: 24 additions & 3 deletions cellxgene_schema_cli/cellxgene_schema/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1120,13 +1120,16 @@ def _deep_check(self):
"fixing current errors."
)

def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_memory: bool = False) -> bool:
def validate_adata(
self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_memory: bool = False, write_check: bool = False
) -> bool:
"""
Validates adata

:params Union[str, bytes, os.PathLike] h5ad_path: path to h5ad to validate, if None it will try to validate
from self.adata
:param to_memory: indicate if the h5ad should be read into memory.
:param write_check: indicate if write_check should be performed.

:return True if successful validation, False otherwise
:rtype bool
Expand All @@ -1148,6 +1151,9 @@ def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_me
if not self.errors:
self._deep_check()

if write_check:
self.write_check()

# Print warnings if any
if self.warnings:
self.warnings = ["WARNING: " + i for i in self.warnings]
Expand All @@ -1165,12 +1171,25 @@ def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_me

return self.is_valid

def write_check(self):
"""Check that the h5ad can be written back out."""
logger.warning("Checking that the h5ad can be written back out. This may take a while.")
import tempfile

with tempfile.TemporaryDirectory() as tmpdir:
tmp_path = os.path.join(tmpdir, "validated.h5ad")
try:
self.adata.write(tmp_path)
except Exception as e:
self.errors.append(f"Unable to write back to h5ad: {e}")


def validate(
h5ad_path: Union[str, bytes, os.PathLike],
add_labels_file: str = None,
ignore_labels: bool = False,
verbose: bool = False,
write_check: bool = False,
) -> (bool, list, bool):
from .write_labels import AnnDataLabelAppender

Expand All @@ -1194,8 +1213,10 @@ def validate(
validator = Validator(
ignore_labels=ignore_labels,
)
to_memory = add_labels_file is not None
validator.validate_adata(h5ad_path, to_memory=to_memory)
to_memory = (
add_labels_file is not None or write_check
) # if we're adding labels or doing a write_check, then read to memory
validator.validate_adata(h5ad_path, to_memory=to_memory, write_check=write_check)
logger.info(f"Validation complete in {datetime.now() - start} with status is_valid={validator.is_valid}")

# Stop if validation was unsuccessful
Expand Down
35 changes: 35 additions & 0 deletions cellxgene_schema_cli/tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@

import anndata
import numpy as np
import pandas as pd
import pytest
from cellxgene_schema.ontology import OntologyChecker
from cellxgene_schema.schema import get_schema_definition
from cellxgene_schema.utils import read_h5ad
from cellxgene_schema.validate import Validator, validate
from cellxgene_schema.write_labels import AnnDataLabelAppender
from fixtures.examples_validate import (
Expand Down Expand Up @@ -312,6 +314,20 @@ def test__validate_with_h5ad_invalid_and_without_labels(self):
self.assertTrue(errors)
self.assertTrue(is_seurat_convertible)

@mock.patch("cellxgene_schema.validate.read_h5ad", wraps=read_h5ad)
@mock.patch("cellxgene_schema.validate.Validator.write_check")
def test__validate_with_write_check_false(self, mock_write_check, mock_read_h5ad):
validate(h5ad_valid)
mock_write_check.assert_not_called()
mock_read_h5ad.assert_called_with(h5ad_valid, False)

@mock.patch("cellxgene_schema.validate.read_h5ad", wraps=read_h5ad)
@mock.patch("cellxgene_schema.validate.Validator.write_check")
def test__validate_with_write_check_true(self, mock_write_check, mock_read_h5ad):
validate(h5ad_valid, write_check=True)
mock_write_check.assert_called_once()
mock_read_h5ad.assert_called_with(h5ad_valid, True)


class TestSeuratConvertibility(unittest.TestCase):
def validation_helper(self, matrix, raw=None):
Expand Down Expand Up @@ -366,6 +382,25 @@ def test_determine_seurat_convertibility(self):
self.assertFalse(self.validator.is_valid)


class TestValidatorWriteCheck:
def test_fail(self):
validator = Validator()
X = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]])
obs = pd.DataFrame([["red", 1, 0.22222], ["blue", 0, np.nan], ["orange", 1, 0.1]])
adata = anndata.AnnData(X=X, obs=obs)
validator.adata = adata

validator.write_check()
assert "Unable to write back to h5ad" in validator.errors[0]

def test_success(self):
validator = Validator()
validator.adata = adata_minimal.copy()

validator.write_check()
assert len(validator.errors) == 0


class TestIsRaw:
@staticmethod
def create_validator(data: Union[ndarray, spmatrix], format: str) -> Validator:
Expand Down
Loading