Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(4.0): Add check for conditions that would cause anndata.write to fail. #643

Merged
merged 17 commits into from
Sep 27, 2023
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions cellxgene_schema_cli/cellxgene_schema/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,15 @@
)
@click.option("-i", "--ignore-labels", help="Ignore ontology labels when validating", is_flag=True)
@click.option("-v", "--verbose", help="When present will set logging level to debug", is_flag=True)
def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose):
@click.option(

Check warning on line 35 in cellxgene_schema_cli/cellxgene_schema/cli.py

View check run for this annotation

Codecov / codecov/patch

cellxgene_schema_cli/cellxgene_schema/cli.py#L35

Added line #L35 was not covered by tests
"-w",
"--write-check",
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jahilton I'd like to get your thoughts on this new flag before I merge.

help="This checks if the dataset can be written with added labels. This is a slower operation which requires more "
"memory and disk space. It is not recommended to run this on dataset that will not fit in memory. This check "
"will no longer be required with anndata 0.10.0.",
is_flag=True,
)
def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose, write_check):

Check warning on line 43 in cellxgene_schema_cli/cellxgene_schema/cli.py

View check run for this annotation

Codecov / codecov/patch

cellxgene_schema_cli/cellxgene_schema/cli.py#L43

Added line #L43 was not covered by tests
# Imports are very slow so we defer loading until Click arg validation has passed

print("Loading dependencies")
Expand All @@ -44,7 +52,9 @@
print("Loading validator modules")
from .validate import validate

is_valid, _, _ = validate(h5ad_file, add_labels_file, ignore_labels=ignore_labels, verbose=verbose)
is_valid, _, _ = validate(

Check warning on line 55 in cellxgene_schema_cli/cellxgene_schema/cli.py

View check run for this annotation

Codecov / codecov/patch

cellxgene_schema_cli/cellxgene_schema/cli.py#L55

Added line #L55 was not covered by tests
h5ad_file, add_labels_file, ignore_labels=ignore_labels, verbose=verbose, write_check=write_check
)
if is_valid:
sys.exit(0)
else:
Expand Down
63 changes: 48 additions & 15 deletions cellxgene_schema_cli/cellxgene_schema/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,18 +644,30 @@ def _validate_dataframe(self, df_name: str):
f"Features SHOULD NOT be filtered from expression matrix."
)

# Check for columns that have a category defined 0 times (obs only)
if df_name == "obs":
for column in df.columns:
col = df[column]
if col.dtype != "category":
continue
for category in col.dtype.categories:
if category not in col.values:
self.warnings.append(
f"Column '{column}' in dataframe '{df_name}' contains a category '{category}' with "
f"zero observations. These categories will be removed when `--add-labels` flag is present."
)
for column_name in df.columns:
column = df[column_name]
if column.dtype != "category":
nayib-jose-gloria marked this conversation as resolved.
Show resolved Hide resolved
# Check for columns with mixed values.
value_types = {type(x) for x in column.values}
if len(value_types) != 1:
self.errors.append(
f"Column '{column_name}' in dataframe '{df_name}' cannot contained mixed types. Found {value_types}."
Bento007 marked this conversation as resolved.
Show resolved Hide resolved
)
else:
# Check for columns that have a category defined 0 times (obs only)
if df_name == "obs":
for category in column.dtype.categories:
if category not in column.values:
self.warnings.append(
f"Column '{column_name}' in dataframe '{df_name}' contains a category '{category}' with "
f"zero observations. These categories will be removed when `--add-labels` flag is present."
)
# Check for columns that have none string catagories
catagory_types = {type(x) for x in column.dtype.categories.values}
if len(catagory_types) > 1 or str not in catagory_types:
self.errors.append(
f"Column '{column_name}' in dataframe '{df_name}' must only contain string catagories. Found {catagory_types}."
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit catagories -> categories

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same w/ catagory_types var name

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also, I think it is true that we only support str categories in our current schema definition. But are we 100% certain that will always be true? If this is an anndata constraint, mind linking me to a doc or comment specifying this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not documented in anndata because it is a bug. Anndata should support not string values as categories and this is fixed in anndata 0.10.0.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gotcha. Mind adding a comment to this effect--maybe a TODO so we remember to remove this check for anndata 0.10.0 and/or if we get a non-string categorical field in schema 5 we remember to highlight this anndata 0.8.0 bug to product?

)

# Validate columns
if "columns" in df_definition:
Expand Down Expand Up @@ -1302,13 +1314,16 @@ def _deep_check(self):
"fixing current errors."
)

def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_memory: bool = False) -> bool:
def validate_adata(
self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_memory: bool = False, write_check: bool = False
) -> bool:
"""
Validates adata

:params Union[str, bytes, os.PathLike] h5ad_path: path to h5ad to validate, if None it will try to validate
from self.adata
:param to_memory: indicate if the h5ad should be read into memory.
:param write_check: indicate if write_check should be performed.

:return True if successful validation, False otherwise
:rtype bool
Expand All @@ -1330,6 +1345,9 @@ def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_me
if not self.errors:
self._deep_check()

if write_check:
self.write_check()

# Print warnings if any
if self.warnings:
self.warnings = ["WARNING: " + i for i in self.warnings]
Expand All @@ -1347,12 +1365,25 @@ def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_me

return self.is_valid

def write_check(self):
"""Check that the h5ad can be written back out."""
logger.warning("Checking that the h5ad can be written back out. This may take a while.")
import tempfile

with tempfile.TemporaryDirectory() as tmpdir:
tmp_path = os.path.join(tmpdir, "validated.h5ad")
try:
self.adata.write(tmp_path)
except Exception as e:
self.errors.append(f"Unable to write back to h5ad: {e}")


def validate(
h5ad_path: Union[str, bytes, os.PathLike],
add_labels_file: str = None,
ignore_labels: bool = False,
verbose: bool = False,
write_check: bool = False,
) -> (bool, list, bool):
from .write_labels import AnnDataLabelAppender

Expand All @@ -1376,8 +1407,10 @@ def validate(
validator = Validator(
ignore_labels=ignore_labels,
)
to_memory = add_labels_file is not None
validator.validate_adata(h5ad_path, to_memory=to_memory)
to_memory = (
add_labels_file is not None or write_check
) # if we're adding labels or doing a write_check, then read to memory
validator.validate_adata(h5ad_path, to_memory=to_memory, write_check=write_check)
logger.info(f"Validation complete in {datetime.now() - start} with status is_valid={validator.is_valid}")

# Stop if validation was unsuccessful
Expand Down
4 changes: 2 additions & 2 deletions cellxgene_schema_cli/tests/test_schema_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -911,9 +911,9 @@ def test_nan_values_must_be_rejected(self):
"""
self.validator.adata.obs.loc[self.validator.adata.obs.index[0], "tissue_ontology_term_id"] = numpy.nan
self.validator.validate_adata()
self.assertEqual(
self.assertIn(
"ERROR: Column 'tissue_ontology_term_id' in dataframe 'obs' must not contain NaN values.",
self.validator.errors,
["ERROR: Column 'tissue_ontology_term_id' in dataframe 'obs' must not contain NaN values."],
)


Expand Down
66 changes: 63 additions & 3 deletions cellxgene_schema_cli/tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,17 @@

import anndata
import numpy as np
import pandas as pd
import pytest
from cellxgene_schema.ontology import OntologyChecker
from cellxgene_schema.schema import get_schema_definition
from cellxgene_schema.utils import read_h5ad
from cellxgene_schema.validate import Validator, validate
from cellxgene_schema.write_labels import AnnDataLabelAppender
from fixtures.examples_validate import (
adata,
adata as adata_valid,
)
from fixtures.examples_validate import (
adata_minimal,
adata_with_labels,
good_obs,
Expand Down Expand Up @@ -97,7 +101,7 @@ def test_validate_ontology_wrong_term(self):
class TestAddLabelFunctions(unittest.TestCase):
def setUp(self):
# Set up test data
self.test_adata = adata.copy()
self.test_adata = adata_valid.copy()
self.test_adata_with_labels = adata_with_labels
self.schema_def = get_schema_definition()

Expand Down Expand Up @@ -235,7 +239,7 @@ def test__write__Fail(self):
class TestIgnoreLabelFunctions(unittest.TestCase):
def setUp(self):
# Set up test data
self.test_adata = adata
self.test_adata = adata_valid.copy()
self.test_adata_with_labels = adata_with_labels

def test_validating_labeled_h5ad_should_fail_if_no_flag_set(self):
Expand Down Expand Up @@ -312,6 +316,20 @@ def test__validate_with_h5ad_invalid_and_without_labels(self):
self.assertTrue(errors)
self.assertTrue(is_seurat_convertible)

@mock.patch("cellxgene_schema.validate.read_h5ad", wraps=read_h5ad)
@mock.patch("cellxgene_schema.validate.Validator.write_check")
def test__validate_with_write_check_false(self, mock_write_check, mock_read_h5ad):
validate(h5ad_valid)
mock_write_check.assert_not_called()
mock_read_h5ad.assert_called_with(h5ad_valid, False)

@mock.patch("cellxgene_schema.validate.read_h5ad", wraps=read_h5ad)
@mock.patch("cellxgene_schema.validate.Validator.write_check")
def test__validate_with_write_check_true(self, mock_write_check, mock_read_h5ad):
validate(h5ad_valid, write_check=True)
mock_write_check.assert_called_once()
mock_read_h5ad.assert_called_with(h5ad_valid, True)


class TestSeuratConvertibility(unittest.TestCase):
def validation_helper(self, matrix, raw=None):
Expand Down Expand Up @@ -366,6 +384,48 @@ def test_determine_seurat_convertibility(self):
self.assertFalse(self.validator.is_valid)


class TestValidatorValidateDataFrame:
def test_fail_catagory_not_string(self):
validator = Validator()
validator._set_schema_def()
adata = adata_valid.copy()
t = pd.CategoricalDtype(categories=[True, False])
adata.obs["not_string"] = pd.Series(data=[True, False], index=["X", "Y"], dtype=t)
validator.adata = adata

validator._validate_dataframe("obs")
assert "must only contain string catagories." in validator.errors[0]

def test_fail_mixed_column_types(self):
validator = Validator()
validator._set_schema_def()
adata = adata_valid.copy()
adata.obs["mixed"] = pd.Series(data=["1234", 0], index=["X", "Y"])
validator.adata = adata

validator._validate_dataframe("obs")
assert "cannot contained mixed types." in validator.errors[0]


class TestValidatorWriteCheck:
def test_fail(self):
validator = Validator()
X = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]])
obs = pd.DataFrame([["red", 1, 0.22222], ["blue", 0, np.nan], ["orange", 1, 0.1]])
adata = anndata.AnnData(X=X, obs=obs)
validator.adata = adata

validator.write_check()
assert "Unable to write back to h5ad" in validator.errors[0]

def test_success(self):
validator = Validator()
validator.adata = adata_minimal.copy()

validator.write_check()
assert len(validator.errors) == 0


class TestIsRaw:
@staticmethod
def create_validator(data: Union[ndarray, spmatrix], format: str) -> Validator:
Expand Down
Loading