Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: fix X_{suffix} validation and add test coverage #635

Merged
merged 5 commits into from
Sep 20, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 18 additions & 7 deletions cellxgene_schema_cli/cellxgene_schema/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -822,6 +822,9 @@ def _validate_embedding_dict(self):

obsm_with_x_prefix = 0
for key, value in self.adata.obsm.items():
if " " in key:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think any key that contains whitespace should be invalid, regardless of whether it's an X_{suffix} key.

self.errors.append(f"Embedding key {key} has whitespace in it, please remove it.")

if not isinstance(value, np.ndarray):
self.errors.append(
f"All embeddings have to be of 'numpy.ndarray' type, " f"'adata.obsm['{key}']' is {type(value)}')."
Expand All @@ -832,20 +835,28 @@ def _validate_embedding_dict(self):
if key.startswith("X_"):
obsm_with_x_prefix += 1

if not (np.issubdtype(value.dtype, np.integer) or np.issubdtype(value.dtype, np.floating)):
if len(key) <= 3:
self.errors.append(
f"adata.obsm['{key}'] has an invalid data type. It should be "
"float, integer, or unsigned integer of any precision (8, 16, 32, or 64 bits)."
f"Embedding key in 'adata.obsm' {key} must have a suffix at least one character long."
)
if np.isinf(value).any() or np.isnan(value).any():
self.errors.append(f"adata.obsm['{key}'] contains positive infinity or negative infinity values.")
if np.isnan(value).any():
self.errors.append(f"adata.obsm['{key}'] contains NaN values.")
if len(value.shape) < 2 or value.shape[0] != self.adata.n_obs or value.shape[1] < 2:
self.errors.append(
f"All embeddings must have as many rows as cells, and at least two columns."
f"'adata.obsm['{key}']' has shape of '{value.shape}'."
)
if not (np.issubdtype(value.dtype, np.integer) or np.issubdtype(value.dtype, np.floating)):
self.errors.append(
f"adata.obsm['{key}'] has an invalid data type. It should be "
"float, integer, or unsigned integer of any precision (8, 16, 32, or 64 bits)."
)
else:
# Check for inf/NaN values only if the dtype is numeric
if np.isinf(value).any():
self.errors.append(
f"adata.obsm['{key}'] contains positive infinity or negative infinity values."
)
if np.all(np.isnan(value)):
self.errors.append(f"adata.obsm['{key}'] contains all NaN values.")

if obsm_with_x_prefix == 0:
self.errors.append("At least one embedding in 'obsm' has to have a key with an 'X_' prefix.")
Expand Down
65 changes: 65 additions & 0 deletions cellxgene_schema_cli/tests/test_schema_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -1258,6 +1258,49 @@ def test_obsm_values_ara_numpy(self):
],
)

def test_obsm_values_infinity(self):
"""
values in obsm cannot have any infinity values
"""
self.validator.adata.obsm["X_umap"][0:100, 1] = numpy.inf
self.validator.validate_adata()
self.assertEqual(
self.validator.errors,
["ERROR: adata.obsm['X_umap'] contains positive infinity or negative infinity values."],
)

def test_obsm_values_str(self):
"""
values in obsm must be numerical types, strings are not valid
"""
all_string = numpy.full(self.validator.adata.obsm["X_umap"].shape, "test")
self.validator.adata.obsm["X_umap"] = all_string
self.validator.validate_adata()
self.assertEqual(
self.validator.errors,
["ERROR: adata.obsm['X_umap'] has an invalid data type. It should be float, integer, or unsigned "
"integer of any precision (8, 16, 32, or 64 bits)."],
)

def test_obsm_values_nan(self):
"""
values in obsm cannot all be NaN
"""

# It's okay if only one value is NaN
self.validator.adata.obsm["X_umap"][0:100, 1] = numpy.nan
self.validator.validate_adata()
self.assertEqual(self.validator.errors, [])

# It's not okay if all values are NaN
all_nan = numpy.full(self.validator.adata.obsm["X_umap"].shape, numpy.nan)
self.validator.adata.obsm["X_umap"] = all_nan
self.validator.validate_adata()
self.assertEqual(
self.validator.errors,
["ERROR: adata.obsm['X_umap'] contains all NaN values."],
)

def test_obsm_values_at_least_one_X(self):
"""
At least one key for the embedding MUST be prefixed with "X_"
Expand All @@ -1272,6 +1315,28 @@ def test_obsm_values_at_least_one_X(self):
["ERROR: At least one embedding in 'obsm' has to have a " "key with an 'X_' prefix."],
)

def test_obsm_suffix_name_valid(self):
"""
Suffix after X_ must be at least 1 character long
"""
self.validator.adata.obsm["X_"] = self.validator.adata.obsm["X_umap"]
self.validator.validate_adata()
self.assertEqual(
self.validator.errors,
["ERROR: Embedding key in 'adata.obsm' X_ must have a suffix at least one character long."],
)

def test_obsm_key_name_valid(self):
"""
Embedding keys with whitespace are not valid
"""
self.validator.adata.obsm["X_ umap"] = self.validator.adata.obsm["X_umap"]
self.validator.validate_adata()
self.assertEqual(
self.validator.errors,
["ERROR: Embedding key X_ umap has whitespace in it, please remove it."],
)

def test_obsm_shape(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a test for the other condition in this `if` statement, i.e. that the embedding must have the same number of rows as X?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I updated the test for the case where the number of rows is not the same as the number of cells. It actually seems to fail when setting the key, not when running the validation, but the test should reflect that.

"""
Curators MUST annotate one or more two-dimensional (m >= 2) embeddings
Expand Down
Loading