diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index ba07b73b4..388cb6974 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -822,6 +822,9 @@ def _validate_embedding_dict(self): obsm_with_x_prefix = 0 for key, value in self.adata.obsm.items(): + if " " in key: + self.errors.append(f"Embedding key {key} has whitespace in it, please remove it.") + if not isinstance(value, np.ndarray): self.errors.append( f"All embeddings have to be of 'numpy.ndarray' type, " f"'adata.obsm['{key}']' is {type(value)}')." @@ -832,20 +835,28 @@ def _validate_embedding_dict(self): if key.startswith("X_"): obsm_with_x_prefix += 1 - if not (np.issubdtype(value.dtype, np.integer) or np.issubdtype(value.dtype, np.floating)): + if len(key) <= 3: self.errors.append( - f"adata.obsm['{key}'] has an invalid data type. It should be " - "float, integer, or unsigned integer of any precision (8, 16, 32, or 64 bits)." + f"Embedding key in 'adata.obsm' {key} must have a suffix at least one character long." ) - if np.isinf(value).any() or np.isnan(value).any(): - self.errors.append(f"adata.obsm['{key}'] contains positive infinity or negative infinity values.") - if np.isnan(value).any(): - self.errors.append(f"adata.obsm['{key}'] contains NaN values.") if len(value.shape) < 2 or value.shape[0] != self.adata.n_obs or value.shape[1] < 2: self.errors.append( f"All embeddings must have as many rows as cells, and at least two columns." f"'adata.obsm['{key}']' has shape of '{value.shape}'." ) + if not (np.issubdtype(value.dtype, np.integer) or np.issubdtype(value.dtype, np.floating)): + self.errors.append( + f"adata.obsm['{key}'] has an invalid data type. It should be " + "float, integer, or unsigned integer of any precision (8, 16, 32, or 64 bits)." 
+ ) + else: + # Check for inf/NaN values only if the dtype is numeric + if np.isinf(value).any(): + self.errors.append( + f"adata.obsm['{key}'] contains positive infinity or negative infinity values." + ) + if np.all(np.isnan(value)): + self.errors.append(f"adata.obsm['{key}'] contains all NaN values.") if obsm_with_x_prefix == 0: self.errors.append("At least one embedding in 'obsm' has to have a key with an 'X_' prefix.") diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py index 8e4bb1fc1..ecb88a40d 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -1258,6 +1258,51 @@ def test_obsm_values_ara_numpy(self): ], ) + def test_obsm_values_infinity(self): + """ + values in obsm cannot have any infinity values + """ + self.validator.adata.obsm["X_umap"][0:100, 1] = numpy.inf + self.validator.validate_adata() + self.assertEqual( + self.validator.errors, + ["ERROR: adata.obsm['X_umap'] contains positive infinity or negative infinity values."], + ) + + def test_obsm_values_str(self): + """ + values in obsm must be numerical types, strings are not valid + """ + all_string = numpy.full(self.validator.adata.obsm["X_umap"].shape, "test") + self.validator.adata.obsm["X_umap"] = all_string + self.validator.validate_adata() + self.assertEqual( + self.validator.errors, + [ + "ERROR: adata.obsm['X_umap'] has an invalid data type. It should be float, integer, or unsigned " + "integer of any precision (8, 16, 32, or 64 bits)." 
+ ], + ) + + def test_obsm_values_nan(self): + """ + values in obsm cannot all be NaN + """ + + # It's okay if only some values are NaN + self.validator.adata.obsm["X_umap"][0:100, 1] = numpy.nan + self.validator.validate_adata() + self.assertEqual(self.validator.errors, []) + + # It's not okay if all values are NaN + all_nan = numpy.full(self.validator.adata.obsm["X_umap"].shape, numpy.nan) + self.validator.adata.obsm["X_umap"] = all_nan + self.validator.validate_adata() + self.assertEqual( + self.validator.errors, + ["ERROR: adata.obsm['X_umap'] contains all NaN values."], + ) + def test_obsm_values_at_least_one_X(self): """ At least one key for the embedding MUST be prefixed with "X_" @@ -1272,7 +1317,29 @@ def test_obsm_values_at_least_one_X(self): ["ERROR: At least one embedding in 'obsm' has to have a " "key with an 'X_' prefix."], ) - def test_obsm_shape(self): + def test_obsm_suffix_name_valid(self): + """ + Suffix after X_ must be at least 1 character long + """ + self.validator.adata.obsm["X_"] = self.validator.adata.obsm["X_umap"] + self.validator.validate_adata() + self.assertEqual( + self.validator.errors, + ["ERROR: Embedding key in 'adata.obsm' X_ must have a suffix at least one character long."], + ) + + def test_obsm_key_name_valid(self): + """ + Embedding keys with whitespace are not valid + """ + self.validator.adata.obsm["X_ umap"] = self.validator.adata.obsm["X_umap"] + self.validator.validate_adata() + self.assertEqual( + self.validator.errors, + ["ERROR: Embedding key X_ umap has whitespace in it, please remove it."], + ) + + def test_obsm_shape_one_column(self): """ Curators MUST annotate one or more two-dimensional (m >= 2) embeddings """ @@ -1289,6 +1356,19 @@ def test_obsm_shape(self): ], ) + def test_obsm_shape_same_rows_and_columns(self): + """ + The number of rows must be equal to the number of cells + """ + # Create a 3 row array + arr1 = numpy.array([0, 0]) + arr2 = numpy.array([0, 0]) + arr3 = numpy.array([0, 0]) + three_row_array = 
numpy.vstack((arr1, arr2, arr3)) + with self.assertRaises(ValueError): + self.validator.adata.obsm["X_umap"] = three_row_array + self.validator.validate_adata() + class TestAddingLabels(unittest.TestCase): """