chanzuckerberg · Bento007 · Sep 27, 2023 · Sep 22, 2023 · Sep 22, 2023 · Sep 22, 2023
diff --git a/cellxgene_schema_cli/cellxgene_schema/cli.py b/cellxgene_schema_cli/cellxgene_schema/cli.py
@@ -32,7 +32,15 @@
 )
 @click.option("-i", "--ignore-labels", help="Ignore ontology labels when validating", is_flag=True)
 @click.option("-v", "--verbose", help="When present will set logging level to debug", is_flag=True)
-def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose):
+@click.option(
+    "-w",
+    "--write-check",
+    help="This checks if the dataset can be written with added labels. This is a slower operation which requires more "
+    "memory and disk space. It is not recommended to run this on dataset that will not fit in memory. This check "
+    "will no longer be required with anndata 0.10.0.",
+    is_flag=True,
+)
+def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose, write_check):
     # Imports are very slow so we defer loading until Click arg validation has passed
 
     print("Loading dependencies")
@@ -44,7 +52,9 @@
     print("Loading validator modules")
     from .validate import validate
 
-    is_valid, _, _ = validate(h5ad_file, add_labels_file, ignore_labels=ignore_labels, verbose=verbose)
+    is_valid, _, _ = validate(
+        h5ad_file, add_labels_file, ignore_labels=ignore_labels, verbose=verbose, write_check=write_check
+    )
     if is_valid:
         sys.exit(0)
     else:

diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py
@@ -644,18 +644,30 @@ def _validate_dataframe(self, df_name: str):
                     f"Features SHOULD NOT be filtered from expression matrix."
                 )
 
-        # Check for columns that have a category defined 0 times (obs only)
-        if df_name == "obs":
-            for column in df.columns:
-                col = df[column]
-                if col.dtype != "category":
-                    continue
-                for category in col.dtype.categories:
-                    if category not in col.values:
-                        self.warnings.append(
-                            f"Column '{column}' in dataframe '{df_name}' contains a category '{category}' with "
-                            f"zero observations. These categories will be removed when `--add-labels` flag is present."
-                        )
+        for column_name in df.columns:
+            column = df[column_name]
+            if column.dtype != "category":
+                # Check for columns with mixed values.
+                value_types = {type(x) for x in column.values}
+                if len(value_types) != 1:
+                    self.errors.append(
+                        f"Column '{column_name}' in dataframe '{df_name}' cannot contained mixed types. Found {value_types}."
+                    )
+            else:
+                # Check for columns that have a category defined 0 times (obs only)
+                if df_name == "obs":
+                    for category in column.dtype.categories:
+                        if category not in column.values:
+                            self.warnings.append(
+                                f"Column '{column_name}' in dataframe '{df_name}' contains a category '{category}' with "
+                                f"zero observations. These categories will be removed when `--add-labels` flag is present."
+                            )
+                # Check for columns that have none string catagories
+                catagory_types = {type(x) for x in column.dtype.categories.values}
+                if len(catagory_types) > 1 or str not in catagory_types:
+                    self.errors.append(
+                        f"Column '{column_name}' in dataframe '{df_name}' must only contain string catagories. Found {catagory_types}."
+                    )
 
         # Validate columns
         if "columns" in df_definition:
@@ -1302,13 +1314,16 @@ def _deep_check(self):
                 "fixing current errors."
             )
 
-    def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_memory: bool = False) -> bool:
+    def validate_adata(
+        self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_memory: bool = False, write_check: bool = False
+    ) -> bool:
         """
         Validates adata
 
         :params Union[str, bytes, os.PathLike] h5ad_path: path to h5ad to validate, if None it will try to validate
         from self.adata
         :param to_memory: indicate if the h5ad should be read into memory.
+        :param write_check: indicate if write_check should be performed.
 
         :return True if successful validation, False otherwise
         :rtype bool
@@ -1330,6 +1345,9 @@ def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_me
         if not self.errors:
             self._deep_check()
 
+        if write_check:
+            self.write_check()
+
         # Print warnings if any
         if self.warnings:
             self.warnings = ["WARNING: " + i for i in self.warnings]
@@ -1347,12 +1365,25 @@ def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_me
 
         return self.is_valid
 
+    def write_check(self):
+        """Check that the h5ad can be written back out."""
+        logger.warning("Checking that the h5ad can be written back out. This may take a while.")
+        import tempfile
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tmp_path = os.path.join(tmpdir, "validated.h5ad")
+            try:
+                self.adata.write(tmp_path)
+            except Exception as e:
+                self.errors.append(f"Unable to write back to h5ad: {e}")
+
 
 def validate(
     h5ad_path: Union[str, bytes, os.PathLike],
     add_labels_file: str = None,
     ignore_labels: bool = False,
     verbose: bool = False,
+    write_check: bool = False,
 ) -> (bool, list, bool):
     from .write_labels import AnnDataLabelAppender
 
@@ -1376,8 +1407,10 @@ def validate(
     validator = Validator(
         ignore_labels=ignore_labels,
     )
-    to_memory = add_labels_file is not None
-    validator.validate_adata(h5ad_path, to_memory=to_memory)
+    to_memory = (
+        add_labels_file is not None or write_check
+    )  # if we're adding labels or doing a write_check, then read to memory
+    validator.validate_adata(h5ad_path, to_memory=to_memory, write_check=write_check)
     logger.info(f"Validation complete in {datetime.now() - start} with status is_valid={validator.is_valid}")
 
     # Stop if validation was unsuccessful

diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py
@@ -911,9 +911,9 @@ def test_nan_values_must_be_rejected(self):
         """
         self.validator.adata.obs.loc[self.validator.adata.obs.index[0], "tissue_ontology_term_id"] = numpy.nan
         self.validator.validate_adata()
-        self.assertEqual(
+        self.assertIn(
+            "ERROR: Column 'tissue_ontology_term_id' in dataframe 'obs' must not contain NaN values.",
             self.validator.errors,
-            ["ERROR: Column 'tissue_ontology_term_id' in dataframe 'obs' must not contain NaN values."],
         )
 
 

diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py
@@ -8,13 +8,17 @@
 
 import anndata
 import numpy as np
+import pandas as pd
 import pytest
 from cellxgene_schema.ontology import OntologyChecker
 from cellxgene_schema.schema import get_schema_definition
+from cellxgene_schema.utils import read_h5ad
 from cellxgene_schema.validate import Validator, validate
 from cellxgene_schema.write_labels import AnnDataLabelAppender
 from fixtures.examples_validate import (
-    adata,
+    adata as adata_valid,
+)
+from fixtures.examples_validate import (
     adata_minimal,
     adata_with_labels,
     good_obs,
@@ -97,7 +101,7 @@ def test_validate_ontology_wrong_term(self):
 class TestAddLabelFunctions(unittest.TestCase):
     def setUp(self):
         # Set up test data
-        self.test_adata = adata.copy()
+        self.test_adata = adata_valid.copy()
         self.test_adata_with_labels = adata_with_labels
         self.schema_def = get_schema_definition()
 
@@ -235,7 +239,7 @@ def test__write__Fail(self):
 class TestIgnoreLabelFunctions(unittest.TestCase):
     def setUp(self):
         # Set up test data
-        self.test_adata = adata
+        self.test_adata = adata_valid.copy()
         self.test_adata_with_labels = adata_with_labels
 
     def test_validating_labeled_h5ad_should_fail_if_no_flag_set(self):
@@ -312,6 +316,20 @@ def test__validate_with_h5ad_invalid_and_without_labels(self):
         self.assertTrue(errors)
         self.assertTrue(is_seurat_convertible)
 
+    @mock.patch("cellxgene_schema.validate.read_h5ad", wraps=read_h5ad)
+    @mock.patch("cellxgene_schema.validate.Validator.write_check")
+    def test__validate_with_write_check_false(self, mock_write_check, mock_read_h5ad):
+        validate(h5ad_valid)
+        mock_write_check.assert_not_called()
+        mock_read_h5ad.assert_called_with(h5ad_valid, False)
+
+    @mock.patch("cellxgene_schema.validate.read_h5ad", wraps=read_h5ad)
+    @mock.patch("cellxgene_schema.validate.Validator.write_check")
+    def test__validate_with_write_check_true(self, mock_write_check, mock_read_h5ad):
+        validate(h5ad_valid, write_check=True)
+        mock_write_check.assert_called_once()
+        mock_read_h5ad.assert_called_with(h5ad_valid, True)
+
 
 class TestSeuratConvertibility(unittest.TestCase):
     def validation_helper(self, matrix, raw=None):
@@ -366,6 +384,48 @@ def test_determine_seurat_convertibility(self):
         self.assertFalse(self.validator.is_valid)
 
 
+class TestValidatorValidateDataFrame:
+    def test_fail_catagory_not_string(self):
+        validator = Validator()
+        validator._set_schema_def()
+        adata = adata_valid.copy()
+        t = pd.CategoricalDtype(categories=[True, False])
+        adata.obs["not_string"] = pd.Series(data=[True, False], index=["X", "Y"], dtype=t)
+        validator.adata = adata
+
+        validator._validate_dataframe("obs")
+        assert "must only contain string catagories." in validator.errors[0]
+
+    def test_fail_mixed_column_types(self):
+        validator = Validator()
+        validator._set_schema_def()
+        adata = adata_valid.copy()
+        adata.obs["mixed"] = pd.Series(data=["1234", 0], index=["X", "Y"])
+        validator.adata = adata
+
+        validator._validate_dataframe("obs")
+        assert "cannot contained mixed types." in validator.errors[0]
+
+
+class TestValidatorWriteCheck:
+    def test_fail(self):
+        validator = Validator()
+        X = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]])
+        obs = pd.DataFrame([["red", 1, 0.22222], ["blue", 0, np.nan], ["orange", 1, 0.1]])
+        adata = anndata.AnnData(X=X, obs=obs)
+        validator.adata = adata
+
+        validator.write_check()
+        assert "Unable to write back to h5ad" in validator.errors[0]
+
+    def test_success(self):
+        validator = Validator()
+        validator.adata = adata_minimal.copy()
+
+        validator.write_check()
+        assert len(validator.errors) == 0
+
+
 class TestIsRaw:
     @staticmethod
     def create_validator(data: Union[ndarray, spmatrix], format: str) -> Validator: