From 20162bddde0d08278d4ef94a06fc0a42b5531e39 Mon Sep 17 00:00:00 2001
From: Trent Smith <trent.smith@chanzuckerberg.com>
Date: Fri, 22 Sep 2023 14:43:21 -0700
Subject: [PATCH 01/14] feat(4.0): Add "write-check" flag to check if labels
 can be added

---
 cellxgene_schema_cli/cellxgene_schema/cli.py  | 14 ++++++--
 .../cellxgene_schema/validate.py              | 27 ++++++++++++--
 cellxgene_schema_cli/tests/test_validate.py   | 35 +++++++++++++++++++
 3 files changed, 71 insertions(+), 5 deletions(-)

diff --git a/cellxgene_schema_cli/cellxgene_schema/cli.py b/cellxgene_schema_cli/cellxgene_schema/cli.py
index 3d917cf6e..5f0cea1f2 100644
--- a/cellxgene_schema_cli/cellxgene_schema/cli.py
+++ b/cellxgene_schema_cli/cellxgene_schema/cli.py
@@ -32,7 +32,15 @@ def schema_cli():
 )
 @click.option("-i", "--ignore-labels", help="Ignore ontology labels when validating", is_flag=True)
 @click.option("-v", "--verbose", help="When present will set logging level to debug", is_flag=True)
-def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose):
+@click.option(
+    "--write-check",
+    help="When present will run write_check on of the data. This check if the dataset "
+    "can be written back out with added labels. This is a slower, more memory and "
+    "disk intensive operation. It is not recommended to run this on dataset that will "
+    "not fit in memory.",
+    is_flag=True,
+)
+def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose, write_check):
     # Imports are very slow so we defer loading until Click arg validation has passed
 
     print("Loading dependencies")
@@ -44,7 +52,9 @@ def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose):
     print("Loading validator modules")
     from .validate import validate
 
-    is_valid, _, _ = validate(h5ad_file, add_labels_file, ignore_labels=ignore_labels, verbose=verbose)
+    is_valid, _, _ = validate(
+        h5ad_file, add_labels_file, ignore_labels=ignore_labels, verbose=verbose, write_check=write_check
+    )
     if is_valid:
         sys.exit(0)
     else:
diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py
index 21b369a63..b7564bbcd 100644
--- a/cellxgene_schema_cli/cellxgene_schema/validate.py
+++ b/cellxgene_schema_cli/cellxgene_schema/validate.py
@@ -1120,13 +1120,16 @@ def _deep_check(self):
                 "fixing current errors."
             )
 
-    def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_memory: bool = False) -> bool:
+    def validate_adata(
+        self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_memory: bool = False, write_check: bool = False
+    ) -> bool:
         """
         Validates adata
 
         :params Union[str, bytes, os.PathLike] h5ad_path: path to h5ad to validate, if None it will try to validate
         from self.adata
         :param to_memory: indicate if the h5ad should be read into memory.
+        :param write_check: indicate if write_check should be performed.
 
         :return True if successful validation, False otherwise
         :rtype bool
@@ -1148,6 +1151,9 @@ def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_me
         if not self.errors:
             self._deep_check()
 
+        if write_check:
+            self.write_check()
+
         # Print warnings if any
         if self.warnings:
             self.warnings = ["WARNING: " + i for i in self.warnings]
@@ -1165,12 +1171,25 @@ def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_me
 
         return self.is_valid
 
+    def write_check(self):
+        """Check that the h5ad can be written back out."""
+        logger.warning("Checking that the h5ad can be written back out. This may take a while.")
+        import tempfile
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tmp_path = os.path.join(tmpdir, "validated.h5ad")
+            try:
+                self.adata.write(tmp_path)
+            except Exception as e:
+                self.errors.append(f"Unable to write back to h5ad: {e}")
+
 
 def validate(
     h5ad_path: Union[str, bytes, os.PathLike],
     add_labels_file: str = None,
     ignore_labels: bool = False,
     verbose: bool = False,
+    write_check: bool = False,
 ) -> (bool, list, bool):
     from .write_labels import AnnDataLabelAppender
 
@@ -1194,8 +1213,10 @@ def validate(
     validator = Validator(
         ignore_labels=ignore_labels,
     )
-    to_memory = add_labels_file is not None
-    validator.validate_adata(h5ad_path, to_memory=to_memory)
+    to_memory = (
+        add_labels_file is not None or write_check
+    )  # if we're adding labels or doing a full validation, read to memory
+    validator.validate_adata(h5ad_path, to_memory=to_memory, write_check=write_check)
     logger.info(f"Validation complete in {datetime.now() - start} with status is_valid={validator.is_valid}")
 
     # Stop if validation was unsuccessful
diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py
index 2f918f7d8..a2653a881 100644
--- a/cellxgene_schema_cli/tests/test_validate.py
+++ b/cellxgene_schema_cli/tests/test_validate.py
@@ -8,6 +8,7 @@
 
 import anndata
 import numpy as np
+import pandas as pd
 import pytest
 from cellxgene_schema.ontology import OntologyChecker
 from cellxgene_schema.schema import get_schema_definition
@@ -27,6 +28,7 @@
 from numpy import ndarray
 from scipy import sparse
 from scipy.sparse import spmatrix
+from utils import read_h5ad
 
 # Tests for internal functions of the Validator and LabelWriter classes.
 
@@ -312,6 +314,20 @@ def test__validate_with_h5ad_invalid_and_without_labels(self):
         self.assertTrue(errors)
         self.assertTrue(is_seurat_convertible)
 
+    @mock.patch("cellxgene_schema.validate.read_h5ad", wraps=read_h5ad)
+    @mock.patch("cellxgene_schema.validate.Validator.write_check")
+    def test__validate_with_write_check_false(self, mock_write_check, mock_read_h5ad):
+        validate(h5ad_valid)
+        mock_write_check.assert_not_called()
+        mock_read_h5ad.assert_called_with(h5ad_valid, False)
+
+    @mock.patch("cellxgene_schema.validate.read_h5ad", wraps=read_h5ad)
+    @mock.patch("cellxgene_schema.validate.Validator.write_check")
+    def test__validate_with_write_check_true(self, mock_write_check, mock_read_h5ad):
+        validate(h5ad_valid, full=True)
+        mock_write_check.assert_called_once()
+        mock_read_h5ad.assert_called_with(h5ad_valid, True)
+
 
 class TestSeuratConvertibility(unittest.TestCase):
     def validation_helper(self, matrix, raw=None):
@@ -366,6 +382,25 @@ def test_determine_seurat_convertibility(self):
         self.assertFalse(self.validator.is_valid)
 
 
+class TestValidatorWriteCheck:
+    def test_fail(self):
+        validator = Validator()
+        X = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]])
+        obs = pd.DataFrame([["red", 1, 0.22222], ["blue", 0, np.nan], ["orange", 1, 0.1]])
+        adata = anndata.AnnData(X=X, obs=obs)
+        validator.adata = adata
+
+        validator.write_check()
+        assert "Unable to write back to h5ad" in validator.errors[0]
+
+    def test_success(self):
+        validator = Validator()
+        validator.adata = h5ad_valid
+
+        validator.write_check()
+        assert len(validator.errors) == 0
+
+
 class TestIsRaw:
     @staticmethod
     def create_validator(data: Union[ndarray, spmatrix], format: str) -> Validator:

From 9e733070509bcc2e32f46493b6e923012d48e18e Mon Sep 17 00:00:00 2001
From: Trent Smith <trent.smith@chanzuckerberg.com>
Date: Fri, 22 Sep 2023 14:49:23 -0700
Subject: [PATCH 02/14] fix import

---
 cellxgene_schema_cli/tests/test_validate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py
index a2653a881..b2b0c7a07 100644
--- a/cellxgene_schema_cli/tests/test_validate.py
+++ b/cellxgene_schema_cli/tests/test_validate.py
@@ -12,6 +12,7 @@
 import pytest
 from cellxgene_schema.ontology import OntologyChecker
 from cellxgene_schema.schema import get_schema_definition
+from cellxgene_schema.utils import read_h5ad
 from cellxgene_schema.validate import Validator, validate
 from cellxgene_schema.write_labels import AnnDataLabelAppender
 from fixtures.examples_validate import (
@@ -28,7 +29,6 @@
 from numpy import ndarray
 from scipy import sparse
 from scipy.sparse import spmatrix
-from utils import read_h5ad
 
 # Tests for internal functions of the Validator and LabelWriter classes.
 

From d85e0d5ac05eaac951cd9b0e70d7fec3cbeec2c7 Mon Sep 17 00:00:00 2001
From: Trent Smith <trent.smith@chanzuckerberg.com>
Date: Fri, 22 Sep 2023 15:04:32 -0700
Subject: [PATCH 03/14] fix test

---
 cellxgene_schema_cli/cellxgene_schema/validate.py | 2 +-
 cellxgene_schema_cli/tests/test_validate.py       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py
index b7564bbcd..f4ccb517c 100644
--- a/cellxgene_schema_cli/cellxgene_schema/validate.py
+++ b/cellxgene_schema_cli/cellxgene_schema/validate.py
@@ -1215,7 +1215,7 @@ def validate(
     )
     to_memory = (
         add_labels_file is not None or write_check
-    )  # if we're adding labels or doing a full validation, read to memory
+    )  # if we're adding labels or doing a write_check, then read to memory
     validator.validate_adata(h5ad_path, to_memory=to_memory, write_check=write_check)
     logger.info(f"Validation complete in {datetime.now() - start} with status is_valid={validator.is_valid}")
 
diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py
index b2b0c7a07..da94ed5e1 100644
--- a/cellxgene_schema_cli/tests/test_validate.py
+++ b/cellxgene_schema_cli/tests/test_validate.py
@@ -324,7 +324,7 @@ def test__validate_with_write_check_false(self, mock_write_check, mock_read_h5ad
     @mock.patch("cellxgene_schema.validate.read_h5ad", wraps=read_h5ad)
     @mock.patch("cellxgene_schema.validate.Validator.write_check")
     def test__validate_with_write_check_true(self, mock_write_check, mock_read_h5ad):
-        validate(h5ad_valid, full=True)
+        validate(h5ad_valid, write_check=True)
         mock_write_check.assert_called_once()
         mock_read_h5ad.assert_called_with(h5ad_valid, True)
 

From 6ddc7d550ca828ac24697aa92889ad0a8515fc17 Mon Sep 17 00:00:00 2001
From: Trent Smith <trent.smith@chanzuckerberg.com>
Date: Mon, 25 Sep 2023 10:06:25 -0700
Subject: [PATCH 04/14] fix test

---
 cellxgene_schema_cli/cellxgene_schema/cli.py | 3 ++-
 cellxgene_schema_cli/tests/test_validate.py  | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/cellxgene_schema_cli/cellxgene_schema/cli.py b/cellxgene_schema_cli/cellxgene_schema/cli.py
index 5f0cea1f2..c85264852 100644
--- a/cellxgene_schema_cli/cellxgene_schema/cli.py
+++ b/cellxgene_schema_cli/cellxgene_schema/cli.py
@@ -33,11 +33,12 @@ def schema_cli():
 @click.option("-i", "--ignore-labels", help="Ignore ontology labels when validating", is_flag=True)
 @click.option("-v", "--verbose", help="When present will set logging level to debug", is_flag=True)
 @click.option(
+    "-w",
     "--write-check",
     help="When present will run write_check on of the data. This check if the dataset "
     "can be written back out with added labels. This is a slower, more memory and "
     "disk intensive operation. It is not recommended to run this on dataset that will "
-    "not fit in memory.",
+    "not fit in memory. This check will no longer be required with anndata 0.10.0.",
     is_flag=True,
 )
 def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose, write_check):
diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py
index da94ed5e1..7728eae2f 100644
--- a/cellxgene_schema_cli/tests/test_validate.py
+++ b/cellxgene_schema_cli/tests/test_validate.py
@@ -395,7 +395,7 @@ def test_fail(self):
 
     def test_success(self):
         validator = Validator()
-        validator.adata = h5ad_valid
+        validator.adata = adata_minimal.copy()
 
         validator.write_check()
         assert len(validator.errors) == 0

From d33494f40a56b70190f8a8636d2d6364929d4c45 Mon Sep 17 00:00:00 2001
From: Trent Smith <trent.smith@chanzuckerberg.com>
Date: Mon, 25 Sep 2023 15:12:46 -0700
Subject: [PATCH 05/14] Add catches for known cases where the h5ad might fail
 to write.

- Having mixed types in a column
- Having none string values for Catagories
---
 .../cellxgene_schema/validate.py              | 34 +++++++++++-------
 cellxgene_schema_cli/tests/test_validate.py   | 35 ++++++++++++++++---
 2 files changed, 53 insertions(+), 16 deletions(-)

diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py
index f4ccb517c..5720206f9 100644
--- a/cellxgene_schema_cli/cellxgene_schema/validate.py
+++ b/cellxgene_schema_cli/cellxgene_schema/validate.py
@@ -643,18 +643,28 @@ def _validate_dataframe(self, df_name: str):
                     f"Features SHOULD NOT be filtered from expression matrix."
                 )
 
-        # Check for columns that have a category defined 0 times (obs only)
-        if df_name == "obs":
-            for column in df.columns:
-                col = df[column]
-                if col.dtype != "category":
-                    continue
-                for category in col.dtype.categories:
-                    if category not in col.values:
-                        self.warnings.append(
-                            f"Column '{column}' in dataframe '{df_name}' contains a category '{category}' with "
-                            f"zero observations. These categories will be removed when `--add-labels` flag is present."
-                        )
+        for column_name in df.columns:
+            column = df[column_name]
+            if column.dtype != "category":
+                value_types = {type(x) for x in column.values}
+                if len(value_types) != 1:
+                    self.errors.append(
+                        f"Column '{column_name}' in dataframe '{df_name}' cannot contained mixed types. Found {value_types}."
+                    )
+            else:
+                # Check for columns that have a category defined 0 times (obs only)
+                if df_name == "obs":
+                    for category in column.dtype.categories:
+                        if category not in column.values:
+                            self.warnings.append(
+                                f"Column '{column_name}' in dataframe '{df_name}' contains a category '{category}' with "
+                                f"zero observations. These categories will be removed when `--add-labels` flag is present."
+                            )
+                catagory_types = {type(x) for x in column.dtype.categories.values}
+                if len(catagory_types) > 1 or str not in catagory_types:
+                    self.errors.append(
+                        f"Column '{column_name}' in dataframe '{df_name}' must only contain string catagories. Found {catagory_types}."
+                    )
 
         # Validate columns
         if "columns" in df_definition:
diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py
index 7728eae2f..ce8fdf5e6 100644
--- a/cellxgene_schema_cli/tests/test_validate.py
+++ b/cellxgene_schema_cli/tests/test_validate.py
@@ -16,7 +16,9 @@
 from cellxgene_schema.validate import Validator, validate
 from cellxgene_schema.write_labels import AnnDataLabelAppender
 from fixtures.examples_validate import (
-    adata,
+    adata as adata_valid,
+)
+from fixtures.examples_validate import (
     adata_minimal,
     adata_with_labels,
     good_obs,
@@ -99,7 +101,7 @@ def test_validate_ontology_wrong_term(self):
 class TestAddLabelFunctions(unittest.TestCase):
     def setUp(self):
         # Set up test data
-        self.test_adata = adata.copy()
+        self.test_adata = adata_valid.copy()
         self.test_adata_with_labels = adata_with_labels
         self.schema_def = get_schema_definition()
 
@@ -237,7 +239,7 @@ def test__write__Fail(self):
 class TestIgnoreLabelFunctions(unittest.TestCase):
     def setUp(self):
         # Set up test data
-        self.test_adata = adata
+        self.test_adata = adata_valid.copy()
         self.test_adata_with_labels = adata_with_labels
 
     def test_validating_labeled_h5ad_should_fail_if_no_flag_set(self):
@@ -270,7 +272,9 @@ def test__validate_with_h5ad_valid_and_labels(self):
         with tempfile.TemporaryDirectory() as temp_dir:
             labels_path = "/".join([temp_dir, "labels.h5ad"])
 
-            success, errors, is_seurat_convertible = validate(h5ad_valid, labels_path)
+            success, errors, is_seurat_convertible = validate(
+                "/Users/trentsmith/workspace/single-cell-curation/example_addlabelsFail.h5ad", labels_path
+            )
 
             import anndata as ad
 
@@ -382,6 +386,29 @@ def test_determine_seurat_convertibility(self):
         self.assertFalse(self.validator.is_valid)
 
 
+class TestValidatorValidateDataFrame:
+    def test_fail_catagory_not_string(self):
+        validator = Validator()
+        validator._set_schema_def()
+        adata = adata_valid.copy()
+        t = pd.CategoricalDtype(categories=[True, False])
+        adata.obs["not_string"] = pd.Series(data=[True, False], index=["X", "Y"], dtype=t)
+        validator.adata = adata
+
+        validator._validate_dataframe("obs")
+        assert "must only contain string catagories." in validator.errors[0]
+
+    def test_fail_mixed_column_types(self):
+        validator = Validator()
+        validator._set_schema_def()
+        adata = adata_valid.copy()
+        adata.obs["mixed"] = pd.Series(data=["1234", 0], index=["X", "Y"])
+        validator.adata = adata
+
+        validator._validate_dataframe("obs")
+        assert "cannot contained mixed types." in validator.errors[0]
+
+
 class TestValidatorWriteCheck:
     def test_fail(self):
         validator = Validator()

From 554d5aa3a4520045c81e3a999a142930261487be Mon Sep 17 00:00:00 2001
From: Trent Smith <trent.smith@chanzuckerberg.com>
Date: Mon, 25 Sep 2023 15:21:58 -0700
Subject: [PATCH 06/14] add comments

---
 cellxgene_schema_cli/cellxgene_schema/cli.py      | 5 ++---
 cellxgene_schema_cli/cellxgene_schema/validate.py | 2 ++
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/cellxgene_schema_cli/cellxgene_schema/cli.py b/cellxgene_schema_cli/cellxgene_schema/cli.py
index c85264852..73f52b80c 100644
--- a/cellxgene_schema_cli/cellxgene_schema/cli.py
+++ b/cellxgene_schema_cli/cellxgene_schema/cli.py
@@ -35,9 +35,8 @@ def schema_cli():
 @click.option(
     "-w",
     "--write-check",
-    help="When present will run write_check on of the data. This check if the dataset "
-    "can be written back out with added labels. This is a slower, more memory and "
-    "disk intensive operation. It is not recommended to run this on dataset that will "
+    help="This checks if the dataset can be written with added labels. This is a slower operation which requires more memory and "
+    "disk space. It is not recommended to run this on dataset that will "
     "not fit in memory. This check will no longer be required with anndata 0.10.0.",
     is_flag=True,
 )
diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py
index 5720206f9..13e6458d8 100644
--- a/cellxgene_schema_cli/cellxgene_schema/validate.py
+++ b/cellxgene_schema_cli/cellxgene_schema/validate.py
@@ -646,6 +646,7 @@ def _validate_dataframe(self, df_name: str):
         for column_name in df.columns:
             column = df[column_name]
             if column.dtype != "category":
+                # Check for columns with mixed values.
                 value_types = {type(x) for x in column.values}
                 if len(value_types) != 1:
                     self.errors.append(
@@ -660,6 +661,7 @@ def _validate_dataframe(self, df_name: str):
                                 f"Column '{column_name}' in dataframe '{df_name}' contains a category '{category}' with "
                                 f"zero observations. These categories will be removed when `--add-labels` flag is present."
                             )
+                # Check for columns that have none string catagories
                 catagory_types = {type(x) for x in column.dtype.categories.values}
                 if len(catagory_types) > 1 or str not in catagory_types:
                     self.errors.append(

From 180c60b77cefa960eb20ced0c60573a564ec08d6 Mon Sep 17 00:00:00 2001
From: Trent Smith <trent.smith@chanzuckerberg.com>
Date: Mon, 25 Sep 2023 15:22:42 -0700
Subject: [PATCH 07/14] add comments

---
 cellxgene_schema_cli/cellxgene_schema/cli.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cellxgene_schema_cli/cellxgene_schema/cli.py b/cellxgene_schema_cli/cellxgene_schema/cli.py
index 73f52b80c..6338927e6 100644
--- a/cellxgene_schema_cli/cellxgene_schema/cli.py
+++ b/cellxgene_schema_cli/cellxgene_schema/cli.py
@@ -35,9 +35,9 @@ def schema_cli():
 @click.option(
     "-w",
     "--write-check",
-    help="This checks if the dataset can be written with added labels. This is a slower operation which requires more memory and "
-    "disk space. It is not recommended to run this on dataset that will "
-    "not fit in memory. This check will no longer be required with anndata 0.10.0.",
+    help="This checks if the dataset can be written with added labels. This is a slower operation which requires more "
+    "memory and disk space. It is not recommended to run this on dataset that will not fit in memory. This check "
+    "will no longer be required with anndata 0.10.0.",
     is_flag=True,
 )
 def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose, write_check):

From 0564a2e0d1ef9e4c9f0fb011bffb227ed9abbdec Mon Sep 17 00:00:00 2001
From: Trent Smith <trent.smith@chanzuckerberg.com>
Date: Mon, 25 Sep 2023 15:36:55 -0700
Subject: [PATCH 08/14] fix tests

---
 cellxgene_schema_cli/tests/test_schema_compliance.py | 4 ++--
 cellxgene_schema_cli/tests/test_validate.py          | 4 +---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py
index c5d32ac50..9a9735d47 100644
--- a/cellxgene_schema_cli/tests/test_schema_compliance.py
+++ b/cellxgene_schema_cli/tests/test_schema_compliance.py
@@ -889,9 +889,9 @@ def test_nan_values_must_be_rejected(self):
         """
         self.validator.adata.obs.loc[self.validator.adata.obs.index[0], "tissue_ontology_term_id"] = numpy.nan
         self.validator.validate_adata()
-        self.assertEqual(
+        self.assertIn(
+            "ERROR: Column 'tissue_ontology_term_id' in dataframe 'obs' must not contain NaN values.",
             self.validator.errors,
-            ["ERROR: Column 'tissue_ontology_term_id' in dataframe 'obs' must not contain NaN values."],
         )
 
 
diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py
index ce8fdf5e6..69c456af8 100644
--- a/cellxgene_schema_cli/tests/test_validate.py
+++ b/cellxgene_schema_cli/tests/test_validate.py
@@ -272,9 +272,7 @@ def test__validate_with_h5ad_valid_and_labels(self):
         with tempfile.TemporaryDirectory() as temp_dir:
             labels_path = "/".join([temp_dir, "labels.h5ad"])
 
-            success, errors, is_seurat_convertible = validate(
-                "/Users/trentsmith/workspace/single-cell-curation/example_addlabelsFail.h5ad", labels_path
-            )
+            success, errors, is_seurat_convertible = validate(h5ad_valid, labels_path)
 
             import anndata as ad
 

From 738d85d932beb414a8a822722c1ad3180bcffe8c Mon Sep 17 00:00:00 2001
From: Trent Smith <1429913+Bento007@users.noreply.github.com>
Date: Tue, 26 Sep 2023 13:23:05 -0700
Subject: [PATCH 09/14] Update
 cellxgene_schema_cli/cellxgene_schema/validate.py

---
 cellxgene_schema_cli/cellxgene_schema/validate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py
index 7aa37df79..14ae2c3a9 100644
--- a/cellxgene_schema_cli/cellxgene_schema/validate.py
+++ b/cellxgene_schema_cli/cellxgene_schema/validate.py
@@ -651,7 +651,7 @@ def _validate_dataframe(self, df_name: str):
                 value_types = {type(x) for x in column.values}
                 if len(value_types) != 1:
                     self.errors.append(
-                        f"Column '{column_name}' in dataframe '{df_name}' cannot contained mixed types. Found {value_types}."
+                        f"Column '{column_name}' in dataframe '{df_name}' cannot contain mixed types. Found {value_types}."
                     )
             else:
                 # Check for columns that have a category defined 0 times (obs only)

From 5645fb6b4a9e259ffc1530b89037d8cada294fe0 Mon Sep 17 00:00:00 2001
From: Trent Smith <trent.smith@chanzuckerberg.com>
Date: Tue, 26 Sep 2023 16:29:02 -0700
Subject: [PATCH 10/14] fix typo

---
 cellxgene_schema_cli/cellxgene_schema/validate.py | 8 ++++----
 cellxgene_schema_cli/tests/test_validate.py       | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py
index 14ae2c3a9..c40467fff 100644
--- a/cellxgene_schema_cli/cellxgene_schema/validate.py
+++ b/cellxgene_schema_cli/cellxgene_schema/validate.py
@@ -662,11 +662,11 @@ def _validate_dataframe(self, df_name: str):
                                 f"Column '{column_name}' in dataframe '{df_name}' contains a category '{category}' with "
                                 f"zero observations. These categories will be removed when `--add-labels` flag is present."
                             )
-                # Check for columns that have none string catagories
-                catagory_types = {type(x) for x in column.dtype.categories.values}
-                if len(catagory_types) > 1 or str not in catagory_types:
+                # Check for columns that have none string categories
+                category_types = {type(x) for x in column.dtype.categories.values}
+                if len(category_types) > 1 or str not in category_types:
                     self.errors.append(
-                        f"Column '{column_name}' in dataframe '{df_name}' must only contain string catagories. Found {catagory_types}."
+                        f"Column '{column_name}' in dataframe '{df_name}' must only contain string categories. Found {category_types}."
                     )
 
         # Validate columns
diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py
index 69c456af8..f4d0b75fc 100644
--- a/cellxgene_schema_cli/tests/test_validate.py
+++ b/cellxgene_schema_cli/tests/test_validate.py
@@ -385,7 +385,7 @@ def test_determine_seurat_convertibility(self):
 
 
 class TestValidatorValidateDataFrame:
-    def test_fail_catagory_not_string(self):
+    def test_fail_category_not_string(self):
         validator = Validator()
         validator._set_schema_def()
         adata = adata_valid.copy()
@@ -394,7 +394,7 @@ def test_fail_catagory_not_string(self):
         validator.adata = adata
 
         validator._validate_dataframe("obs")
-        assert "must only contain string catagories." in validator.errors[0]
+        assert "must only contain string categories." in validator.errors[0]
 
     def test_fail_mixed_column_types(self):
         validator = Validator()

From c4d4afd36d58df0092868581539a199336345a9a Mon Sep 17 00:00:00 2001
From: Trent Smith <trent.smith@chanzuckerberg.com>
Date: Tue, 26 Sep 2023 16:40:13 -0700
Subject: [PATCH 11/14] fix test

---
 cellxgene_schema_cli/tests/test_validate.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py
index f4d0b75fc..ce8511fac 100644
--- a/cellxgene_schema_cli/tests/test_validate.py
+++ b/cellxgene_schema_cli/tests/test_validate.py
@@ -404,7 +404,9 @@ def test_fail_mixed_column_types(self):
         validator.adata = adata
 
         validator._validate_dataframe("obs")
-        assert "cannot contained mixed types." in validator.errors[0]
+        assert [
+            "Column 'mixed' in dataframe 'obs' cannot contain mixed types. Found {<class 'int'>, " "<class 'str'>}."
+        ] == validator.errors
 
 
 class TestValidatorWriteCheck:

From bce67eb4956e60629756f0ba73c4c8281995424e Mon Sep 17 00:00:00 2001
From: Trent Smith <trent.smith@chanzuckerberg.com>
Date: Tue, 26 Sep 2023 21:59:19 -0700
Subject: [PATCH 12/14] fix test

---
 cellxgene_schema_cli/tests/test_validate.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py
index ce8511fac..5cb73f291 100644
--- a/cellxgene_schema_cli/tests/test_validate.py
+++ b/cellxgene_schema_cli/tests/test_validate.py
@@ -404,9 +404,7 @@ def test_fail_mixed_column_types(self):
         validator.adata = adata
 
         validator._validate_dataframe("obs")
-        assert [
-            "Column 'mixed' in dataframe 'obs' cannot contain mixed types. Found {<class 'int'>, " "<class 'str'>}."
-        ] == validator.errors
+        assert "Column 'mixed' in dataframe 'obs' cannot contain mixed types." in validator.errors[0]
 
 
 class TestValidatorWriteCheck:

From 221ef8c834769459a0d5dc311530f4c51da48c7b Mon Sep 17 00:00:00 2001
From: Trent Smith <trent.smith@chanzuckerberg.com>
Date: Wed, 27 Sep 2023 10:13:32 -0700
Subject: [PATCH 13/14] add comments

---
 cellxgene_schema_cli/cellxgene_schema/validate.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py
index c40467fff..042625166 100644
--- a/cellxgene_schema_cli/cellxgene_schema/validate.py
+++ b/cellxgene_schema_cli/cellxgene_schema/validate.py
@@ -647,7 +647,8 @@ def _validate_dataframe(self, df_name: str):
         for column_name in df.columns:
             column = df[column_name]
             if column.dtype != "category":
-                # Check for columns with mixed values.
+                # Check for columns with mixed values, which is not supported by anndata 0.8.0
+                # TODO: check if this can be removed after upgading to anndata 0.10.0
                 value_types = {type(x) for x in column.values}
                 if len(value_types) != 1:
                     self.errors.append(
@@ -662,7 +663,8 @@ def _validate_dataframe(self, df_name: str):
                                 f"Column '{column_name}' in dataframe '{df_name}' contains a category '{category}' with "
                                 f"zero observations. These categories will be removed when `--add-labels` flag is present."
                             )
-                # Check for columns that have none string categories
+                # Check for columns that have none string categories, which is not supported by anndata 0.8.0
+                # TODO: check if this can be removed after upgading to anndata 0.10.0
                 category_types = {type(x) for x in column.dtype.categories.values}
                 if len(category_types) > 1 or str not in category_types:
                     self.errors.append(

From 7a9b7a3c4eef16c4912cc175de7e405666acd378 Mon Sep 17 00:00:00 2001
From: Trent Smith <trent.smith@chanzuckerberg.com>
Date: Wed, 27 Sep 2023 11:02:12 -0700
Subject: [PATCH 14/14] remove --write-check

---
 cellxgene_schema_cli/cellxgene_schema/cli.py  | 14 ++------
 .../cellxgene_schema/validate.py              | 27 ++-------------
 cellxgene_schema_cli/tests/test_validate.py   | 34 -------------------
 3 files changed, 5 insertions(+), 70 deletions(-)

diff --git a/cellxgene_schema_cli/cellxgene_schema/cli.py b/cellxgene_schema_cli/cellxgene_schema/cli.py
index 6338927e6..3d917cf6e 100644
--- a/cellxgene_schema_cli/cellxgene_schema/cli.py
+++ b/cellxgene_schema_cli/cellxgene_schema/cli.py
@@ -32,15 +32,7 @@ def schema_cli():
 )
 @click.option("-i", "--ignore-labels", help="Ignore ontology labels when validating", is_flag=True)
 @click.option("-v", "--verbose", help="When present will set logging level to debug", is_flag=True)
-@click.option(
-    "-w",
-    "--write-check",
-    help="This checks if the dataset can be written with added labels. This is a slower operation which requires more "
-    "memory and disk space. It is not recommended to run this on dataset that will not fit in memory. This check "
-    "will no longer be required with anndata 0.10.0.",
-    is_flag=True,
-)
-def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose, write_check):
+def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose):
     # Imports are very slow so we defer loading until Click arg validation has passed
 
     print("Loading dependencies")
@@ -52,9 +44,7 @@ def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose, write_ch
     print("Loading validator modules")
     from .validate import validate
 
-    is_valid, _, _ = validate(
-        h5ad_file, add_labels_file, ignore_labels=ignore_labels, verbose=verbose, write_check=write_check
-    )
+    is_valid, _, _ = validate(h5ad_file, add_labels_file, ignore_labels=ignore_labels, verbose=verbose)
     if is_valid:
         sys.exit(0)
     else:
diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py
index 311e3b772..d63fc2a05 100644
--- a/cellxgene_schema_cli/cellxgene_schema/validate.py
+++ b/cellxgene_schema_cli/cellxgene_schema/validate.py
@@ -1330,16 +1330,13 @@ def _deep_check(self):
                 "fixing current errors."
             )
 
-    def validate_adata(
-        self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_memory: bool = False, write_check: bool = False
-    ) -> bool:
+    def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_memory: bool = False) -> bool:
         """
         Validates adata
 
         :params Union[str, bytes, os.PathLike] h5ad_path: path to h5ad to validate, if None it will try to validate
         from self.adata
         :param to_memory: indicate if the h5ad should be read into memory.
-        :param write_check: indicate if write_check should be performed.
 
         :return True if successful validation, False otherwise
         :rtype bool
@@ -1361,9 +1358,6 @@ def validate_adata(
         if not self.errors:
             self._deep_check()
 
-        if write_check:
-            self.write_check()
-
         # Print warnings if any
         if self.warnings:
             self.warnings = ["WARNING: " + i for i in self.warnings]
@@ -1381,25 +1375,12 @@ def validate_adata(
 
         return self.is_valid
 
-    def write_check(self):
-        """Check that the h5ad can be written back out."""
-        logger.warning("Checking that the h5ad can be written back out. This may take a while.")
-        import tempfile
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            tmp_path = os.path.join(tmpdir, "validated.h5ad")
-            try:
-                self.adata.write(tmp_path)
-            except Exception as e:
-                self.errors.append(f"Unable to write back to h5ad: {e}")
-
 
 def validate(
     h5ad_path: Union[str, bytes, os.PathLike],
     add_labels_file: str = None,
     ignore_labels: bool = False,
     verbose: bool = False,
-    write_check: bool = False,
 ) -> (bool, list, bool):
     from .write_labels import AnnDataLabelAppender
 
@@ -1423,10 +1404,8 @@ def validate(
     validator = Validator(
         ignore_labels=ignore_labels,
     )
-    to_memory = (
-        add_labels_file is not None or write_check
-    )  # if we're adding labels or doing a write_check, then read to memory
-    validator.validate_adata(h5ad_path, to_memory=to_memory, write_check=write_check)
+    to_memory = add_labels_file is not None
+    validator.validate_adata(h5ad_path, to_memory=to_memory)
     logger.info(f"Validation complete in {datetime.now() - start} with status is_valid={validator.is_valid}")
 
     # Stop if validation was unsuccessful
diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py
index 5cb73f291..d033fb02f 100644
--- a/cellxgene_schema_cli/tests/test_validate.py
+++ b/cellxgene_schema_cli/tests/test_validate.py
@@ -12,7 +12,6 @@
 import pytest
 from cellxgene_schema.ontology import OntologyChecker
 from cellxgene_schema.schema import get_schema_definition
-from cellxgene_schema.utils import read_h5ad
 from cellxgene_schema.validate import Validator, validate
 from cellxgene_schema.write_labels import AnnDataLabelAppender
 from fixtures.examples_validate import (
@@ -316,20 +315,6 @@ def test__validate_with_h5ad_invalid_and_without_labels(self):
         self.assertTrue(errors)
         self.assertTrue(is_seurat_convertible)
 
-    @mock.patch("cellxgene_schema.validate.read_h5ad", wraps=read_h5ad)
-    @mock.patch("cellxgene_schema.validate.Validator.write_check")
-    def test__validate_with_write_check_false(self, mock_write_check, mock_read_h5ad):
-        validate(h5ad_valid)
-        mock_write_check.assert_not_called()
-        mock_read_h5ad.assert_called_with(h5ad_valid, False)
-
-    @mock.patch("cellxgene_schema.validate.read_h5ad", wraps=read_h5ad)
-    @mock.patch("cellxgene_schema.validate.Validator.write_check")
-    def test__validate_with_write_check_true(self, mock_write_check, mock_read_h5ad):
-        validate(h5ad_valid, write_check=True)
-        mock_write_check.assert_called_once()
-        mock_read_h5ad.assert_called_with(h5ad_valid, True)
-
 
 class TestSeuratConvertibility(unittest.TestCase):
     def validation_helper(self, matrix, raw=None):
@@ -407,25 +392,6 @@ def test_fail_mixed_column_types(self):
         assert "Column 'mixed' in dataframe 'obs' cannot contain mixed types." in validator.errors[0]
 
 
-class TestValidatorWriteCheck:
-    def test_fail(self):
-        validator = Validator()
-        X = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]])
-        obs = pd.DataFrame([["red", 1, 0.22222], ["blue", 0, np.nan], ["orange", 1, 0.1]])
-        adata = anndata.AnnData(X=X, obs=obs)
-        validator.adata = adata
-
-        validator.write_check()
-        assert "Unable to write back to h5ad" in validator.errors[0]
-
-    def test_success(self):
-        validator = Validator()
-        validator.adata = adata_minimal.copy()
-
-        validator.write_check()
-        assert len(validator.errors) == 0
-
-
 class TestIsRaw:
     @staticmethod
     def create_validator(data: Union[ndarray, spmatrix], format: str) -> Validator: