From 20162bddde0d08278d4ef94a06fc0a42b5531e39 Mon Sep 17 00:00:00 2001 From: Trent Smith Date: Fri, 22 Sep 2023 14:43:21 -0700 Subject: [PATCH 01/14] feat(4.0): Add "write-check" flag to check if labels can be added --- cellxgene_schema_cli/cellxgene_schema/cli.py | 14 ++++++-- .../cellxgene_schema/validate.py | 27 ++++++++++++-- cellxgene_schema_cli/tests/test_validate.py | 35 +++++++++++++++++++ 3 files changed, 71 insertions(+), 5 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/cli.py b/cellxgene_schema_cli/cellxgene_schema/cli.py index 3d917cf6e..5f0cea1f2 100644 --- a/cellxgene_schema_cli/cellxgene_schema/cli.py +++ b/cellxgene_schema_cli/cellxgene_schema/cli.py @@ -32,7 +32,15 @@ def schema_cli(): ) @click.option("-i", "--ignore-labels", help="Ignore ontology labels when validating", is_flag=True) @click.option("-v", "--verbose", help="When present will set logging level to debug", is_flag=True) -def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose): +@click.option( + "--write-check", + help="When present will run write_check on of the data. This check if the dataset " + "can be written back out with added labels. This is a slower, more memory and " + "disk intensive operation. It is not recommended to run this on dataset that will " + "not fit in memory.", + is_flag=True, +) +def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose, write_check): # Imports are very slow so we defer loading until Click arg validation has passed print("Loading dependencies") @@ -44,7 +52,9 @@ def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose): print("Loading validator modules") from .validate import validate - is_valid, _, _ = validate(h5ad_file, add_labels_file, ignore_labels=ignore_labels, verbose=verbose) + is_valid, _, _ = validate( + h5ad_file, add_labels_file, ignore_labels=ignore_labels, verbose=verbose, write_check=write_check + ) if is_valid: sys.exit(0) else: diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 21b369a63..b7564bbcd 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -1120,13 +1120,16 @@ def _deep_check(self): "fixing current errors." ) - def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_memory: bool = False) -> bool: + def validate_adata( + self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_memory: bool = False, write_check: bool = False + ) -> bool: """ Validates adata :params Union[str, bytes, os.PathLike] h5ad_path: path to h5ad to validate, if None it will try to validate from self.adata :param to_memory: indicate if the h5ad should be read into memory. + :param write_check: indicate if write_check should be performed. :return True if successful validation, False otherwise :rtype bool @@ -1148,6 +1151,9 @@ def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_me if not self.errors: self._deep_check() + if write_check: + self.write_check() + # Print warnings if any if self.warnings: self.warnings = ["WARNING: " + i for i in self.warnings] @@ -1165,12 +1171,25 @@ def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_me return self.is_valid + def write_check(self): + """Check that the h5ad can be written back out.""" + logger.warning("Checking that the h5ad can be written back out. This may take a while.") + import tempfile + + with tempfile.TemporaryDirectory() as tmpdir: + tmp_path = os.path.join(tmpdir, "validated.h5ad") + try: + self.adata.write(tmp_path) + except Exception as e: + self.errors.append(f"Unable to write back to h5ad: {e}") + def validate( h5ad_path: Union[str, bytes, os.PathLike], add_labels_file: str = None, ignore_labels: bool = False, verbose: bool = False, + write_check: bool = False, ) -> (bool, list, bool): from .write_labels import AnnDataLabelAppender @@ -1194,8 +1213,10 @@ def validate( validator = Validator( ignore_labels=ignore_labels, ) - to_memory = add_labels_file is not None - validator.validate_adata(h5ad_path, to_memory=to_memory) + to_memory = ( + add_labels_file is not None or write_check + ) # if we're adding labels or doing a full validation, read to memory + validator.validate_adata(h5ad_path, to_memory=to_memory, write_check=write_check) logger.info(f"Validation complete in {datetime.now() - start} with status is_valid={validator.is_valid}") # Stop if validation was unsuccessful diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index 2f918f7d8..a2653a881 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -8,6 +8,7 @@ import anndata import numpy as np +import pandas as pd import pytest from cellxgene_schema.ontology import OntologyChecker from cellxgene_schema.schema import get_schema_definition @@ -27,6 +28,7 @@ from numpy import ndarray from scipy import sparse from scipy.sparse import spmatrix +from utils import read_h5ad # Tests for internal functions of the Validator and LabelWriter classes. @@ -312,6 +314,20 @@ def test__validate_with_h5ad_invalid_and_without_labels(self): self.assertTrue(errors) self.assertTrue(is_seurat_convertible) + @mock.patch("cellxgene_schema.validate.read_h5ad", wraps=read_h5ad) + @mock.patch("cellxgene_schema.validate.Validator.write_check") + def test__validate_with_write_check_false(self, mock_write_check, mock_read_h5ad): + validate(h5ad_valid) + mock_write_check.assert_not_called() + mock_read_h5ad.assert_called_with(h5ad_valid, False) + + @mock.patch("cellxgene_schema.validate.read_h5ad", wraps=read_h5ad) + @mock.patch("cellxgene_schema.validate.Validator.write_check") + def test__validate_with_write_check_true(self, mock_write_check, mock_read_h5ad): + validate(h5ad_valid, full=True) + mock_write_check.assert_called_once() + mock_read_h5ad.assert_called_with(h5ad_valid, True) + class TestSeuratConvertibility(unittest.TestCase): def validation_helper(self, matrix, raw=None): @@ -366,6 +382,25 @@ def test_determine_seurat_convertibility(self): self.assertFalse(self.validator.is_valid) +class TestValidatorWriteCheck: + def test_fail(self): + validator = Validator() + X = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]]) + obs = pd.DataFrame([["red", 1, 0.22222], ["blue", 0, np.nan], ["orange", 1, 0.1]]) + adata = anndata.AnnData(X=X, obs=obs) + validator.adata = adata + + validator.write_check() + assert "Unable to write back to h5ad" in validator.errors[0] + + def test_success(self): + validator = Validator() + validator.adata = h5ad_valid + + validator.write_check() + assert len(validator.errors) == 0 + + class TestIsRaw: @staticmethod def create_validator(data: Union[ndarray, spmatrix], format: str) -> Validator: From 9e733070509bcc2e32f46493b6e923012d48e18e Mon Sep 17 00:00:00 2001 From: Trent Smith Date: Fri, 22 Sep 2023 14:49:23 -0700 Subject: [PATCH 02/14] fix import --- cellxgene_schema_cli/tests/test_validate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index a2653a881..b2b0c7a07 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -12,6 +12,7 @@ import pytest from cellxgene_schema.ontology import OntologyChecker from cellxgene_schema.schema import get_schema_definition +from cellxgene_schema.utils import read_h5ad from cellxgene_schema.validate import Validator, validate from cellxgene_schema.write_labels import AnnDataLabelAppender from fixtures.examples_validate import ( @@ -28,7 +29,6 @@ from numpy import ndarray from scipy import sparse from scipy.sparse import spmatrix -from utils import read_h5ad # Tests for internal functions of the Validator and LabelWriter classes. From d85e0d5ac05eaac951cd9b0e70d7fec3cbeec2c7 Mon Sep 17 00:00:00 2001 From: Trent Smith Date: Fri, 22 Sep 2023 15:04:32 -0700 Subject: [PATCH 03/14] fix test --- cellxgene_schema_cli/cellxgene_schema/validate.py | 2 +- cellxgene_schema_cli/tests/test_validate.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index b7564bbcd..f4ccb517c 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -1215,7 +1215,7 @@ def validate( ) to_memory = ( add_labels_file is not None or write_check - ) # if we're adding labels or doing a full validation, read to memory + ) # if we're adding labels or doing a write_check, then read to memory validator.validate_adata(h5ad_path, to_memory=to_memory, write_check=write_check) logger.info(f"Validation complete in {datetime.now() - start} with status is_valid={validator.is_valid}") diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index b2b0c7a07..da94ed5e1 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -324,7 +324,7 @@ def test__validate_with_write_check_false(self, mock_write_check, mock_read_h5ad @mock.patch("cellxgene_schema.validate.read_h5ad", wraps=read_h5ad) @mock.patch("cellxgene_schema.validate.Validator.write_check") def test__validate_with_write_check_true(self, mock_write_check, mock_read_h5ad): - validate(h5ad_valid, full=True) + validate(h5ad_valid, write_check=True) mock_write_check.assert_called_once() mock_read_h5ad.assert_called_with(h5ad_valid, True) From 6ddc7d550ca828ac24697aa92889ad0a8515fc17 Mon Sep 17 00:00:00 2001 From: Trent Smith Date: Mon, 25 Sep 2023 10:06:25 -0700 Subject: [PATCH 04/14] fix test --- cellxgene_schema_cli/cellxgene_schema/cli.py | 3 ++- cellxgene_schema_cli/tests/test_validate.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/cli.py b/cellxgene_schema_cli/cellxgene_schema/cli.py index 5f0cea1f2..c85264852 100644 --- a/cellxgene_schema_cli/cellxgene_schema/cli.py +++ b/cellxgene_schema_cli/cellxgene_schema/cli.py @@ -33,11 +33,12 @@ def schema_cli(): @click.option("-i", "--ignore-labels", help="Ignore ontology labels when validating", is_flag=True) @click.option("-v", "--verbose", help="When present will set logging level to debug", is_flag=True) @click.option( + "-w", "--write-check", help="When present will run write_check on of the data. This check if the dataset " "can be written back out with added labels. This is a slower, more memory and " "disk intensive operation. It is not recommended to run this on dataset that will " - "not fit in memory.", + "not fit in memory. This check will no longer be required with anndata 0.10.0.", is_flag=True, ) def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose, write_check): diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index da94ed5e1..7728eae2f 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -395,7 +395,7 @@ def test_fail(self): def test_success(self): validator = Validator() - validator.adata = h5ad_valid + validator.adata = adata_minimal.copy() validator.write_check() assert len(validator.errors) == 0 From d33494f40a56b70190f8a8636d2d6364929d4c45 Mon Sep 17 00:00:00 2001 From: Trent Smith Date: Mon, 25 Sep 2023 15:12:46 -0700 Subject: [PATCH 05/14] Add catches for known cases where the h5ad might fail to write. - Having mixed types in a column - Having none string values for Catagories --- .../cellxgene_schema/validate.py | 34 +++++++++++------- cellxgene_schema_cli/tests/test_validate.py | 35 ++++++++++++++++--- 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index f4ccb517c..5720206f9 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -643,18 +643,28 @@ def _validate_dataframe(self, df_name: str): f"Features SHOULD NOT be filtered from expression matrix." ) - # Check for columns that have a category defined 0 times (obs only) - if df_name == "obs": - for column in df.columns: - col = df[column] - if col.dtype != "category": - continue - for category in col.dtype.categories: - if category not in col.values: - self.warnings.append( - f"Column '{column}' in dataframe '{df_name}' contains a category '{category}' with " - f"zero observations. These categories will be removed when `--add-labels` flag is present." - ) + for column_name in df.columns: + column = df[column_name] + if column.dtype != "category": + value_types = {type(x) for x in column.values} + if len(value_types) != 1: + self.errors.append( + f"Column '{column_name}' in dataframe '{df_name}' cannot contained mixed types. Found {value_types}." + ) + else: + # Check for columns that have a category defined 0 times (obs only) + if df_name == "obs": + for category in column.dtype.categories: + if category not in column.values: + self.warnings.append( + f"Column '{column_name}' in dataframe '{df_name}' contains a category '{category}' with " + f"zero observations. These categories will be removed when `--add-labels` flag is present." + ) + catagory_types = {type(x) for x in column.dtype.categories.values} + if len(catagory_types) > 1 or str not in catagory_types: + self.errors.append( + f"Column '{column_name}' in dataframe '{df_name}' must only contain string catagories. Found {catagory_types}." + ) # Validate columns if "columns" in df_definition: diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index 7728eae2f..ce8fdf5e6 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -16,7 +16,9 @@ from cellxgene_schema.validate import Validator, validate from cellxgene_schema.write_labels import AnnDataLabelAppender from fixtures.examples_validate import ( - adata, + adata as adata_valid, +) +from fixtures.examples_validate import ( adata_minimal, adata_with_labels, good_obs, @@ -99,7 +101,7 @@ def test_validate_ontology_wrong_term(self): class TestAddLabelFunctions(unittest.TestCase): def setUp(self): # Set up test data - self.test_adata = adata.copy() + self.test_adata = adata_valid.copy() self.test_adata_with_labels = adata_with_labels self.schema_def = get_schema_definition() @@ -237,7 +239,7 @@ def test__write__Fail(self): class TestIgnoreLabelFunctions(unittest.TestCase): def setUp(self): # Set up test data - self.test_adata = adata + self.test_adata = adata_valid.copy() self.test_adata_with_labels = adata_with_labels def test_validating_labeled_h5ad_should_fail_if_no_flag_set(self): @@ -270,7 +272,9 @@ def test__validate_with_h5ad_valid_and_labels(self): with tempfile.TemporaryDirectory() as temp_dir: labels_path = "/".join([temp_dir, "labels.h5ad"]) - success, errors, is_seurat_convertible = validate(h5ad_valid, labels_path) + success, errors, is_seurat_convertible = validate( + "/Users/trentsmith/workspace/single-cell-curation/example_addlabelsFail.h5ad", labels_path + ) import anndata as ad @@ -382,6 +386,29 @@ def test_determine_seurat_convertibility(self): self.assertFalse(self.validator.is_valid) +class TestValidatorValidateDataFrame: + def test_fail_catagory_not_string(self): + validator = Validator() + validator._set_schema_def() + adata = adata_valid.copy() + t = pd.CategoricalDtype(categories=[True, False]) + adata.obs["not_string"] = pd.Series(data=[True, False], index=["X", "Y"], dtype=t) + validator.adata = adata + + validator._validate_dataframe("obs") + assert "must only contain string catagories." in validator.errors[0] + + def test_fail_mixed_column_types(self): + validator = Validator() + validator._set_schema_def() + adata = adata_valid.copy() + adata.obs["mixed"] = pd.Series(data=["1234", 0], index=["X", "Y"]) + validator.adata = adata + + validator._validate_dataframe("obs") + assert "cannot contained mixed types." in validator.errors[0] + + class TestValidatorWriteCheck: def test_fail(self): validator = Validator() From 554d5aa3a4520045c81e3a999a142930261487be Mon Sep 17 00:00:00 2001 From: Trent Smith Date: Mon, 25 Sep 2023 15:21:58 -0700 Subject: [PATCH 06/14] add comments --- cellxgene_schema_cli/cellxgene_schema/cli.py | 5 ++--- cellxgene_schema_cli/cellxgene_schema/validate.py | 2 ++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/cli.py b/cellxgene_schema_cli/cellxgene_schema/cli.py index c85264852..73f52b80c 100644 --- a/cellxgene_schema_cli/cellxgene_schema/cli.py +++ b/cellxgene_schema_cli/cellxgene_schema/cli.py @@ -35,9 +35,8 @@ def schema_cli(): @click.option( "-w", "--write-check", - help="When present will run write_check on of the data. This check if the dataset " - "can be written back out with added labels. This is a slower, more memory and " - "disk intensive operation. It is not recommended to run this on dataset that will " + help="This checks if the dataset can be written with added labels. This is a slower operation which requires more memory and " + "disk space. It is not recommended to run this on dataset that will " "not fit in memory. This check will no longer be required with anndata 0.10.0.", is_flag=True, ) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 5720206f9..13e6458d8 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -646,6 +646,7 @@ def _validate_dataframe(self, df_name: str): for column_name in df.columns: column = df[column_name] if column.dtype != "category": + # Check for columns with mixed values. value_types = {type(x) for x in column.values} if len(value_types) != 1: self.errors.append( @@ -660,6 +661,7 @@ def _validate_dataframe(self, df_name: str): f"Column '{column_name}' in dataframe '{df_name}' contains a category '{category}' with " f"zero observations. These categories will be removed when `--add-labels` flag is present." ) + # Check for columns that have none string catagories catagory_types = {type(x) for x in column.dtype.categories.values} if len(catagory_types) > 1 or str not in catagory_types: self.errors.append( From 180c60b77cefa960eb20ced0c60573a564ec08d6 Mon Sep 17 00:00:00 2001 From: Trent Smith Date: Mon, 25 Sep 2023 15:22:42 -0700 Subject: [PATCH 07/14] add comments --- cellxgene_schema_cli/cellxgene_schema/cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/cli.py b/cellxgene_schema_cli/cellxgene_schema/cli.py index 73f52b80c..6338927e6 100644 --- a/cellxgene_schema_cli/cellxgene_schema/cli.py +++ b/cellxgene_schema_cli/cellxgene_schema/cli.py @@ -35,9 +35,9 @@ def schema_cli(): @click.option( "-w", "--write-check", - help="This checks if the dataset can be written with added labels. This is a slower operation which requires more memory and " - "disk space. It is not recommended to run this on dataset that will " - "not fit in memory. This check will no longer be required with anndata 0.10.0.", + help="This checks if the dataset can be written with added labels. This is a slower operation which requires more " + "memory and disk space. It is not recommended to run this on dataset that will not fit in memory. This check " + "will no longer be required with anndata 0.10.0.", is_flag=True, ) def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose, write_check): From 0564a2e0d1ef9e4c9f0fb011bffb227ed9abbdec Mon Sep 17 00:00:00 2001 From: Trent Smith Date: Mon, 25 Sep 2023 15:36:55 -0700 Subject: [PATCH 08/14] fix tests --- cellxgene_schema_cli/tests/test_schema_compliance.py | 4 ++-- cellxgene_schema_cli/tests/test_validate.py | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py index c5d32ac50..9a9735d47 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -889,9 +889,9 @@ def test_nan_values_must_be_rejected(self): """ self.validator.adata.obs.loc[self.validator.adata.obs.index[0], "tissue_ontology_term_id"] = numpy.nan self.validator.validate_adata() - self.assertEqual( + self.assertIn( + "ERROR: Column 'tissue_ontology_term_id' in dataframe 'obs' must not contain NaN values.", self.validator.errors, - ["ERROR: Column 'tissue_ontology_term_id' in dataframe 'obs' must not contain NaN values."], ) diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index ce8fdf5e6..69c456af8 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -272,9 +272,7 @@ def test__validate_with_h5ad_valid_and_labels(self): with tempfile.TemporaryDirectory() as temp_dir: labels_path = "/".join([temp_dir, "labels.h5ad"]) - success, errors, is_seurat_convertible = validate( - "/Users/trentsmith/workspace/single-cell-curation/example_addlabelsFail.h5ad", labels_path - ) + success, errors, is_seurat_convertible = validate(h5ad_valid, labels_path) import anndata as ad From 738d85d932beb414a8a822722c1ad3180bcffe8c Mon Sep 17 00:00:00 2001 From: Trent Smith <1429913+Bento007@users.noreply.github.com> Date: Tue, 26 Sep 2023 13:23:05 -0700 Subject: [PATCH 09/14] Update cellxgene_schema_cli/cellxgene_schema/validate.py --- cellxgene_schema_cli/cellxgene_schema/validate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 7aa37df79..14ae2c3a9 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -651,7 +651,7 @@ def _validate_dataframe(self, df_name: str): value_types = {type(x) for x in column.values} if len(value_types) != 1: self.errors.append( - f"Column '{column_name}' in dataframe '{df_name}' cannot contained mixed types. Found {value_types}." + f"Column '{column_name}' in dataframe '{df_name}' cannot contain mixed types. Found {value_types}." ) else: # Check for columns that have a category defined 0 times (obs only) From 5645fb6b4a9e259ffc1530b89037d8cada294fe0 Mon Sep 17 00:00:00 2001 From: Trent Smith Date: Tue, 26 Sep 2023 16:29:02 -0700 Subject: [PATCH 10/14] fix typo --- cellxgene_schema_cli/cellxgene_schema/validate.py | 8 ++++---- cellxgene_schema_cli/tests/test_validate.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 14ae2c3a9..c40467fff 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -662,11 +662,11 @@ def _validate_dataframe(self, df_name: str): f"Column '{column_name}' in dataframe '{df_name}' contains a category '{category}' with " f"zero observations. These categories will be removed when `--add-labels` flag is present." ) - # Check for columns that have none string catagories - catagory_types = {type(x) for x in column.dtype.categories.values} - if len(catagory_types) > 1 or str not in catagory_types: + # Check for columns that have none string categories + category_types = {type(x) for x in column.dtype.categories.values} + if len(category_types) > 1 or str not in category_types: self.errors.append( - f"Column '{column_name}' in dataframe '{df_name}' must only contain string catagories. Found {catagory_types}." + f"Column '{column_name}' in dataframe '{df_name}' must only contain string categories. Found {category_types}." ) # Validate columns diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index 69c456af8..f4d0b75fc 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -385,7 +385,7 @@ def test_determine_seurat_convertibility(self): class TestValidatorValidateDataFrame: - def test_fail_catagory_not_string(self): + def test_fail_category_not_string(self): validator = Validator() validator._set_schema_def() adata = adata_valid.copy() @@ -394,7 +394,7 @@ def test_fail_catagory_not_string(self): validator.adata = adata validator._validate_dataframe("obs") - assert "must only contain string catagories." in validator.errors[0] + assert "must only contain string categories." in validator.errors[0] def test_fail_mixed_column_types(self): validator = Validator() From c4d4afd36d58df0092868581539a199336345a9a Mon Sep 17 00:00:00 2001 From: Trent Smith Date: Tue, 26 Sep 2023 16:40:13 -0700 Subject: [PATCH 11/14] fix test --- cellxgene_schema_cli/tests/test_validate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index f4d0b75fc..ce8511fac 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -404,7 +404,9 @@ def test_fail_mixed_column_types(self): validator.adata = adata validator._validate_dataframe("obs") - assert "cannot contained mixed types." in validator.errors[0] + assert [ + "Column 'mixed' in dataframe 'obs' cannot contain mixed types. Found {, " "}." + ] == validator.errors class TestValidatorWriteCheck: From bce67eb4956e60629756f0ba73c4c8281995424e Mon Sep 17 00:00:00 2001 From: Trent Smith Date: Tue, 26 Sep 2023 21:59:19 -0700 Subject: [PATCH 12/14] fix test --- cellxgene_schema_cli/tests/test_validate.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index ce8511fac..5cb73f291 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -404,9 +404,7 @@ def test_fail_mixed_column_types(self): validator.adata = adata validator._validate_dataframe("obs") - assert [ - "Column 'mixed' in dataframe 'obs' cannot contain mixed types. Found {, " "}." - ] == validator.errors + assert "Column 'mixed' in dataframe 'obs' cannot contain mixed types." in validator.errors[0] class TestValidatorWriteCheck: From 221ef8c834769459a0d5dc311530f4c51da48c7b Mon Sep 17 00:00:00 2001 From: Trent Smith Date: Wed, 27 Sep 2023 10:13:32 -0700 Subject: [PATCH 13/14] add comments --- cellxgene_schema_cli/cellxgene_schema/validate.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index c40467fff..042625166 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -647,7 +647,8 @@ def _validate_dataframe(self, df_name: str): for column_name in df.columns: column = df[column_name] if column.dtype != "category": - # Check for columns with mixed values. + # Check for columns with mixed values, which is not supported by anndata 0.8.0 + # TODO: check if this can be removed after upgading to anndata 0.10.0 value_types = {type(x) for x in column.values} if len(value_types) != 1: self.errors.append( @@ -662,7 +663,8 @@ def _validate_dataframe(self, df_name: str): f"Column '{column_name}' in dataframe '{df_name}' contains a category '{category}' with " f"zero observations. These categories will be removed when `--add-labels` flag is present." ) - # Check for columns that have none string categories + # Check for columns that have none string categories, which is not supported by anndata 0.8.0 + # TODO: check if this can be removed after upgading to anndata 0.10.0 category_types = {type(x) for x in column.dtype.categories.values} if len(category_types) > 1 or str not in category_types: self.errors.append( From 7a9b7a3c4eef16c4912cc175de7e405666acd378 Mon Sep 17 00:00:00 2001 From: Trent Smith Date: Wed, 27 Sep 2023 11:02:12 -0700 Subject: [PATCH 14/14] remove --write-check --- cellxgene_schema_cli/cellxgene_schema/cli.py | 14 ++------ .../cellxgene_schema/validate.py | 27 ++------------- cellxgene_schema_cli/tests/test_validate.py | 34 ------------------- 3 files changed, 5 insertions(+), 70 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/cli.py b/cellxgene_schema_cli/cellxgene_schema/cli.py index 6338927e6..3d917cf6e 100644 --- a/cellxgene_schema_cli/cellxgene_schema/cli.py +++ b/cellxgene_schema_cli/cellxgene_schema/cli.py @@ -32,15 +32,7 @@ def schema_cli(): ) @click.option("-i", "--ignore-labels", help="Ignore ontology labels when validating", is_flag=True) @click.option("-v", "--verbose", help="When present will set logging level to debug", is_flag=True) -@click.option( - "-w", - "--write-check", - help="This checks if the dataset can be written with added labels. This is a slower operation which requires more " - "memory and disk space. It is not recommended to run this on dataset that will not fit in memory. This check " - "will no longer be required with anndata 0.10.0.", - is_flag=True, -) -def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose, write_check): +def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose): # Imports are very slow so we defer loading until Click arg validation has passed print("Loading dependencies") @@ -52,9 +44,7 @@ def schema_validate(h5ad_file, add_labels_file, ignore_labels, verbose, write_ch print("Loading validator modules") from .validate import validate - is_valid, _, _ = validate( - h5ad_file, add_labels_file, ignore_labels=ignore_labels, verbose=verbose, write_check=write_check - ) + is_valid, _, _ = validate(h5ad_file, add_labels_file, ignore_labels=ignore_labels, verbose=verbose) if is_valid: sys.exit(0) else: diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 311e3b772..d63fc2a05 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -1330,16 +1330,13 @@ def _deep_check(self): "fixing current errors." ) - def validate_adata( - self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_memory: bool = False, write_check: bool = False - ) -> bool: + def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None, to_memory: bool = False) -> bool: """ Validates adata :params Union[str, bytes, os.PathLike] h5ad_path: path to h5ad to validate, if None it will try to validate from self.adata :param to_memory: indicate if the h5ad should be read into memory. - :param write_check: indicate if write_check should be performed. :return True if successful validation, False otherwise :rtype bool @@ -1361,9 +1358,6 @@ def validate_adata( if not self.errors: self._deep_check() - if write_check: - self.write_check() - # Print warnings if any if self.warnings: self.warnings = ["WARNING: " + i for i in self.warnings] @@ -1381,25 +1375,12 @@ def validate_adata( return self.is_valid - def write_check(self): - """Check that the h5ad can be written back out.""" - logger.warning("Checking that the h5ad can be written back out. This may take a while.") - import tempfile - - with tempfile.TemporaryDirectory() as tmpdir: - tmp_path = os.path.join(tmpdir, "validated.h5ad") - try: - self.adata.write(tmp_path) - except Exception as e: - self.errors.append(f"Unable to write back to h5ad: {e}") - def validate( h5ad_path: Union[str, bytes, os.PathLike], add_labels_file: str = None, ignore_labels: bool = False, verbose: bool = False, - write_check: bool = False, ) -> (bool, list, bool): from .write_labels import AnnDataLabelAppender @@ -1423,10 +1404,8 @@ def validate( validator = Validator( ignore_labels=ignore_labels, ) - to_memory = ( - add_labels_file is not None or write_check - ) # if we're adding labels or doing a write_check, then read to memory - validator.validate_adata(h5ad_path, to_memory=to_memory, write_check=write_check) + to_memory = add_labels_file is not None + validator.validate_adata(h5ad_path, to_memory=to_memory) logger.info(f"Validation complete in {datetime.now() - start} with status is_valid={validator.is_valid}") # Stop if validation was unsuccessful diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py index 5cb73f291..d033fb02f 100644 --- a/cellxgene_schema_cli/tests/test_validate.py +++ b/cellxgene_schema_cli/tests/test_validate.py @@ -12,7 +12,6 @@ import pytest from cellxgene_schema.ontology import OntologyChecker from cellxgene_schema.schema import get_schema_definition -from cellxgene_schema.utils import read_h5ad from cellxgene_schema.validate import Validator, validate from cellxgene_schema.write_labels import AnnDataLabelAppender from fixtures.examples_validate import ( @@ -316,20 +315,6 @@ def test__validate_with_h5ad_invalid_and_without_labels(self): self.assertTrue(errors) self.assertTrue(is_seurat_convertible) - @mock.patch("cellxgene_schema.validate.read_h5ad", wraps=read_h5ad) - @mock.patch("cellxgene_schema.validate.Validator.write_check") - def test__validate_with_write_check_false(self, mock_write_check, mock_read_h5ad): - validate(h5ad_valid) - mock_write_check.assert_not_called() - mock_read_h5ad.assert_called_with(h5ad_valid, False) - - @mock.patch("cellxgene_schema.validate.read_h5ad", wraps=read_h5ad) - @mock.patch("cellxgene_schema.validate.Validator.write_check") - def test__validate_with_write_check_true(self, mock_write_check, mock_read_h5ad): - validate(h5ad_valid, write_check=True) - mock_write_check.assert_called_once() - mock_read_h5ad.assert_called_with(h5ad_valid, True) - class TestSeuratConvertibility(unittest.TestCase): def validation_helper(self, matrix, raw=None): @@ -407,25 +392,6 @@ def test_fail_mixed_column_types(self): assert "Column 'mixed' in dataframe 'obs' cannot contain mixed types." in validator.errors[0] -class TestValidatorWriteCheck: - def test_fail(self): - validator = Validator() - X = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]]) - obs = pd.DataFrame([["red", 1, 0.22222], ["blue", 0, np.nan], ["orange", 1, 0.1]]) - adata = anndata.AnnData(X=X, obs=obs) - validator.adata = adata - - validator.write_check() - assert "Unable to write back to h5ad" in validator.errors[0] - - def test_success(self): - validator = Validator() - validator.adata = adata_minimal.copy() - - validator.write_check() - assert len(validator.errors) == 0 - - class TestIsRaw: @staticmethod def create_validator(data: Union[ndarray, spmatrix], format: str) -> Validator: