diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 55bef409..781b72d1 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -67,7 +67,7 @@ def __init__(self, ignore_labels=False): # keys will be one of gencode.SupportedOrganisms self.gene_checkers = dict() - def reset(self): + def reset(self, hi_res_size: Optional[int] = None, true_mat_size: Optional[int] = None): self.errors = [] self.warnings = [] self.is_valid = False @@ -76,6 +76,8 @@ def reset(self): self.is_spatial = None self.is_visium = None self.is_visium_and_is_single_true = None + self._hires_max_dimension_size = hi_res_size + self._visium_and_is_single_true_matrix_size = true_mat_size # Matrix (e.g., X, raw.X, ...) number non-zero cache self.number_non_zero = dict() @@ -99,6 +101,7 @@ def visium_and_is_single_true_matrix_size(self) -> Optional[int]: if bool( self.adata.obs["assay_ontology_term_id"] .apply(lambda t: is_ontological_descendant_of(ONTOLOGY_PARSER, t, ASSAY_VISIUM_11M, True)) + .astype(bool) .any() ): self._visium_error_suffix = f"{ERROR_SUFFIX_VISIUM_11M} and {ERROR_SUFFIX_IS_SINGLE}" @@ -118,6 +121,7 @@ def hires_max_dimension_size(self) -> Optional[int]: if bool( self.adata.obs["assay_ontology_term_id"] .apply(lambda t: is_ontological_descendant_of(ONTOLOGY_PARSER, t, ASSAY_VISIUM_11M, True)) + .astype(bool) .any() ): self._visium_error_suffix = ERROR_SUFFIX_VISIUM_11M @@ -1981,6 +1985,7 @@ def _is_visium_including_descendants(self) -> bool: self.adata.obs[_assay_key] .astype("string") .apply(lambda assay: is_ontological_descendant_of(ONTOLOGY_PARSER, assay, ASSAY_VISIUM, True)) + .astype(bool) .any() ) @@ -2099,8 +2104,6 @@ def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None) -> bo :rtype bool """ logger.info("Starting validation...") - # Re-start errors in case a new h5ad is being validated - self.reset() if h5ad_path: logger.debug("Reading the h5ad file...") @@ -2108,6 +2111,8 @@ def validate_adata(self, h5ad_path: Union[str, bytes, os.PathLike] = None) -> bo self.h5ad_path = h5ad_path self._validate_encoding_version() logger.debug("Successfully read the h5ad file") + # Re-start errors in case a new h5ad is being validated + self.reset() # Fetches schema def for latest major schema version self._set_schema_def() diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py index 8f8eb801..aa69890e 100644 --- a/cellxgene_schema_cli/tests/test_schema_compliance.py +++ b/cellxgene_schema_cli/tests/test_schema_compliance.py @@ -86,8 +86,8 @@ def validator_with_spatial_and_is_single_false(validator) -> Validator: @pytest.fixture def validator_with_visium_assay(validator) -> Validator: validator.adata = examples.adata_visium.copy() - validator._visium_and_is_single_true_matrix_size = 2 - validator._hires_max_dimension_size = None + validator.reset(None, None) + return validator @@ -208,6 +208,7 @@ def test_raw_values__invalid_spatial(self, validator_with_visium_assay, invalid_ validator = validator_with_visium_assay validator.adata.raw.X[0, 1] = invalid_value + validator.reset(None, 2) validator.validate_adata() assert validator.errors == [ "ERROR: All non-zero values in raw matrix must be positive integers of type numpy.float32.", @@ -248,7 +249,8 @@ def test_raw_values__contains_zero_row_in_tissue_1(self, validator_with_visium_a Raw Matrix contains a row with all zeros and in_tissue is 1, but no values are in_tissue 0. """ - validator = validator_with_visium_assay + validator: Validator = validator_with_visium_assay + validator.reset(None, 2) validator.adata.obs["in_tissue"] = 1 validator.adata.X[0] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32) validator.adata.raw.X[0] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32) @@ -266,6 +268,7 @@ def test_raw_values__contains_zero_row_in_tissue_1_mixed_in_tissue_values(self, validator: Validator = validator_with_visium_assay validator.adata.X[1] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32) validator.adata.raw.X[1] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32) + validator.reset(None, 2) validator.validate_adata() assert validator.errors == [ "ERROR: Each observation with obs['in_tissue'] == 1 must have at least one " @@ -287,6 +290,7 @@ def test_raw_values__contains_all_zero_rows_in_tissue_0(self, validator_with_vis ) validator.adata.raw = validator.adata.copy() validator.adata.raw.var.drop("feature_is_filtered", axis=1, inplace=True) + validator.reset(None, 2) validator.validate_adata() assert validator.errors == [ "ERROR: If obs['in_tissue'] contains at least one value 0, then there must be at least " @@ -305,6 +309,7 @@ def test_raw_values__contains_some_zero_rows_in_tissue_0(self, validator_with_vi validator.adata.obs["cell_type_ontology_term_id"] = "unknown" validator.adata.X[0] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32) validator.adata.raw.X[0] = numpy.zeros(validator.adata.var.shape[0], dtype=numpy.float32) + validator.reset(None, 2) validator.validate_adata() assert validator.errors == [] @@ -329,8 +334,6 @@ def test_raw_values__invalid_visium_and_is_single_true_row_length( validator.adata.obs["assay_ontology_term_id"] = assay_ontology_term_id # hires image size must be present in order to validate the raw. - validator._visium_and_is_single_true_matrix_size = None - validator._hires_max_dimension_size = image_size validator.adata.uns["spatial"][visium_library_id]["images"]["hires"] = numpy.zeros( (1, image_size, 3), dtype=numpy.uint8 ) @@ -641,13 +644,16 @@ def test_assay_ontology_term_id__as_categorical(self, validator_with_visium_assa validator: Validator = validator_with_visium_assay # check encoding as string - validator._check_spatial_obs() + validator.reset(None, 2) + validator._check_spatial() + validator._validate_raw() assert validator.errors == [] - validator.reset() # force encoding as 'categorical' + validator.reset(None, 2) validator.adata.obs["assay_ontology_term_id"] = validator.adata.obs["assay_ontology_term_id"].astype("category") - validator._check_spatial_obs() + validator._check_spatial() + validator._validate_raw() assert validator.errors == [] @pytest.mark.parametrize( @@ -1721,6 +1727,7 @@ def test_genetic_ancestry_same_donor_id(self, validator_with_adata): # Second row should have identical donor id + genetic ancestry values, so this should pass validation validator.adata.obs.iloc[1] = validator.adata.obs.iloc[0].values + validator.validate_adata() assert validator.errors == [] @@ -1731,11 +1738,13 @@ def test_genetic_ancestry_same_donor_id(self, validator_with_adata): validator.adata.obs["genetic_ancestry_Indigenous_American"] = [0.0, 0.0] validator.adata.obs["genetic_ancestry_Oceanian"] = [0.0, 0.0] validator.adata.obs["genetic_ancestry_South_Asian"] = [0.0, 0.0] + validator.reset(None, 2) validator.validate_adata() assert len(validator.errors) > 0 # Change the donor id back to two different donor id's. Now, this should pass validation validator.adata.obs["donor_id"] = original_donor_id_column + validator.reset(None, 2) validator.validate_adata() assert validator.errors == [] @@ -1818,6 +1827,7 @@ def test_feature_is_filtered(self, validator_with_adata): X[i, 0] = 0 X[0, 0] = 1 + validator.reset(None, 2) validator.validate_adata() assert validator.errors == [ "ERROR: Some features are 'True' in 'feature_is_filtered' of dataframe 'var', " @@ -1827,6 +1837,7 @@ def test_feature_is_filtered(self, validator_with_adata): # Test that feature_is_filtered is a bool and not a string var["feature_is_filtered"] = "string" + validator.reset(None, 2) validator.validate_adata() assert validator.errors == [ "ERROR: Column 'feature_is_filtered' in dataframe 'var' must be boolean, not 'object'." @@ -2406,6 +2417,7 @@ def test_obsm_values_nan(self, validator_with_visium_assay, key): # Check embedding has any NaN obsm[key][0:100, 1] = numpy.nan + validator.reset(None, 2) validator.validate_adata() if key != "spatial": @@ -2416,6 +2428,7 @@ def test_obsm_values_nan(self, validator_with_visium_assay, key): # Check embedding has all NaNs all_nan = numpy.full(obsm[key].shape, numpy.nan) obsm[key] = all_nan + validator.reset(None, 2) validator.validate_adata() if key != "spatial": assert validator.errors == [f"ERROR: adata.obsm['{key}'] contains all NaN values."] @@ -2442,6 +2455,7 @@ def test_obsm_values_no_X_embedding__visium_dataset(self, validator_with_visium_ validator = validator_with_visium_assay validator.adata.uns["default_embedding"] = "spatial" del validator.adata.obsm["X_umap"] + validator.reset(None, 2) validator.validate_adata() assert validator.errors == [] assert validator.is_spatial is True @@ -2545,6 +2559,7 @@ def test_obsm_key_name_whitespace(self, validator_with_adata): del obsm["X_ umap"] obsm["u m a p"] = obsm["X_umap"] + validator.reset(None, 2) validator.validate_adata() assert validator.errors == [ "ERROR: Embedding key in 'adata.obsm' u m a p does not match the regex pattern ^[a-zA-Z][a-zA-Z0-9_.-]*$."