Skip to content

Commit

Permalink
add more tests
Browse files Browse the repository at this point in the history
  • Loading branch information
nayib-jose-gloria committed Sep 15, 2023
1 parent f6bb232 commit 78c558f
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 21 deletions.
9 changes: 5 additions & 4 deletions cellxgene_schema_cli/tests/fixtures/examples_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,17 +137,18 @@
# these columns are defined in the schema
# Expected ``var`` dataframe for the example fixtures: one row per feature ID
# listed in ``index``, and exactly one value per entry of ``columns`` in each
# row. ``feature_length`` is the trailing integer of every row, so the row
# width (5) matches the number of column names and the row count (4) matches
# the number of index labels.
var_expected = pd.DataFrame(
    [
        ["spike-in", False, "ERCC-00002 (spike-in control)", "NCBITaxon:32630", 0],
        ["gene", False, "MACF1", "NCBITaxon:9606", 42738],
        ["gene", False, "Trp53", "NCBITaxon:10090", 4045],
        ["gene", False, "S", "NCBITaxon:2697049", 3822],
    ],
    index=["ERCC-00002", "ENSG00000127603", "ENSMUSG00000059552", "ENSSASG00005000004"],
    columns=[
        "feature_biotype",
        "feature_is_filtered",
        "feature_name",
        "feature_reference",
        "feature_length",
    ],
)

Expand Down
46 changes: 29 additions & 17 deletions cellxgene_schema_cli/tests/test_schema_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -1010,6 +1010,28 @@ def test_should_warn_for_low_gene_count(self):
["WARNING: Dataframe 'var' only has 4 rows. Features SHOULD NOT be filtered from expression matrix."],
)

def test_add_label_fields_are_reserved(self):
    """
    Columns named by 'add_labels' -> 'to_column' in the schema definition are reserved.

    For both 'var' and 'raw.var', pre-populating any such column in the dataset
    must make validation report exactly one "reserved column name" error.
    """
    for component_name in ("var", "raw.var"):
        label_defs = self.validator.schema_def["components"][component_name]["index"]["add_labels"]
        for label_def in label_defs:
            reserved = label_def["to_column"]
            expected_errors = [
                f"ERROR: Add labels error: Column '{reserved}' is a reserved column name "
                f"of '{component_name}'. Remove it from h5ad and try again."
            ]
            with self.subTest(column=reserved, df=component_name):
                # Start every sub-test from a pristine copy of the example data
                # and an empty error list, so earlier iterations cannot leak in.
                self.validator.adata = examples.adata.copy()
                self.validator.errors = []

                # Occupy the reserved column before validating.
                target = getattr_anndata(self.validator.adata, component_name)
                target[reserved] = "dummy_value"

                self.validator.validate_adata()
                self.assertEqual(self.validator.errors, expected_errors)


class TestUns(BaseValidationTest):
"""
Expand Down Expand Up @@ -1275,32 +1297,22 @@ def setUpClass(cls):
cls.adata_with_labels = examples.adata_with_labels

# Validate test data
validator = Validator()
validator.adata = examples.adata.copy()
validator.validate_adata()
cls.validator = Validator()
cls.validator.adata = examples.adata.copy()
cls.validator.validate_adata()

# Add labels through validator
cls.label_writer = AnnDataLabelAppender(validator)
cls.label_writer = AnnDataLabelAppender(cls.validator)
cls.label_writer._add_labels()

def test_var_added_labels(self):
"""
When a dataset is uploaded, cellxgene Data Portal MUST automatically add the matching human-readable
name for the corresponding feature identifier and the inferred NCBITaxon term for the reference organism
to the var dataframe. Curators MUST NOT annotate the following columns:
- feature_name. this MUST be a human-readable ENSEMBL gene name or a ERCC Spike-In identifier
appended with " spike-in control", corresponding to the feature_id
- feature_reference. This MUST be the reference organism for a feature:
Homo sapiens "NCBITaxon:9606"
Mus musculus "NCBITaxon:10090"
SARS-CoV-2 "NCBITaxon:2697049"
ERCC Spike-Ins "NCBITaxon:32630"
- feature_biotype. This MUST be "gene" if the feature_id is an ENSEMBL gene, or "spike-in" if the feature_id
is an ERCC Spike-In identifier.
to the var dataframe. Curators MUST NOT annotate the columns below:
"""

for column in ["feature_name", "feature_reference", "feature_biotype"]:
for i in self.validator.schema_def["components"]["var"]["index"]["add_labels"]:
column = i["to_column"]
expected_column = self.adata_with_labels.var[column]
obtained_column = self.label_writer.adata.var[column]

Expand Down

0 comments on commit 78c558f

Please sign in to comment.