-
Notifications
You must be signed in to change notification settings - Fork 24
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: add feature length field, annotated by add-labels function #624
Changes from all commits
f6bb232
78c558f
fb60f0b
5cea47c
3d8cb5c
e0d430e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1010,6 +1010,29 @@ def test_should_warn_for_low_gene_count(self): | |
["WARNING: Dataframe 'var' only has 4 rows. Features SHOULD NOT be filtered from expression matrix."], | ||
) | ||
|
||
def test_add_label_fields_are_reserved(self): | ||
""" | ||
Raise an error if column names flagged as 'add_label' -> 'to_column' in the schema definition are not available. | ||
""" | ||
for df in ["var", "raw.var"]: | ||
for i in self.validator.schema_def["components"][df]["index"]["add_labels"]: | ||
column = i["to_column"] | ||
with self.subTest(column=column, df=df): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (repeat from another PR) let's sync on this today--this whole module of schema compliance tests uses unittest libraries / syntax, so switching over to pytest I think is out of scope for this ticket + and the other open issues. But we can discuss if it's worth ticketing separately to swap everything over. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. https://app.zenhub.com/workspaces/single-cell-5e2a191dad828d52cc78b028/issues/gh/chanzuckerberg/single-cell-curation/633 tracking a separate issue for this |
||
# Resetting validator | ||
self.validator.adata = examples.adata.copy() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe these 2 lines can be removed if you switch to a pytest approach. |
||
self.validator.errors = [] | ||
|
||
component = getattr_anndata(self.validator.adata, df) | ||
component[column] = "dummy_value" | ||
self.validator.validate_adata() | ||
self.assertEqual( | ||
self.validator.errors, | ||
[ | ||
f"ERROR: Add labels error: Column '{column}' is a reserved column name " | ||
f"of '{df}'. Remove it from h5ad and try again." | ||
], | ||
) | ||
|
||
|
||
class TestUns(BaseValidationTest): | ||
""" | ||
|
@@ -1275,32 +1298,22 @@ def setUpClass(cls): | |
cls.adata_with_labels = examples.adata_with_labels | ||
|
||
# Validate test data | ||
validator = Validator() | ||
validator.adata = examples.adata.copy() | ||
validator.validate_adata() | ||
cls.validator = Validator() | ||
cls.validator.adata = examples.adata.copy() | ||
cls.validator.validate_adata() | ||
|
||
# Add labels through validator | ||
cls.label_writer = AnnDataLabelAppender(validator) | ||
cls.label_writer = AnnDataLabelAppender(cls.validator) | ||
cls.label_writer._add_labels() | ||
|
||
def test_var_added_labels(self): | ||
""" | ||
When a dataset is uploaded, cellxgene Data Portal MUST automatically add the matching human-readable | ||
name for the corresponding feature identifier and the inferred NCBITaxon term for the reference organism | ||
to the var dataframe. Curators MUST NOT annotate the following columns: | ||
|
||
- feature_name. this MUST be a human-readable ENSEMBL gene name or a ERCC Spike-In identifier | ||
appended with " spike-in control", corresponding to the feature_id | ||
- feature_reference. This MUST be the reference organism for a feature: | ||
Homo sapiens "NCBITaxon:9606" | ||
Mus musculus "NCBITaxon:10090" | ||
SARS-CoV-2 "NCBITaxon:2697049" | ||
ERCC Spike-Ins "NCBITaxon:32630" | ||
- feature_biotype. This MUST be "gene" if the feature_id is an ENSEMBL gene, or "spike-in" if the feature_id | ||
is an ERCC Spike-In identifier. | ||
to the var dataframe. Curators MUST NOT annotate the columns below: | ||
""" | ||
|
||
for column in ["feature_name", "feature_reference", "feature_biotype"]: | ||
for i in self.validator.schema_def["components"]["var"]["index"]["add_labels"]: | ||
column = i["to_column"] | ||
expected_column = self.adata_with_labels.var[column] | ||
obtained_column = self.label_writer.adata.var[column] | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -113,7 +113,6 @@ def test_get_dictionary_mapping_feature_id(self): | |
|
||
# Bad | ||
ids = ["NO_GENE"] | ||
expected_dict = dict(zip(ids, labels)) | ||
with self.assertRaises(KeyError): | ||
self.writer._get_mapping_dict_feature_id(ids) | ||
|
||
|
@@ -136,7 +135,29 @@ def test_get_dictionary_mapping_feature_reference(self): | |
|
||
# Bad | ||
ids = ["NO_GENE"] | ||
expected_dict = dict(zip(ids, labels)) | ||
with self.assertRaises(KeyError): | ||
self.writer._get_mapping_dict_feature_id(ids) | ||
|
||
def test_get_dictionary_mapping_feature_length(self): | ||
# Good | ||
ids = [ | ||
"ERCC-00002", | ||
"ENSG00000127603", | ||
"ENSMUSG00000059552", | ||
"ENSSASG00005000004", | ||
] | ||
# values derived from csv | ||
gene_lengths = [ | ||
0, # non-gene feature, so set to 0 regardless of csv value | ||
42738, | ||
4045, | ||
3822, | ||
] | ||
expected_dict = dict(zip(ids, gene_lengths)) | ||
self.assertEqual(self.writer._get_mapping_dict_feature_length(ids), expected_dict) | ||
|
||
# Bad | ||
ids = ["NO_GENE"] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For all of the |
||
with self.assertRaises(KeyError): | ||
self.writer._get_mapping_dict_feature_id(ids) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we start putting hardcoded string into a constant.py file? Not sure we have enough cases for it to make sense.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Might be a good thing to do at the end of the epic? Or we can start now
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think it's necessary at this point, we can revisit if we find that it's coming up frequently and causing confusion.