Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add feature length field, annotated by add-labels function #624

Merged
merged 6 commits into from
Sep 19, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions cellxgene_schema_cli/cellxgene_schema/ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,9 @@ def __init__(self, species: SupportedOrganisms):
gene = gene.rstrip().split(",")
gene_id = gene[0]
gene_label = gene[1]
gene_length = int(gene[3])

self.gene_dict[gene_id] = gene_label
self.gene_dict[gene_id] = (gene_label, gene_length)

# Keeps track of duplicated gene labels
if gene_label in gene_labels:
Expand All @@ -76,9 +77,9 @@ def __init__(self, species: SupportedOrganisms):
gene_labels.add(gene_label)

# Makes gene labels unique
for gene_id, gene_label in self.gene_dict.items():
for gene_id, (gene_label, gene_length) in self.gene_dict.items():
if gene_label in duplicated_gene_labels:
self.gene_dict[gene_id] = gene_label + "_" + gene_id
self.gene_dict[gene_id] = (gene_label + "_" + gene_id, gene_length)

def is_valid_id(self, gene_id: str) -> bool:
"""
Expand All @@ -105,7 +106,22 @@ def get_symbol(self, gene_id) -> str:
if not self.is_valid_id(gene_id):
raise ValueError(f"The id '{gene_id}' is not a valid ENSEMBL id for '{self.species}'")

return self.gene_dict[gene_id]
return self.gene_dict[gene_id][0]
nayib-jose-gloria marked this conversation as resolved.
Show resolved Hide resolved

def get_length(self, gene_id) -> int:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

type gene_id

"""
Gets feature length associated to the ENSEBML id

:param str gene_id: ENSEMBL gene id

:rtype int
:return A gene length
"""

if not self.is_valid_id(gene_id):
Copy link

@danieljhegeman danieljhegeman Sep 18, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Bento007 has encouraged me to avoid negatives a la if not... where possible

Could turn the last few lines of this method into a one-liner:
return self.gene_dict[gene_id][1] if self.is_valid_id(gene_id) else raise ValueError(f"The id '{gene_id}' is not a valid ENSEMBL id for '{self.species}'")

🙂

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll update to avoid the negative, but I personally find one-liners harder to read

raise ValueError(f"The id '{gene_id}' is not a valid ENSEMBL id for '{self.species}'")

return self.gene_dict[gene_id][1]


class OntologyChecker:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ components:
index:
unique: true
type: feature_id
# Using IDs add two columns: feature_id, and feature_reference
add_labels:
-
type: feature_id
Expand All @@ -67,6 +66,9 @@ components:
-
type: feature_biotype
to_column: feature_biotype
-
type: feature_length
to_column: feature_length
# All columns are required
columns:
feature_is_filtered:
Expand All @@ -78,7 +80,6 @@ components:
index:
unique: true
type: feature_id
# Using IDs add two columns: feature_id, and feature_reference
add_labels:
-
type: feature_id
Expand All @@ -89,6 +90,9 @@ components:
-
type: feature_biotype
to_column: feature_biotype
-
type: feature_length
to_column: feature_length
obs:
type: dataframe
required: null # Means it's required
Expand Down
31 changes: 27 additions & 4 deletions cellxgene_schema_cli/cellxgene_schema/write_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,27 @@ def _get_mapping_dict_feature_biotype(self, ids: List[str]) -> Dict[str, str]:

return mapping_dict

def _get_mapping_dict_feature_length(self, ids: List[str]) -> Dict[str, int]:
"""
Creates a mapping dictionary of feature IDs and feature length, fetching from pre-calculated gene info CSVs
derived from GENCODE mappings for supported organisms. Set to 0 for non-gene features.

:param list[str] ids: feature IDs use for mapping

:return a mapping dictionary: {id: <int>, id: 0, ...}
:rtype dict
"""
mapping_dict = {}

for i in ids:
if i.startswith("ENS"):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we start putting hardcoded string into a constant.py file? Not sure we have enough cases for it to make sense.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might be a good thing to do at the end of the epic? Or we can start now

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it's necessary at this point, we can revisit if we find that it's coming up frequently and causing confusion.

organism = ontology.get_organism_from_feature_id(i)
mapping_dict[i] = self.validator.gene_checkers[organism].get_length(i)
else:
mapping_dict[i] = 0

return mapping_dict

def _get_labels(
self,
component: str,
Expand Down Expand Up @@ -262,6 +283,9 @@ def _get_labels(
elif label_type == "feature_biotype":
mapping_dict = self._get_mapping_dict_feature_biotype(ids=ids)

elif label_type == "feature_length":
mapping_dict = self._get_mapping_dict_feature_length(ids=ids)

else:
raise TypeError(f"'{label_type}' is not supported in 'add-labels' functionality")

Expand All @@ -286,9 +310,9 @@ def _add_column(self, component: str, column: str, column_definition: dict):
new_column = self._get_labels(component, column, column_definition, label_def["type"])
new_column_name = label_def["to_column"]

# The sintax below is a programtic way to access obs and var in adata:
# The syntax below is a programmatic way to access obs and var in adata:
# adata.__dict__["_obs"] is adata.obs
# "raw.var" requires to levels of programtic access
# "raw.var" requires to levels of programmatic access
if "." in component:
[first_elem, second_elem] = component.split(".")
self.adata.__dict__["_" + first_elem].__dict__["_" + second_elem][new_column_name] = new_column
Expand All @@ -297,8 +321,7 @@ def _add_column(self, component: str, column: str, column_definition: dict):

def _add_labels(self):
"""
From a valid (per cellxgene's schema) adata, this function adds to self.adata ontology/gene labels
to adata.obs, adata.var, and adata.raw.var respectively
Add columns to dataset dataframes based on values in other columns, as defined in schema definition yaml.
"""
for component in ["obs", "var", "raw.var"]:
# If the component does not exist, skip (this is for raw.var)
Expand Down
8 changes: 4 additions & 4 deletions cellxgene_schema_cli/tests/fixtures/examples_ontology_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
invalid_species = ["Caenorhabditis elegans"]

valid_genes = {
ontology.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000141510": "TP53"},
ontology.SupportedOrganisms.MUS_MUSCULUS: {"ENSMUSG00000059552": "Trp53"},
ontology.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000141510": ("TP53", 5676)},
ontology.SupportedOrganisms.MUS_MUSCULUS: {"ENSMUSG00000059552": ("Trp53", 4045)},
}

invalid_genes = {
ontology.SupportedOrganisms.HOMO_SAPIENS: ["ENSMUSG00000059552", "GENE"],
ontology.SupportedOrganisms.MUS_MUSCULUS: ["ENSG00000141510", "GENE"],
ontology.SupportedOrganisms.HOMO_SAPIENS: ["ENSMUSG00000059552", ("GENE", 1000)],
ontology.SupportedOrganisms.MUS_MUSCULUS: ["ENSG00000141510", ("GENE", 200)],
}

# For ontology checker
Expand Down
9 changes: 5 additions & 4 deletions cellxgene_schema_cli/tests/fixtures/examples_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,17 +137,18 @@
# these columns are defined in the schema
var_expected = pd.DataFrame(
[
["spike-in", False, "ERCC-00002 (spike-in control)", "NCBITaxon:32630"],
["gene", False, "MACF1", "NCBITaxon:9606"],
["gene", False, "Trp53", "NCBITaxon:10090"],
["gene", False, "S", "NCBITaxon:2697049"],
["spike-in", False, "ERCC-00002 (spike-in control)", "NCBITaxon:32630", 0],
["gene", False, "MACF1", "NCBITaxon:9606", 42738],
["gene", False, "Trp53", "NCBITaxon:10090", 4045],
["gene", False, "S", "NCBITaxon:2697049", 3822],
],
index=["ERCC-00002", "ENSG00000127603", "ENSMUSG00000059552", "ENSSASG00005000004"],
columns=[
"feature_biotype",
"feature_is_filtered",
"feature_name",
"feature_reference",
"feature_length",
],
)

Expand Down
6 changes: 5 additions & 1 deletion cellxgene_schema_cli/tests/test_ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,12 @@ def test_valid_genes(self):
for species in self.valid_genes:
geneChecker = ontology.GeneChecker(species)
for gene_id in self.valid_genes[species]:
gene_label = self.valid_genes[species][gene_id]
gene_label = self.valid_genes[species][gene_id][0]
gene_length = self.valid_genes[species][gene_id][1]

self.assertTrue(geneChecker.is_valid_id(gene_id))
self.assertEqual(geneChecker.get_symbol(gene_id), gene_label)
self.assertEqual(geneChecker.get_length(gene_id), gene_length)

def test_invalid_genes(self):
for species in self.invalid_genes:
Expand All @@ -44,6 +46,8 @@ def test_invalid_genes(self):
self.assertFalse(geneChecker.is_valid_id(gene_id))
with self.assertRaises(ValueError):
geneChecker.get_symbol(gene_id)
with self.assertRaises(ValueError):
geneChecker.get_length(gene_id)


class TestOntologyChecker(unittest.TestCase):
Expand Down
47 changes: 30 additions & 17 deletions cellxgene_schema_cli/tests/test_schema_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -1010,6 +1010,29 @@ def test_should_warn_for_low_gene_count(self):
["WARNING: Dataframe 'var' only has 4 rows. Features SHOULD NOT be filtered from expression matrix."],
)

def test_add_label_fields_are_reserved(self):
"""
Raise an error if column names flagged as 'add_label' -> 'to_column' in the schema definition are not available.
"""
for df in ["var", "raw.var"]:
for i in self.validator.schema_def["components"][df]["index"]["add_labels"]:
column = i["to_column"]
with self.subTest(column=column, df=df):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(repeat from another PR) let's sync on this today--this whole module of schema compliance tests uses unittest libraries / syntax, so switching over to pytest I think is out of scope for this ticket + and the other open issues. But we can discuss if it's worth ticketing separately to swap everything over.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

# Resetting validator
self.validator.adata = examples.adata.copy()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe these 2 lines can be removed if you switch to a pytest approach.

self.validator.errors = []

component = getattr_anndata(self.validator.adata, df)
component[column] = "dummy_value"
self.validator.validate_adata()
self.assertEqual(
self.validator.errors,
[
f"ERROR: Add labels error: Column '{column}' is a reserved column name "
f"of '{df}'. Remove it from h5ad and try again."
],
)


class TestUns(BaseValidationTest):
"""
Expand Down Expand Up @@ -1275,32 +1298,22 @@ def setUpClass(cls):
cls.adata_with_labels = examples.adata_with_labels

# Validate test data
validator = Validator()
validator.adata = examples.adata.copy()
validator.validate_adata()
cls.validator = Validator()
cls.validator.adata = examples.adata.copy()
cls.validator.validate_adata()

# Add labels through validator
cls.label_writer = AnnDataLabelAppender(validator)
cls.label_writer = AnnDataLabelAppender(cls.validator)
cls.label_writer._add_labels()

def test_var_added_labels(self):
"""
When a dataset is uploaded, cellxgene Data Portal MUST automatically add the matching human-readable
name for the corresponding feature identifier and the inferred NCBITaxon term for the reference organism
to the var dataframe. Curators MUST NOT annotate the following columns:

- feature_name. this MUST be a human-readable ENSEMBL gene name or a ERCC Spike-In identifier
appended with " spike-in control", corresponding to the feature_id
- feature_reference. This MUST be the reference organism for a feature:
Homo sapiens "NCBITaxon:9606"
Mus musculus "NCBITaxon:10090"
SARS-CoV-2 "NCBITaxon:2697049"
ERCC Spike-Ins "NCBITaxon:32630"
- feature_biotype. This MUST be "gene" if the feature_id is an ENSEMBL gene, or "spike-in" if the feature_id
is an ERCC Spike-In identifier.
to the var dataframe. Curators MUST NOT annotate the columns below:
"""

for column in ["feature_name", "feature_reference", "feature_biotype"]:
for i in self.validator.schema_def["components"]["var"]["index"]["add_labels"]:
column = i["to_column"]
expected_column = self.adata_with_labels.var[column]
obtained_column = self.label_writer.adata.var[column]

Expand Down
25 changes: 23 additions & 2 deletions cellxgene_schema_cli/tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ def test_get_dictionary_mapping_feature_id(self):

# Bad
ids = ["NO_GENE"]
expected_dict = dict(zip(ids, labels))
with self.assertRaises(KeyError):
self.writer._get_mapping_dict_feature_id(ids)

Expand All @@ -136,7 +135,29 @@ def test_get_dictionary_mapping_feature_reference(self):

# Bad
ids = ["NO_GENE"]
expected_dict = dict(zip(ids, labels))
with self.assertRaises(KeyError):
self.writer._get_mapping_dict_feature_id(ids)

def test_get_dictionary_mapping_feature_length(self):
# Good
ids = [
"ERCC-00002",
"ENSG00000127603",
"ENSMUSG00000059552",
"ENSSASG00005000004",
]
# values derived from csv
gene_lengths = [
0, # non-gene feature, so set to 0 regardless of csv value
42738,
4045,
3822,
]
expected_dict = dict(zip(ids, gene_lengths))
self.assertEqual(self.writer._get_mapping_dict_feature_length(ids), expected_dict)

# Bad
ids = ["NO_GENE"]

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For all of the test_get_dictionary_* tests in TestAddLabelFunctions, you wish to consider factoring out common dict comparison logic and ids initial variables. Could create a helper function or could simply group all under a single test. Not required though.

with self.assertRaises(KeyError):
self.writer._get_mapping_dict_feature_id(ids)

Expand Down
Loading