Skip to content

Commit

Permalink
feat: add feature length field, annotated by add-labels function
Browse files Browse the repository at this point in the history
  • Loading branch information
nayib-jose-gloria committed Sep 15, 2023
1 parent ccad8d9 commit f6bb232
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 13 deletions.
24 changes: 20 additions & 4 deletions cellxgene_schema_cli/cellxgene_schema/ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,9 @@ def __init__(self, species: SupportedOrganisms):
gene = gene.rstrip().split(",")
gene_id = gene[0]
gene_label = gene[1]
gene_length = int(gene[3])

self.gene_dict[gene_id] = gene_label
self.gene_dict[gene_id] = (gene_label, gene_length)

# Keeps track of duplicated gene labels
if gene_label in gene_labels:
Expand All @@ -76,9 +77,9 @@ def __init__(self, species: SupportedOrganisms):
gene_labels.add(gene_label)

# Makes gene labels unique
for gene_id, gene_label in self.gene_dict.items():
for gene_id, (gene_label, gene_length) in self.gene_dict.items():
if gene_label in duplicated_gene_labels:
self.gene_dict[gene_id] = gene_label + "_" + gene_id
self.gene_dict[gene_id] = (gene_label + "_" + gene_id, gene_length)

def is_valid_id(self, gene_id: str) -> bool:
"""
Expand All @@ -105,7 +106,22 @@ def get_symbol(self, gene_id) -> str:
if not self.is_valid_id(gene_id):
raise ValueError(f"The id '{gene_id}' is not a valid ENSEMBL id for '{self.species}'")

return self.gene_dict[gene_id]
return self.gene_dict[gene_id][0]

def get_length(self, gene_id) -> int:
    """
    Looks up the feature length stored for an ENSEMBL id
    :param str gene_id: ENSEMBL gene id
    :rtype int
    :return The gene length recorded for the id
    :raises ValueError: when the id is not valid for this checker's species
    """

    if self.is_valid_id(gene_id):
        # Entries are indexed positionally: 0 is the symbol, 1 is the length
        return self.gene_dict[gene_id][1]

    raise ValueError(f"The id '{gene_id}' is not a valid ENSEMBL id for '{self.species}'")


class OntologyChecker:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ components:
index:
unique: true
type: feature_id
# Using IDs add two columns: feature_id, and feature_reference
add_labels:
-
type: feature_id
Expand All @@ -67,6 +66,9 @@ components:
-
type: feature_biotype
to_column: feature_biotype
-
type: feature_length
to_column: feature_length
# All columns are required
columns:
feature_is_filtered:
Expand All @@ -78,7 +80,6 @@ components:
index:
unique: true
type: feature_id
# Using IDs add two columns: feature_id, and feature_reference
add_labels:
-
type: feature_id
Expand All @@ -89,6 +90,9 @@ components:
-
type: feature_biotype
to_column: feature_biotype
-
type: feature_length
to_column: feature_length
obs:
type: dataframe
required: null # Means it's required
Expand Down
24 changes: 24 additions & 0 deletions cellxgene_schema_cli/cellxgene_schema/write_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,27 @@ def _get_mapping_dict_feature_biotype(self, ids: List[str]) -> Dict[str, str]:

return mapping_dict

def _get_mapping_dict_feature_length(self, ids: List[str]) -> Dict[str, int]:
    """
    Builds the lookup from feature ID to feature length. IDs starting with "ENS" are resolved through the gene
    checker registered for the organism inferred from the ID; any other ID is treated as a non-gene feature and
    mapped to 0.
    :param list[str] ids: feature IDs to map
    :return a mapping dictionary: {id: <int>, id: 0, ...}
    :rtype dict
    """

    def resolve_length(feature_id):
        # Non-gene features never consult the gene checkers
        if not feature_id.startswith("ENS"):
            return 0
        organism = ontology.get_organism_from_feature_id(feature_id)
        return self.validator.gene_checkers[organism].get_length(feature_id)

    return {feature_id: resolve_length(feature_id) for feature_id in ids}

def _get_labels(
self,
component: str,
Expand Down Expand Up @@ -262,6 +283,9 @@ def _get_labels(
elif label_type == "feature_biotype":
mapping_dict = self._get_mapping_dict_feature_biotype(ids=ids)

elif label_type == "feature_length":
mapping_dict = self._get_mapping_dict_feature_length(ids=ids)

else:
raise TypeError(f"'{label_type}' is not supported in 'add-labels' functionality")

Expand Down
8 changes: 4 additions & 4 deletions cellxgene_schema_cli/tests/fixtures/examples_ontology_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
invalid_species = ["Caenorhabditis elegans"]

valid_genes = {
ontology.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000141510": "TP53"},
ontology.SupportedOrganisms.MUS_MUSCULUS: {"ENSMUSG00000059552": "Trp53"},
ontology.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000141510": ("TP53", 5676)},
ontology.SupportedOrganisms.MUS_MUSCULUS: {"ENSMUSG00000059552": ("Trp53", 4045)},
}

invalid_genes = {
ontology.SupportedOrganisms.HOMO_SAPIENS: ["ENSMUSG00000059552", "GENE"],
ontology.SupportedOrganisms.MUS_MUSCULUS: ["ENSG00000141510", "GENE"],
ontology.SupportedOrganisms.HOMO_SAPIENS: ["ENSMUSG00000059552", ("GENE", 1000)],
ontology.SupportedOrganisms.MUS_MUSCULUS: ["ENSG00000141510", ("GENE", 200)],
}

# For ontology checker
Expand Down
6 changes: 5 additions & 1 deletion cellxgene_schema_cli/tests/test_ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,12 @@ def test_valid_genes(self):
for species in self.valid_genes:
geneChecker = ontology.GeneChecker(species)
for gene_id in self.valid_genes[species]:
gene_label = self.valid_genes[species][gene_id]
gene_label = self.valid_genes[species][gene_id][0]
gene_length = self.valid_genes[species][gene_id][1]

self.assertTrue(geneChecker.is_valid_id(gene_id))
self.assertEqual(geneChecker.get_symbol(gene_id), gene_label)
self.assertEqual(geneChecker.get_length(gene_id), gene_length)

def test_invalid_genes(self):
for species in self.invalid_genes:
Expand All @@ -44,6 +46,8 @@ def test_invalid_genes(self):
self.assertFalse(geneChecker.is_valid_id(gene_id))
with self.assertRaises(ValueError):
geneChecker.get_symbol(gene_id)
with self.assertRaises(ValueError):
geneChecker.get_length(gene_id)


class TestOntologyChecker(unittest.TestCase):
Expand Down
25 changes: 23 additions & 2 deletions cellxgene_schema_cli/tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ def test_get_dictionary_mapping_feature_id(self):

# Bad
ids = ["NO_GENE"]
expected_dict = dict(zip(ids, labels))
with self.assertRaises(KeyError):
self.writer._get_mapping_dict_feature_id(ids)

Expand All @@ -136,7 +135,29 @@ def test_get_dictionary_mapping_feature_reference(self):

# Bad
ids = ["NO_GENE"]
expected_dict = dict(zip(ids, labels))
with self.assertRaises(KeyError):
self.writer._get_mapping_dict_feature_id(ids)

def test_get_dictionary_mapping_feature_length(self):
    """Checks the feature ID -> feature length mapping built by the label writer."""
    # Good
    ids = [
        "ERCC-00002",
        "ENSG00000127603",
        "ENSMUSG00000059552",
        "ENSSASG00005000004",
    ]
    # values derived from csv
    gene_lengths = [
        0,  # non-gene feature, so set to 0 regardless of csv value
        42738,
        4045,
        3822,
    ]
    expected_dict = dict(zip(ids, gene_lengths))
    self.assertEqual(self.writer._get_mapping_dict_feature_length(ids), expected_dict)

    # Unrecognized ID: anything that does not start with "ENS" is handled as a non-gene feature, so the
    # length mapper reports 0 instead of raising. Assert on the length mapper itself rather than on
    # _get_mapping_dict_feature_id, which is covered by its own test.
    ids = ["NO_GENE"]
    self.assertEqual(self.writer._get_mapping_dict_feature_length(ids), {"NO_GENE": 0})

Expand Down

0 comments on commit f6bb232

Please sign in to comment.