feat: add feature length field, annotated by add-labels function

chanzuckerberg · Sep 15, 2023 · f6bb232 · f6bb232
1 parent ccad8d9
commit f6bb232
Show file tree

Hide file tree

Showing 6 changed files with 82 additions and 13 deletions.
diff --git a/cellxgene_schema_cli/cellxgene_schema/ontology.py b/cellxgene_schema_cli/cellxgene_schema/ontology.py
@@ -66,8 +66,9 @@ def __init__(self, species: SupportedOrganisms):
                 gene = gene.rstrip().split(",")
                 gene_id = gene[0]
                 gene_label = gene[1]
+                gene_length = int(gene[3])
 
-                self.gene_dict[gene_id] = gene_label
+                self.gene_dict[gene_id] = (gene_label, gene_length)
 
                 # Keeps track of duplicated gene labels
                 if gene_label in gene_labels:
@@ -76,9 +77,9 @@ def __init__(self, species: SupportedOrganisms):
                     gene_labels.add(gene_label)
 
             # Makes gene labels unique
-            for gene_id, gene_label in self.gene_dict.items():
+            for gene_id, (gene_label, gene_length) in self.gene_dict.items():
                 if gene_label in duplicated_gene_labels:
-                    self.gene_dict[gene_id] = gene_label + "_" + gene_id
+                    self.gene_dict[gene_id] = (gene_label + "_" + gene_id, gene_length)
 
     def is_valid_id(self, gene_id: str) -> bool:
         """
@@ -105,7 +106,22 @@ def get_symbol(self, gene_id) -> str:
         if not self.is_valid_id(gene_id):
             raise ValueError(f"The id '{gene_id}' is not a valid ENSEMBL id for '{self.species}'")
 
-        return self.gene_dict[gene_id]
+        return self.gene_dict[gene_id][0]
+
+    def get_length(self, gene_id) -> int:
+        """
+        Gets the feature length associated with the ENSEMBL id
+
+        :param str gene_id: ENSEMBL gene id
+
+        :rtype int
+        :return A gene length
+        """
+
+        if not self.is_valid_id(gene_id):
+            raise ValueError(f"The id '{gene_id}' is not a valid ENSEMBL id for '{self.species}'")
+
+        return self.gene_dict[gene_id][1]
 
 
 class OntologyChecker:

diff --git a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml
@@ -56,7 +56,6 @@ components:
         index:
             unique: true
             type: feature_id
-            # Using IDs add two columns: feature_id, and feature_reference
             add_labels:
                 -
                     type: feature_id
@@ -67,6 +66,9 @@ components:
                 -
                     type: feature_biotype
                     to_column: feature_biotype
+                -
+                    type: feature_length
+                    to_column: feature_length
         # All columns are required
         columns:
             feature_is_filtered:
@@ -78,7 +80,6 @@ components:
         index:
             unique: true
             type: feature_id
-            # Using IDs add two columns: feature_id, and feature_reference
             add_labels:
                 -
                     type: feature_id
@@ -89,6 +90,9 @@ components:
                 -
                     type: feature_biotype
                     to_column: feature_biotype
+                -
+                    type: feature_length
+                    to_column: feature_length
     obs:
         type: dataframe
         required: null # Means it's required

diff --git a/cellxgene_schema_cli/cellxgene_schema/write_labels.py b/cellxgene_schema_cli/cellxgene_schema/write_labels.py
@@ -209,6 +209,27 @@ def _get_mapping_dict_feature_biotype(self, ids: List[str]) -> Dict[str, str]:
 
         return mapping_dict
 
+    def _get_mapping_dict_feature_length(self, ids: List[str]) -> Dict[str, int]:
+        """
+        Creates a mapping dictionary from feature IDs to feature lengths, fetched from pre-calculated gene info CSVs
+        derived from GENCODE mappings for supported organisms. The length is set to 0 for non-gene (non-"ENS") IDs.
+
+        :param list[str] ids: feature IDs used for mapping
+
+        :return a mapping dictionary: {id: <int>, id: 0, ...}
+        :rtype dict
+        """
+        mapping_dict = {}
+
+        for i in ids:
+            if i.startswith("ENS"):
+                organism = ontology.get_organism_from_feature_id(i)
+                mapping_dict[i] = self.validator.gene_checkers[organism].get_length(i)
+            else:
+                mapping_dict[i] = 0
+
+        return mapping_dict
+
     def _get_labels(
         self,
         component: str,
@@ -262,6 +283,9 @@ def _get_labels(
         elif label_type == "feature_biotype":
             mapping_dict = self._get_mapping_dict_feature_biotype(ids=ids)
 
+        elif label_type == "feature_length":
+            mapping_dict = self._get_mapping_dict_feature_length(ids=ids)
+
         else:
             raise TypeError(f"'{label_type}' is not supported in 'add-labels' functionality")
 

diff --git a/cellxgene_schema_cli/tests/fixtures/examples_ontology_test.py b/cellxgene_schema_cli/tests/fixtures/examples_ontology_test.py
@@ -4,13 +4,13 @@
 invalid_species = ["Caenorhabditis elegans"]
 
 valid_genes = {
-    ontology.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000141510": "TP53"},
-    ontology.SupportedOrganisms.MUS_MUSCULUS: {"ENSMUSG00000059552": "Trp53"},
+    ontology.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000141510": ("TP53", 5676)},
+    ontology.SupportedOrganisms.MUS_MUSCULUS: {"ENSMUSG00000059552": ("Trp53", 4045)},
 }
 
 invalid_genes = {
-    ontology.SupportedOrganisms.HOMO_SAPIENS: ["ENSMUSG00000059552", "GENE"],
-    ontology.SupportedOrganisms.MUS_MUSCULUS: ["ENSG00000141510", "GENE"],
+    ontology.SupportedOrganisms.HOMO_SAPIENS: ["ENSMUSG00000059552", ("GENE", 1000)],
+    ontology.SupportedOrganisms.MUS_MUSCULUS: ["ENSG00000141510", ("GENE", 200)],
 }
 
 # For ontology checker

diff --git a/cellxgene_schema_cli/tests/test_ontology.py b/cellxgene_schema_cli/tests/test_ontology.py
@@ -32,10 +32,12 @@ def test_valid_genes(self):
         for species in self.valid_genes:
             geneChecker = ontology.GeneChecker(species)
             for gene_id in self.valid_genes[species]:
-                gene_label = self.valid_genes[species][gene_id]
+                gene_label = self.valid_genes[species][gene_id][0]
+                gene_length = self.valid_genes[species][gene_id][1]
 
                 self.assertTrue(geneChecker.is_valid_id(gene_id))
                 self.assertEqual(geneChecker.get_symbol(gene_id), gene_label)
+                self.assertEqual(geneChecker.get_length(gene_id), gene_length)
 
     def test_invalid_genes(self):
         for species in self.invalid_genes:
@@ -44,6 +46,8 @@ def test_invalid_genes(self):
                 self.assertFalse(geneChecker.is_valid_id(gene_id))
                 with self.assertRaises(ValueError):
                     geneChecker.get_symbol(gene_id)
+                with self.assertRaises(ValueError):
+                    geneChecker.get_length(gene_id)
 
 
 class TestOntologyChecker(unittest.TestCase):

diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py
@@ -113,7 +113,6 @@ def test_get_dictionary_mapping_feature_id(self):
 
         # Bad
         ids = ["NO_GENE"]
-        expected_dict = dict(zip(ids, labels))
         with self.assertRaises(KeyError):
             self.writer._get_mapping_dict_feature_id(ids)
 
@@ -136,7 +135,29 @@ def test_get_dictionary_mapping_feature_reference(self):
 
         # Bad
         ids = ["NO_GENE"]
-        expected_dict = dict(zip(ids, labels))
+        with self.assertRaises(KeyError):
+            self.writer._get_mapping_dict_feature_id(ids)
+
+    def test_get_dictionary_mapping_feature_length(self):
+        # Good
+        ids = [
+            "ERCC-00002",
+            "ENSG00000127603",
+            "ENSMUSG00000059552",
+            "ENSSASG00005000004",
+        ]
+        # values derived from csv
+        gene_lengths = [
+            0,  # non-gene feature, so set to 0 regardless of csv value
+            42738,
+            4045,
+            3822,
+        ]
+        expected_dict = dict(zip(ids, gene_lengths))
+        self.assertEqual(self.writer._get_mapping_dict_feature_length(ids), expected_dict)
+
+        # Bad
+        ids = ["NO_GENE"]
         with self.assertRaises(KeyError):
             self.writer._get_mapping_dict_feature_id(ids)