feat: add feature length field, annotated by add-labels function (#624)

* feat: add feature length field, annotated by add-labels function
chanzuckerberg · Sep 19, 2023 · 1f482ae · 1f482ae
1 parent fd513fc
commit 1f482ae
Show file tree

Hide file tree

Showing 8 changed files with 122 additions and 40 deletions.
diff --git a/cellxgene_schema_cli/cellxgene_schema/ontology.py b/cellxgene_schema_cli/cellxgene_schema/ontology.py
@@ -66,8 +66,9 @@ def __init__(self, species: SupportedOrganisms):
                 gene = gene.rstrip().split(",")
                 gene_id = gene[0]
                 gene_label = gene[1]
+                gene_length = int(gene[3])
 
-                self.gene_dict[gene_id] = gene_label
+                self.gene_dict[gene_id] = (gene_label, gene_length)
 
                 # Keeps track of duplicated gene labels
                 if gene_label in gene_labels:
@@ -76,9 +77,9 @@ def __init__(self, species: SupportedOrganisms):
                     gene_labels.add(gene_label)
 
             # Makes gene labels unique
-            for gene_id, gene_label in self.gene_dict.items():
+            for gene_id, (gene_label, gene_length) in self.gene_dict.items():
                 if gene_label in duplicated_gene_labels:
-                    self.gene_dict[gene_id] = gene_label + "_" + gene_id
+                    self.gene_dict[gene_id] = (gene_label + "_" + gene_id, gene_length)
 
     def is_valid_id(self, gene_id: str) -> bool:
         """
@@ -92,7 +93,7 @@ def is_valid_id(self, gene_id: str) -> bool:
 
         return gene_id in self.gene_dict
 
-    def get_symbol(self, gene_id) -> str:
+    def get_symbol(self, gene_id: str) -> str:
         """
         Gets symbol associated to the ENSEBML id
 
@@ -102,10 +103,25 @@ def get_symbol(self, gene_id) -> str:
         :return A gene symbol
         """
 
-        if not self.is_valid_id(gene_id):
+        if self.is_valid_id(gene_id):
+            return self.gene_dict[gene_id][0]
+        else:
             raise ValueError(f"The id '{gene_id}' is not a valid ENSEMBL id for '{self.species}'")
 
-        return self.gene_dict[gene_id]
+    def get_length(self, gene_id: str) -> int:
+        """
+        Gets feature length associated to the ENSEBML id
+
+        :param str gene_id: ENSEMBL gene id
+
+        :rtype int
+        :return A gene length
+        """
+
+        if self.is_valid_id(gene_id):
+            return self.gene_dict[gene_id][1]
+        else:
+            raise ValueError(f"The id '{gene_id}' is not a valid ENSEMBL id for '{self.species}'")
 
 
 class OntologyChecker:

diff --git a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml
@@ -57,7 +57,6 @@ components:
         index:
             unique: true
             type: feature_id
-            # Using IDs add two columns: feature_id, and feature_reference
             add_labels:
                 -
                     type: feature_id
@@ -68,6 +67,9 @@ components:
                 -
                     type: feature_biotype
                     to_column: feature_biotype
+                -
+                    type: feature_length
+                    to_column: feature_length
         # All columns are required
         columns:
             feature_is_filtered:
@@ -79,7 +81,6 @@ components:
         index:
             unique: true
             type: feature_id
-            # Using IDs add two columns: feature_id, and feature_reference
             add_labels:
                 -
                     type: feature_id
@@ -90,6 +91,9 @@ components:
                 -
                     type: feature_biotype
                     to_column: feature_biotype
+                -
+                    type: feature_length
+                    to_column: feature_length
     obs:
         type: dataframe
         required: null # Means it's required

diff --git a/cellxgene_schema_cli/cellxgene_schema/write_labels.py b/cellxgene_schema_cli/cellxgene_schema/write_labels.py
@@ -210,6 +210,27 @@ def _get_mapping_dict_feature_biotype(self, ids: List[str]) -> Dict[str, str]:
 
         return mapping_dict
 
+    def _get_mapping_dict_feature_length(self, ids: List[str]) -> Dict[str, int]:
+        """
+        Creates a mapping dictionary of feature IDs and feature length, fetching from pre-calculated gene info CSVs
+        derived from GENCODE mappings for supported organisms. Set to 0 for non-gene features.
+
+        :param list[str] ids: feature IDs use for mapping
+
+        :return a mapping dictionary: {id: <int>, id: 0, ...}
+        :rtype dict
+        """
+        mapping_dict = {}
+
+        for i in ids:
+            if i.startswith("ENS"):
+                organism = ontology.get_organism_from_feature_id(i)
+                mapping_dict[i] = self.validator.gene_checkers[organism].get_length(i)
+            else:
+                mapping_dict[i] = 0
+
+        return mapping_dict
+
     def _get_labels(
         self,
         component: str,
@@ -263,6 +284,9 @@ def _get_labels(
         elif label_type == "feature_biotype":
             mapping_dict = self._get_mapping_dict_feature_biotype(ids=ids)
 
+        elif label_type == "feature_length":
+            mapping_dict = self._get_mapping_dict_feature_length(ids=ids)
+
         else:
             raise TypeError(f"'{label_type}' is not supported in 'add-labels' functionality")
 
@@ -287,9 +311,9 @@ def _add_column(self, component: str, column: str, column_definition: dict):
             new_column = self._get_labels(component, column, column_definition, label_def["type"])
             new_column_name = label_def["to_column"]
 
-            # The sintax below is a programtic way to access obs and var in adata:
+            # The syntax below is a programmatic way to access obs and var in adata:
             # adata.__dict__["_obs"] is adata.obs
-            # "raw.var" requires to levels of programtic access
+            # "raw.var" requires to levels of programmatic access
             if "." in component:
                 [first_elem, second_elem] = component.split(".")
                 self.adata.__dict__["_" + first_elem].__dict__["_" + second_elem][new_column_name] = new_column
@@ -298,8 +322,7 @@ def _add_column(self, component: str, column: str, column_definition: dict):
 
     def _add_labels(self):
         """
-        From a valid (per cellxgene's schema) adata, this function adds to self.adata ontology/gene labels
-        to adata.obs, adata.var, and adata.raw.var respectively
+        Add columns to dataset dataframes based on values in other columns, as defined in schema definition yaml.
         """
         for component in ["obs", "var", "raw.var"]:
             # If the component does not exist, skip (this is for raw.var)

diff --git a/cellxgene_schema_cli/tests/fixtures/examples_ontology_test.py b/cellxgene_schema_cli/tests/fixtures/examples_ontology_test.py
@@ -4,13 +4,13 @@
 invalid_species = ["Caenorhabditis elegans"]
 
 valid_genes = {
-    ontology.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000141510": "TP53"},
-    ontology.SupportedOrganisms.MUS_MUSCULUS: {"ENSMUSG00000059552": "Trp53"},
+    ontology.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000141510": ("TP53", 5676)},
+    ontology.SupportedOrganisms.MUS_MUSCULUS: {"ENSMUSG00000059552": ("Trp53", 4045)},
 }
 
 invalid_genes = {
-    ontology.SupportedOrganisms.HOMO_SAPIENS: ["ENSMUSG00000059552", "GENE"],
-    ontology.SupportedOrganisms.MUS_MUSCULUS: ["ENSG00000141510", "GENE"],
+    ontology.SupportedOrganisms.HOMO_SAPIENS: ["ENSMUSG00000059552", ("GENE", 1000)],
+    ontology.SupportedOrganisms.MUS_MUSCULUS: ["ENSG00000141510", ("GENE", 200)],
 }
 
 # For ontology checker

diff --git a/cellxgene_schema_cli/tests/fixtures/examples_validate.py b/cellxgene_schema_cli/tests/fixtures/examples_validate.py
@@ -137,17 +137,18 @@
 # these columns are defined in the schema
 var_expected = pd.DataFrame(
     [
-        ["spike-in", False, "ERCC-00002 (spike-in control)", "NCBITaxon:32630"],
-        ["gene", False, "MACF1", "NCBITaxon:9606"],
-        ["gene", False, "Trp53", "NCBITaxon:10090"],
-        ["gene", False, "S", "NCBITaxon:2697049"],
+        ["spike-in", False, "ERCC-00002 (spike-in control)", "NCBITaxon:32630", 0],
+        ["gene", False, "MACF1", "NCBITaxon:9606", 42738],
+        ["gene", False, "Trp53", "NCBITaxon:10090", 4045],
+        ["gene", False, "S", "NCBITaxon:2697049", 3822],
     ],
     index=["ERCC-00002", "ENSG00000127603", "ENSMUSG00000059552", "ENSSASG00005000004"],
     columns=[
         "feature_biotype",
         "feature_is_filtered",
         "feature_name",
         "feature_reference",
+        "feature_length",
     ],
 )
 

diff --git a/cellxgene_schema_cli/tests/test_ontology.py b/cellxgene_schema_cli/tests/test_ontology.py
@@ -32,10 +32,12 @@ def test_valid_genes(self):
         for species in self.valid_genes:
             geneChecker = ontology.GeneChecker(species)
             for gene_id in self.valid_genes[species]:
-                gene_label = self.valid_genes[species][gene_id]
+                gene_label = self.valid_genes[species][gene_id][0]
+                gene_length = self.valid_genes[species][gene_id][1]
 
                 self.assertTrue(geneChecker.is_valid_id(gene_id))
                 self.assertEqual(geneChecker.get_symbol(gene_id), gene_label)
+                self.assertEqual(geneChecker.get_length(gene_id), gene_length)
 
     def test_invalid_genes(self):
         for species in self.invalid_genes:
@@ -44,6 +46,8 @@ def test_invalid_genes(self):
                 self.assertFalse(geneChecker.is_valid_id(gene_id))
                 with self.assertRaises(ValueError):
                     geneChecker.get_symbol(gene_id)
+                with self.assertRaises(ValueError):
+                    geneChecker.get_length(gene_id)
 
 
 class TestOntologyChecker(unittest.TestCase):

diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py
@@ -1010,6 +1010,29 @@ def test_should_warn_for_low_gene_count(self):
             ["WARNING: Dataframe 'var' only has 4 rows. Features SHOULD NOT be filtered from expression matrix."],
         )
 
+    def test_add_label_fields_are_reserved(self):
+        """
+        Raise an error if column names flagged as 'add_label' -> 'to_column' in the schema definition are not available.
+        """
+        for df in ["var", "raw.var"]:
+            for i in self.validator.schema_def["components"][df]["index"]["add_labels"]:
+                column = i["to_column"]
+                with self.subTest(column=column, df=df):
+                    # Resetting validator
+                    self.validator.adata = examples.adata.copy()
+                    self.validator.errors = []
+
+                    component = getattr_anndata(self.validator.adata, df)
+                    component[column] = "dummy_value"
+                    self.validator.validate_adata()
+                    self.assertEqual(
+                        self.validator.errors,
+                        [
+                            f"ERROR: Add labels error: Column '{column}' is a reserved column name "
+                            f"of '{df}'. Remove it from h5ad and try again."
+                        ],
+                    )
+
 
 class TestUns(BaseValidationTest):
     """
@@ -1279,32 +1302,22 @@ def setUpClass(cls):
         cls.adata_with_labels = examples.adata_with_labels
 
         # Validate test data
-        validator = Validator()
-        validator.adata = examples.adata.copy()
-        validator.validate_adata()
+        cls.validator = Validator()
+        cls.validator.adata = examples.adata.copy()
+        cls.validator.validate_adata()
 
         # Add labels through validator
-        cls.label_writer = AnnDataLabelAppender(validator)
+        cls.label_writer = AnnDataLabelAppender(cls.validator)
         cls.label_writer._add_labels()
 
     def test_var_added_labels(self):
         """
         When a dataset is uploaded, cellxgene Data Portal MUST automatically add the matching human-readable
         name for the corresponding feature identifier and the inferred NCBITaxon term for the reference organism
-        to the var dataframe. Curators MUST NOT annotate the following columns:
-
-            - feature_name. this MUST be a human-readable ENSEMBL gene name or a ERCC Spike-In identifier
-            appended with " spike-in control", corresponding to the feature_id
-            - feature_reference. This MUST be the reference organism for a feature:
-                Homo sapiens	"NCBITaxon:9606"
-                Mus musculus	"NCBITaxon:10090"
-                SARS-CoV-2	"NCBITaxon:2697049"
-                ERCC Spike-Ins	"NCBITaxon:32630"
-            - feature_biotype. This MUST be "gene" if the feature_id is an ENSEMBL gene, or "spike-in" if the feature_id
-            is an ERCC Spike-In identifier.
+        to the var dataframe. Curators MUST NOT annotate the columns below:
         """
-
-        for column in ["feature_name", "feature_reference", "feature_biotype"]:
+        for i in self.validator.schema_def["components"]["var"]["index"]["add_labels"]:
+            column = i["to_column"]
             expected_column = self.adata_with_labels.var[column]
             obtained_column = self.label_writer.adata.var[column]
 

diff --git a/cellxgene_schema_cli/tests/test_validate.py b/cellxgene_schema_cli/tests/test_validate.py
@@ -113,7 +113,6 @@ def test_get_dictionary_mapping_feature_id(self):
 
         # Bad
         ids = ["NO_GENE"]
-        expected_dict = dict(zip(ids, labels))
         with self.assertRaises(KeyError):
             self.writer._get_mapping_dict_feature_id(ids)
 
@@ -136,7 +135,29 @@ def test_get_dictionary_mapping_feature_reference(self):
 
         # Bad
         ids = ["NO_GENE"]
-        expected_dict = dict(zip(ids, labels))
+        with self.assertRaises(KeyError):
+            self.writer._get_mapping_dict_feature_id(ids)
+
+    def test_get_dictionary_mapping_feature_length(self):
+        # Good
+        ids = [
+            "ERCC-00002",
+            "ENSG00000127603",
+            "ENSMUSG00000059552",
+            "ENSSASG00005000004",
+        ]
+        # values derived from csv
+        gene_lengths = [
+            0,  # non-gene feature, so set to 0 regardless of csv value
+            42738,
+            4045,
+            3822,
+        ]
+        expected_dict = dict(zip(ids, gene_lengths))
+        self.assertEqual(self.writer._get_mapping_dict_feature_length(ids), expected_dict)
+
+        # Bad
+        ids = ["NO_GENE"]
         with self.assertRaises(KeyError):
             self.writer._get_mapping_dict_feature_id(ids)