Skip to content

Commit

Permalink
feat: add feature length field, annotated by add-labels function (#624)
Browse files Browse the repository at this point in the history
* feat: add feature length field, annotated by add-labels function
  • Loading branch information
nayib-jose-gloria authored Sep 19, 2023
1 parent fd513fc commit 1f482ae
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 40 deletions.
28 changes: 22 additions & 6 deletions cellxgene_schema_cli/cellxgene_schema/ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,9 @@ def __init__(self, species: SupportedOrganisms):
gene = gene.rstrip().split(",")
gene_id = gene[0]
gene_label = gene[1]
gene_length = int(gene[3])

self.gene_dict[gene_id] = gene_label
self.gene_dict[gene_id] = (gene_label, gene_length)

# Keeps track of duplicated gene labels
if gene_label in gene_labels:
Expand All @@ -76,9 +77,9 @@ def __init__(self, species: SupportedOrganisms):
gene_labels.add(gene_label)

# Makes gene labels unique
for gene_id, gene_label in self.gene_dict.items():
for gene_id, (gene_label, gene_length) in self.gene_dict.items():
if gene_label in duplicated_gene_labels:
self.gene_dict[gene_id] = gene_label + "_" + gene_id
self.gene_dict[gene_id] = (gene_label + "_" + gene_id, gene_length)

def is_valid_id(self, gene_id: str) -> bool:
"""
Expand All @@ -92,7 +93,7 @@ def is_valid_id(self, gene_id: str) -> bool:

return gene_id in self.gene_dict

def get_symbol(self, gene_id) -> str:
def get_symbol(self, gene_id: str) -> str:
"""
Gets symbol associated to the ENSEBML id
Expand All @@ -102,10 +103,25 @@ def get_symbol(self, gene_id) -> str:
:return A gene symbol
"""

if not self.is_valid_id(gene_id):
if self.is_valid_id(gene_id):
return self.gene_dict[gene_id][0]
else:
raise ValueError(f"The id '{gene_id}' is not a valid ENSEMBL id for '{self.species}'")

return self.gene_dict[gene_id]
def get_length(self, gene_id: str) -> int:
"""
Gets feature length associated to the ENSEBML id
:param str gene_id: ENSEMBL gene id
:rtype int
:return A gene length
"""

if self.is_valid_id(gene_id):
return self.gene_dict[gene_id][1]
else:
raise ValueError(f"The id '{gene_id}' is not a valid ENSEMBL id for '{self.species}'")


class OntologyChecker:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ components:
index:
unique: true
type: feature_id
# Using IDs add two columns: feature_id, and feature_reference
add_labels:
-
type: feature_id
Expand All @@ -68,6 +67,9 @@ components:
-
type: feature_biotype
to_column: feature_biotype
-
type: feature_length
to_column: feature_length
# All columns are required
columns:
feature_is_filtered:
Expand All @@ -79,7 +81,6 @@ components:
index:
unique: true
type: feature_id
# Using IDs add two columns: feature_id, and feature_reference
add_labels:
-
type: feature_id
Expand All @@ -90,6 +91,9 @@ components:
-
type: feature_biotype
to_column: feature_biotype
-
type: feature_length
to_column: feature_length
obs:
type: dataframe
required: null # Means it's required
Expand Down
31 changes: 27 additions & 4 deletions cellxgene_schema_cli/cellxgene_schema/write_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,27 @@ def _get_mapping_dict_feature_biotype(self, ids: List[str]) -> Dict[str, str]:

return mapping_dict

def _get_mapping_dict_feature_length(self, ids: List[str]) -> Dict[str, int]:
"""
Creates a mapping dictionary of feature IDs and feature length, fetching from pre-calculated gene info CSVs
derived from GENCODE mappings for supported organisms. Set to 0 for non-gene features.
:param list[str] ids: feature IDs use for mapping
:return a mapping dictionary: {id: <int>, id: 0, ...}
:rtype dict
"""
mapping_dict = {}

for i in ids:
if i.startswith("ENS"):
organism = ontology.get_organism_from_feature_id(i)
mapping_dict[i] = self.validator.gene_checkers[organism].get_length(i)
else:
mapping_dict[i] = 0

return mapping_dict

def _get_labels(
self,
component: str,
Expand Down Expand Up @@ -263,6 +284,9 @@ def _get_labels(
elif label_type == "feature_biotype":
mapping_dict = self._get_mapping_dict_feature_biotype(ids=ids)

elif label_type == "feature_length":
mapping_dict = self._get_mapping_dict_feature_length(ids=ids)

else:
raise TypeError(f"'{label_type}' is not supported in 'add-labels' functionality")

Expand All @@ -287,9 +311,9 @@ def _add_column(self, component: str, column: str, column_definition: dict):
new_column = self._get_labels(component, column, column_definition, label_def["type"])
new_column_name = label_def["to_column"]

# The sintax below is a programtic way to access obs and var in adata:
# The syntax below is a programmatic way to access obs and var in adata:
# adata.__dict__["_obs"] is adata.obs
# "raw.var" requires to levels of programtic access
# "raw.var" requires to levels of programmatic access
if "." in component:
[first_elem, second_elem] = component.split(".")
self.adata.__dict__["_" + first_elem].__dict__["_" + second_elem][new_column_name] = new_column
Expand All @@ -298,8 +322,7 @@ def _add_column(self, component: str, column: str, column_definition: dict):

def _add_labels(self):
"""
From a valid (per cellxgene's schema) adata, this function adds to self.adata ontology/gene labels
to adata.obs, adata.var, and adata.raw.var respectively
Add columns to dataset dataframes based on values in other columns, as defined in schema definition yaml.
"""
for component in ["obs", "var", "raw.var"]:
# If the component does not exist, skip (this is for raw.var)
Expand Down
8 changes: 4 additions & 4 deletions cellxgene_schema_cli/tests/fixtures/examples_ontology_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
invalid_species = ["Caenorhabditis elegans"]

valid_genes = {
ontology.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000141510": "TP53"},
ontology.SupportedOrganisms.MUS_MUSCULUS: {"ENSMUSG00000059552": "Trp53"},
ontology.SupportedOrganisms.HOMO_SAPIENS: {"ENSG00000141510": ("TP53", 5676)},
ontology.SupportedOrganisms.MUS_MUSCULUS: {"ENSMUSG00000059552": ("Trp53", 4045)},
}

invalid_genes = {
ontology.SupportedOrganisms.HOMO_SAPIENS: ["ENSMUSG00000059552", "GENE"],
ontology.SupportedOrganisms.MUS_MUSCULUS: ["ENSG00000141510", "GENE"],
ontology.SupportedOrganisms.HOMO_SAPIENS: ["ENSMUSG00000059552", ("GENE", 1000)],
ontology.SupportedOrganisms.MUS_MUSCULUS: ["ENSG00000141510", ("GENE", 200)],
}

# For ontology checker
Expand Down
9 changes: 5 additions & 4 deletions cellxgene_schema_cli/tests/fixtures/examples_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,17 +137,18 @@
# these columns are defined in the schema
var_expected = pd.DataFrame(
[
["spike-in", False, "ERCC-00002 (spike-in control)", "NCBITaxon:32630"],
["gene", False, "MACF1", "NCBITaxon:9606"],
["gene", False, "Trp53", "NCBITaxon:10090"],
["gene", False, "S", "NCBITaxon:2697049"],
["spike-in", False, "ERCC-00002 (spike-in control)", "NCBITaxon:32630", 0],
["gene", False, "MACF1", "NCBITaxon:9606", 42738],
["gene", False, "Trp53", "NCBITaxon:10090", 4045],
["gene", False, "S", "NCBITaxon:2697049", 3822],
],
index=["ERCC-00002", "ENSG00000127603", "ENSMUSG00000059552", "ENSSASG00005000004"],
columns=[
"feature_biotype",
"feature_is_filtered",
"feature_name",
"feature_reference",
"feature_length",
],
)

Expand Down
6 changes: 5 additions & 1 deletion cellxgene_schema_cli/tests/test_ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,12 @@ def test_valid_genes(self):
for species in self.valid_genes:
geneChecker = ontology.GeneChecker(species)
for gene_id in self.valid_genes[species]:
gene_label = self.valid_genes[species][gene_id]
gene_label = self.valid_genes[species][gene_id][0]
gene_length = self.valid_genes[species][gene_id][1]

self.assertTrue(geneChecker.is_valid_id(gene_id))
self.assertEqual(geneChecker.get_symbol(gene_id), gene_label)
self.assertEqual(geneChecker.get_length(gene_id), gene_length)

def test_invalid_genes(self):
for species in self.invalid_genes:
Expand All @@ -44,6 +46,8 @@ def test_invalid_genes(self):
self.assertFalse(geneChecker.is_valid_id(gene_id))
with self.assertRaises(ValueError):
geneChecker.get_symbol(gene_id)
with self.assertRaises(ValueError):
geneChecker.get_length(gene_id)


class TestOntologyChecker(unittest.TestCase):
Expand Down
47 changes: 30 additions & 17 deletions cellxgene_schema_cli/tests/test_schema_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -1010,6 +1010,29 @@ def test_should_warn_for_low_gene_count(self):
["WARNING: Dataframe 'var' only has 4 rows. Features SHOULD NOT be filtered from expression matrix."],
)

def test_add_label_fields_are_reserved(self):
"""
Raise an error if column names flagged as 'add_label' -> 'to_column' in the schema definition are not available.
"""
for df in ["var", "raw.var"]:
for i in self.validator.schema_def["components"][df]["index"]["add_labels"]:
column = i["to_column"]
with self.subTest(column=column, df=df):
# Resetting validator
self.validator.adata = examples.adata.copy()
self.validator.errors = []

component = getattr_anndata(self.validator.adata, df)
component[column] = "dummy_value"
self.validator.validate_adata()
self.assertEqual(
self.validator.errors,
[
f"ERROR: Add labels error: Column '{column}' is a reserved column name "
f"of '{df}'. Remove it from h5ad and try again."
],
)


class TestUns(BaseValidationTest):
"""
Expand Down Expand Up @@ -1279,32 +1302,22 @@ def setUpClass(cls):
cls.adata_with_labels = examples.adata_with_labels

# Validate test data
validator = Validator()
validator.adata = examples.adata.copy()
validator.validate_adata()
cls.validator = Validator()
cls.validator.adata = examples.adata.copy()
cls.validator.validate_adata()

# Add labels through validator
cls.label_writer = AnnDataLabelAppender(validator)
cls.label_writer = AnnDataLabelAppender(cls.validator)
cls.label_writer._add_labels()

def test_var_added_labels(self):
"""
When a dataset is uploaded, cellxgene Data Portal MUST automatically add the matching human-readable
name for the corresponding feature identifier and the inferred NCBITaxon term for the reference organism
to the var dataframe. Curators MUST NOT annotate the following columns:
- feature_name. this MUST be a human-readable ENSEMBL gene name or a ERCC Spike-In identifier
appended with " spike-in control", corresponding to the feature_id
- feature_reference. This MUST be the reference organism for a feature:
Homo sapiens "NCBITaxon:9606"
Mus musculus "NCBITaxon:10090"
SARS-CoV-2 "NCBITaxon:2697049"
ERCC Spike-Ins "NCBITaxon:32630"
- feature_biotype. This MUST be "gene" if the feature_id is an ENSEMBL gene, or "spike-in" if the feature_id
is an ERCC Spike-In identifier.
to the var dataframe. Curators MUST NOT annotate the columns below:
"""

for column in ["feature_name", "feature_reference", "feature_biotype"]:
for i in self.validator.schema_def["components"]["var"]["index"]["add_labels"]:
column = i["to_column"]
expected_column = self.adata_with_labels.var[column]
obtained_column = self.label_writer.adata.var[column]

Expand Down
25 changes: 23 additions & 2 deletions cellxgene_schema_cli/tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ def test_get_dictionary_mapping_feature_id(self):

# Bad
ids = ["NO_GENE"]
expected_dict = dict(zip(ids, labels))
with self.assertRaises(KeyError):
self.writer._get_mapping_dict_feature_id(ids)

Expand All @@ -136,7 +135,29 @@ def test_get_dictionary_mapping_feature_reference(self):

# Bad
ids = ["NO_GENE"]
expected_dict = dict(zip(ids, labels))
with self.assertRaises(KeyError):
self.writer._get_mapping_dict_feature_id(ids)

def test_get_dictionary_mapping_feature_length(self):
# Good
ids = [
"ERCC-00002",
"ENSG00000127603",
"ENSMUSG00000059552",
"ENSSASG00005000004",
]
# values derived from csv
gene_lengths = [
0, # non-gene feature, so set to 0 regardless of csv value
42738,
4045,
3822,
]
expected_dict = dict(zip(ids, gene_lengths))
self.assertEqual(self.writer._get_mapping_dict_feature_length(ids), expected_dict)

# Bad
ids = ["NO_GENE"]
with self.assertRaises(KeyError):
self.writer._get_mapping_dict_feature_id(ids)

Expand Down

0 comments on commit 1f482ae

Please sign in to comment.