Skip to content

Commit

Permalink
feat: add tissue_type and new validation rules for tissue_ontology_te…
Browse files Browse the repository at this point in the history
…rm_id and cell_type_ontology_term_id (#623)

* feat: require tissue_type field and add new validation rules for tissue_ontology_term_id and cell_type_ontology_term_id

* add tests + simplify suffix write-labels + validation now that it's not allowed
  • Loading branch information
nayib-jose-gloria authored Sep 20, 2023
1 parent aa7679f commit 6cef195
Show file tree
Hide file tree
Showing 11 changed files with 207 additions and 112 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 4.0.0-rc.0
current_version = 4.0.0-rc.1
commit = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(?:-(?P<prerel>rc)\.(?P<prerelversion>\d+))?
serialize =
Expand Down
2 changes: 1 addition & 1 deletion cellxgene_schema_cli/cellxgene_schema/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "4.0.0-rc.0"
__version__ = "4.0.0-rc.1"
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,10 @@ components:
curie_constraints:
ontologies:
- CL
forbidden:
- CL:0000255
- CL:0000257
- CL:0000548
add_labels:
-
type: curie
Expand Down Expand Up @@ -168,15 +172,35 @@ components:
to_column: sex
tissue_ontology_term_id:
type: curie
curie_constraints:
ontologies:
- UBERON
- CL
suffixes:
UBERON:
- " (organoid)"
CL:
- " (cell culture)"
dependencies:
-
# If tissue_type is tissue OR organoid
rule: "tissue_type == 'tissue' | tissue_type == 'organoid'"
error_message_suffix: >-
When 'tissue_type' is 'tissue' or 'organoid',
'tissue_ontology_term_id' MUST be a child term id of 'UBERON:0001062' (anatomical entity).
type: curie
curie_constraints:
ontologies:
- UBERON
ancestors:
UBERON:
- UBERON:0001062
-
# If tissue_type is cell culture
rule: "tissue_type == 'cell culture'"
error_message_suffix: >-
When 'tissue_type' is 'cell culture', 'tissue_ontology_term_id' MUST be a CL term
and it can not be 'CL:0000255' (eukaryotic cell), 'CL:0000257' (Eumycetozoan cell),
nor 'CL:0000548' (animal cell).
type: curie
curie_constraints:
ontologies:
- CL
forbidden:
- CL:0000255
- CL:0000257
- CL:0000548
add_labels:
-
type: curie
Expand Down Expand Up @@ -532,4 +556,10 @@ components:
when 'assay_ontology_term_id' is EFO:0009919
enum:
- "cell"
- "nucleus"
- "nucleus"
tissue_type:
type: categorical
enum:
- "cell culture"
- "organoid"
- "tissue"
50 changes: 5 additions & 45 deletions cellxgene_schema_cli/cellxgene_schema/validate.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import logging
import math
import os
import re
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Union
from typing import Dict, List, Optional, Union

import anndata
import numpy as np
Expand Down Expand Up @@ -43,35 +42,6 @@ def __init__(self, ignore_labels=False):
# Matrix (e.g., X, raw.X, ...) number non-zero cache
self.number_non_zero = dict()

@staticmethod
def _curie_remove_suffix(term_id: str, suffix_def: dict) -> Tuple[str, str]:
    """
    Remove an allowed suffix from a curie term id; if none is present, return it unmodified.

    A suffix is only stripped when the text left after removing it is reported as a valid term id
    by ONTOLOGY_CHECKER.is_valid_term_id for the ontology that suffix is declared under.

    :param str term_id: the curie term id to validate
    :param dict{str: list[str], ...} suffix_def: dictionary whose keys are ontology names and values
    are lists of allowed suffixes

    :rtype Tuple[str, str]
    :return the term_id with the suffix stripped, and the suffix ("" when nothing was stripped)
    """

    for ontology_name, suffixes in suffix_def.items():
        for suffix in suffixes:
            # Compare literally instead of building a regex: suffixes are plain text, so any
            # character in them (parentheses, dots, ...) must be matched as-is.
            if not term_id.endswith(suffix):
                continue

            # Slice by explicit length so an empty suffix leaves the term id intact
            # (term_id[:-0] would wrongly yield an empty string).
            stripped_term_id = term_id[: len(term_id) - len(suffix)]
            if ONTOLOGY_CHECKER.is_valid_term_id(ontology_name, stripped_term_id):
                return stripped_term_id, suffix

    return term_id, ""

def _validate_encoding_version(self):
import h5py

Expand Down Expand Up @@ -228,7 +198,7 @@ def _validate_curie(self, term_id: str, column_name: str, curie_constraints: dic

# If there are forbidden terms
if "forbidden" in curie_constraints and term_id in curie_constraints["forbidden"]:
self.errors.append(f"'{term_id}' in '{column_name}' is not allowed'.")
self.errors.append(f"'{term_id}' in '{column_name}' is not allowed.")
return

# If NA is found in allowed ontologies, it means only exceptions should be found. If no exceptions were found
Expand All @@ -237,10 +207,6 @@ def _validate_curie(self, term_id: str, column_name: str, curie_constraints: dic
self.errors.append(f"'{term_id}' in '{column_name}' is not a valid value of '{column_name}'.")
return

# Check if there are any allowed suffixes and remove them if needed
if "suffixes" in curie_constraints:
term_id, suffix = self._curie_remove_suffix(term_id, curie_constraints["suffixes"])

# Check that term id belongs to allowed ontologies
self._validate_curie_ontology(term_id, column_name, curie_constraints["ontologies"])

Expand Down Expand Up @@ -436,15 +402,9 @@ def _validate_column(self, column: pd.Series, column_name: str, df_name: str, co
self.errors.append(f"Column '{column_name}' in dataframe '{df_name}' must not contain NaN values.")
return

if "curie_constraints" not in column_def:
raise ValueError(f"Corrupt schema definition, no 'curie_constraints' were found for '{column_name}'")
if "ontologies" not in column_def["curie_constraints"]:
raise ValueError(
f"allowed 'ontologies' must be specified under 'curie constraints' for '{column_name}'"
)

for term_id in column.drop_duplicates():
self._validate_curie(term_id, column_name, column_def["curie_constraints"])
if "curie_constraints" in column_def:
for term_id in column.drop_duplicates():
self._validate_curie(term_id, column_name, column_def["curie_constraints"])

# Add error suffix to errors found here
if "error_message_suffix" in column_def:
Expand Down
24 changes: 7 additions & 17 deletions cellxgene_schema_cli/cellxgene_schema/write_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,31 +123,21 @@ def _get_mapping_dict_curie(self, ids: List[str], curie_constraints: dict) -> Di
mapping_dict = {}
allowed_ontologies = curie_constraints["ontologies"]

# Strip allowed suffixes from the ids, if present
# original_ids will have untouched ids which will be used for mapping
# id_suffixes will save suffixes if any, these will be used to append to labels
# ids will have the ids without suffixes
original_ids = ids.copy()
id_suffixes = [""] * len(ids)

if "suffixes" in curie_constraints:
for i in range(len(ids)):
ids[i], id_suffixes[i] = Validator._curie_remove_suffix(ids[i], curie_constraints["suffixes"])

for original_id, id, id_suffix in zip(original_ids, ids, id_suffixes):
# Map term_ids to their human-readable ontology labels
for term_id in ids:
# If there are exceptions the label should be the same as the id
if "exceptions" in curie_constraints and original_id in curie_constraints["exceptions"]:
mapping_dict[original_id] = original_id
if "exceptions" in curie_constraints and term_id in curie_constraints["exceptions"]:
mapping_dict[term_id] = term_id
continue

for ontology_name in allowed_ontologies:
if ontology_name == "NA":
continue
if ONTOLOGY_CHECKER.is_valid_term_id(ontology_name, id):
mapping_dict[original_id] = ONTOLOGY_CHECKER.get_term_label(ontology_name, id) + id_suffix
if ONTOLOGY_CHECKER.is_valid_term_id(ontology_name, term_id):
mapping_dict[term_id] = ONTOLOGY_CHECKER.get_term_label(ontology_name, term_id)

# Check that all ids got a mapping. All ids should be found if adata was validated
for id in original_ids:
for id in ids:
if id not in mapping_dict:
raise ValueError(f"Add labels error: Unable to get label for '{id}'")

Expand Down
2 changes: 1 addition & 1 deletion cellxgene_schema_cli/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="cellxgene-schema",
version="4.0.0-rc.0",
version="4.0.0-rc.1",
url="https://github.com/chanzuckerberg/single-cell-curation",
license="MIT",
author="Chan Zuckerberg Initiative",
Expand Down
8 changes: 6 additions & 2 deletions cellxgene_schema_cli/tests/fixtures/examples_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
"NCBITaxon:9606",
"PATO:0000383",
"UBERON:0002048",
"tissue",
True,
"HANCESTRO:0575",
"HsapDv:0000003",
Expand All @@ -53,7 +54,8 @@
"PATO:0000461",
"NCBITaxon:10090",
"unknown",
"CL:0000192 (cell culture)",
"CL:0000192",
"cell culture",
False,
"na",
"MmusDv:0000003",
Expand All @@ -69,6 +71,7 @@
"organism_ontology_term_id",
"sex_ontology_term_id",
"tissue_ontology_term_id",
"tissue_type",
"is_primary_data",
"self_reported_ethnicity_ontology_term_id",
"development_stage_ontology_term_id",
Expand All @@ -79,6 +82,7 @@

good_obs.loc[:, ["donor_id"]] = good_obs.astype("category")
good_obs.loc[:, ["suspension_type"]] = good_obs.astype("category")
good_obs.loc[:, ["tissue_type"]] = good_obs.astype("category")

# Expected obs, this is what the obs above should look like after adding the necessary columns with the validator,
# these columns are defined in the schema
Expand All @@ -100,7 +104,7 @@
"normal",
"Mus musculus",
"unknown",
"smooth muscle cell (cell culture)",
"smooth muscle cell",
"na",
"Theiler stage 01",
],
Expand Down
Binary file not shown.
Binary file modified cellxgene_schema_cli/tests/fixtures/h5ads/example_valid.h5ad
Binary file not shown.
Loading

0 comments on commit 6cef195

Please sign in to comment.