Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add tissue_type and new validation rules for tissue_ontology_term_id and cell_type_ontology_term_id #623

Merged
merged 8 commits into from
Sep 20, 2023
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 4.0.0-rc.0
current_version = 4.0.0-rc.1
commit = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(?:-(?P<prerel>rc)\.(?P<prerelversion>\d+))?
serialize =
Expand Down
2 changes: 1 addition & 1 deletion cellxgene_schema_cli/cellxgene_schema/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "4.0.0-rc.0"
__version__ = "4.0.0-rc.1"
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,10 @@ components:
curie_constraints:
ontologies:
- CL
forbidden:
- CL:0000255
- CL:0000257
- CL:0000548
add_labels:
-
type: curie
Expand Down Expand Up @@ -163,15 +167,35 @@ components:
to_column: sex
tissue_ontology_term_id:
type: curie
curie_constraints:
ontologies:
- UBERON
- CL
suffixes:
UBERON:
- " (organoid)"
CL:
- " (cell culture)"
dependencies:
-
# If tissue_type is tissue OR organoid
rule: "tissue_type == 'tissue' | tissue_type == 'organoid'"
error_message_suffix: >-
When 'tissue_type' is 'tissue' or 'organoid',
'tissue_ontology_term_id' MUST be a child term id of 'UBERON:0001062' (anatomical entity).
type: curie
curie_constraints:
ontologies:
- UBERON
ancestors:
UBERON:
- UBERON:0001062
-
# If tissue_type is cell culture
rule: "tissue_type == 'cell culture'"
error_message_suffix: >-
When 'tissue_type' is 'cell culture', 'tissue_ontology_term_id' MUST be a CL term
and it can not be 'CL:0000255' (eukaryotic cell), 'CL:0000257' (Eumycetozoan cell),
nor 'CL:0000548' (animal cell).
type: curie
curie_constraints:
ontologies:
- CL
forbidden:
- CL:0000255
- CL:0000257
- CL:0000548
add_labels:
-
type: curie
Expand Down Expand Up @@ -527,4 +551,10 @@ components:
when 'assay_ontology_term_id' is EFO:0009919
enum:
- "cell"
- "nucleus"
- "nucleus"
tissue_type:
type: categorical
enum:
- "cell culture"
- "organoid"
- "tissue"
54 changes: 11 additions & 43 deletions cellxgene_schema_cli/cellxgene_schema/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,35 +43,6 @@ def __init__(self, ignore_labels=False):
# Matrix (e.g., X, raw.X, ...) number non-zero cache
self.number_non_zero = dict()

@staticmethod
def _curie_remove_suffix(term_id: str, suffix_def: dict) -> Tuple[str, str]:
    """
    Strip an allowed suffix from a curie term id; if none applies, return the id unmodified.

    Suffixes are compared literally against the end of ``term_id`` (they are never interpreted as
    regular expressions). A suffix is only stripped when the remaining text is a valid term id of the
    ontology the suffix is listed under.

    :param str term_id: the curie term id, possibly carrying a suffix
    :param dict{str: list[str], ...} suffix_def: dictionary whose keys are ontology names and whose values
        are lists of allowed suffixes for that ontology

    :rtype Tuple[str, str]
    :return the term id with the suffix stripped, and the suffix; ``(term_id, "")`` when no suffix applies
    """

    for ontology_name, suffixes in suffix_def.items():
        for suffix in suffixes:
            # An empty suffix strips nothing; skipping it also keeps the slice below well-defined.
            if not suffix or not term_id.endswith(suffix):
                continue

            stripped_term_id = term_id[: len(term_id) - len(suffix)]
            # Only accept the split when what is left is a real term of this ontology.
            if ONTOLOGY_CHECKER.is_valid_term_id(ontology_name, stripped_term_id):
                return stripped_term_id, suffix

    return term_id, ""

def _validate_encoding_version(self):
import h5py

Expand Down Expand Up @@ -228,7 +199,7 @@ def _validate_curie(self, term_id: str, column_name: str, curie_constraints: dic

# If there are forbidden terms
if "forbidden" in curie_constraints and term_id in curie_constraints["forbidden"]:
self.errors.append(f"'{term_id}' in '{column_name}' is not allowed'.")
self.errors.append(f"'{term_id}' in '{column_name}' is not allowed.")
return

# If NA is found in allowed ontologies, it means only exceptions should be found. If no exceptions were found
Expand All @@ -237,10 +208,6 @@ def _validate_curie(self, term_id: str, column_name: str, curie_constraints: dic
self.errors.append(f"'{term_id}' in '{column_name}' is not a valid value of '{column_name}'.")
return

# Check if there are any allowed suffixes and remove them if needed
if "suffixes" in curie_constraints:
term_id, suffix = self._curie_remove_suffix(term_id, curie_constraints["suffixes"])

# Check that term id belongs to allowed ontologies
self._validate_curie_ontology(term_id, column_name, curie_constraints["ontologies"])

Expand Down Expand Up @@ -436,15 +403,9 @@ def _validate_column(self, column: pd.Series, column_name: str, df_name: str, co
self.errors.append(f"Column '{column_name}' in dataframe '{df_name}' must not contain NaN values.")
return

if "curie_constraints" not in column_def:
raise ValueError(f"Corrupt schema definition, no 'curie_constraints' were found for '{column_name}'")
if "ontologies" not in column_def["curie_constraints"]:
raise ValueError(
f"allowed 'ontologies' must be specified under 'curie constraints' for '{column_name}'"
)

for term_id in column.drop_duplicates():
self._validate_curie(term_id, column_name, column_def["curie_constraints"])
if "curie_constraints" in column_def:
for term_id in column.drop_duplicates():
self._validate_curie(term_id, column_name, column_def["curie_constraints"])

# Add error suffix to errors found here
if "error_message_suffix" in column_def:
Expand Down Expand Up @@ -800,6 +761,13 @@ def _validate_seurat_convertibility(self):
)

self.is_seurat_convertible = False
if self.adata.raw and self.adata.raw.X.shape[1] != self.adata.raw.var.shape[0]:
nayib-jose-gloria marked this conversation as resolved.
Show resolved Hide resolved
self.warnings.append(
f"This dataset cannot be converted to the .rds (Seurat v4) format. "
f"There is a mismatch in the number of variables in the raw matrix and the raw var key-indexed "
f"variables."
)
self.is_seurat_convertible = False

def _validate_embedding_dict(self):
"""
Expand Down
24 changes: 7 additions & 17 deletions cellxgene_schema_cli/cellxgene_schema/write_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,31 +122,21 @@ def _get_mapping_dict_curie(self, ids: List[str], curie_constraints: dict) -> Di
mapping_dict = {}
allowed_ontologies = curie_constraints["ontologies"]

# Remove any suffixes if any
# original_ids will have untouched ids which will be used for mapping
# id_suffixes will save suffixes if any, these will be used to append to labels
# ids will have the ids without suffixes
original_ids = ids.copy()
id_suffixes = [""] * len(ids)

if "suffixes" in curie_constraints:
for i in range(len(ids)):
ids[i], id_suffixes[i] = Validator._curie_remove_suffix(ids[i], curie_constraints["suffixes"])

for original_id, id, id_suffix in zip(original_ids, ids, id_suffixes):
# Map term_ids to their human-readable ontology labels
for term_id in ids:
# If there are exceptions the label should be the same as the id
if "exceptions" in curie_constraints and original_id in curie_constraints["exceptions"]:
mapping_dict[original_id] = original_id
if "exceptions" in curie_constraints and term_id in curie_constraints["exceptions"]:
mapping_dict[term_id] = term_id
continue

for ontology_name in allowed_ontologies:
if ontology_name == "NA":
continue
if ONTOLOGY_CHECKER.is_valid_term_id(ontology_name, id):
mapping_dict[original_id] = ONTOLOGY_CHECKER.get_term_label(ontology_name, id) + id_suffix
if ONTOLOGY_CHECKER.is_valid_term_id(ontology_name, term_id):
mapping_dict[term_id] = ONTOLOGY_CHECKER.get_term_label(ontology_name, term_id)

# Check that all ids got a mapping. All ids should be found if adata was validated
for id in original_ids:
for id in ids:
if id not in mapping_dict:
raise ValueError(f"Add labels error: Unable to get label for '{id}'")

Expand Down
2 changes: 1 addition & 1 deletion cellxgene_schema_cli/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="cellxgene-schema",
version="4.0.0-rc.0",
version="4.0.0-rc.1",
url="https://github.com/chanzuckerberg/single-cell-curation",
license="MIT",
author="Chan Zuckerberg Initiative",
Expand Down
8 changes: 6 additions & 2 deletions cellxgene_schema_cli/tests/fixtures/examples_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
"NCBITaxon:9606",
"PATO:0000383",
"UBERON:0002048",
"tissue",
True,
"HANCESTRO:0575",
"HsapDv:0000003",
Expand All @@ -53,7 +54,8 @@
"PATO:0000461",
"NCBITaxon:10090",
"unknown",
"CL:0000192 (cell culture)",
"CL:0000192",
"cell culture",
False,
"na",
"MmusDv:0000003",
Expand All @@ -69,6 +71,7 @@
"organism_ontology_term_id",
"sex_ontology_term_id",
"tissue_ontology_term_id",
"tissue_type",
"is_primary_data",
"self_reported_ethnicity_ontology_term_id",
"development_stage_ontology_term_id",
Expand All @@ -79,6 +82,7 @@

good_obs.loc[:, ["donor_id"]] = good_obs.astype("category")
good_obs.loc[:, ["suspension_type"]] = good_obs.astype("category")
good_obs.loc[:, ["tissue_type"]] = good_obs.astype("category")

# Expected obs: this is what the obs above should look like after the validator adds the necessary columns.
# These columns are defined in the schema.
Expand All @@ -100,7 +104,7 @@
"normal",
"Mus musculus",
"unknown",
"smooth muscle cell (cell culture)",
"smooth muscle cell",
"na",
"Theiler stage 01",
],
Expand Down
Binary file not shown.
Binary file modified cellxgene_schema_cli/tests/fixtures/h5ads/example_valid.h5ad
Binary file not shown.
Loading