chanzuckerberg · nayib-jose-gloria · Sep 20, 2023 · Sep 15, 2023 · Sep 15, 2023 · Sep 15, 2023
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 4.0.0-rc.0
+current_version = 4.0.0-rc.1
 commit = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(?:-(?P<prerel>rc)\.(?P<prerelversion>\d+))?
 serialize = 

diff --git a/cellxgene_schema_cli/cellxgene_schema/__init__.py b/cellxgene_schema_cli/cellxgene_schema/__init__.py
@@ -1 +1 @@
-__version__ = "4.0.0-rc.0"
+__version__ = "4.0.0-rc.1"
diff --git a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml
@@ -103,6 +103,10 @@ components:
                 curie_constraints:
                     ontologies:
                         - CL
+                    forbidden:
+                        - CL:0000255
+                        - CL:0000257
+                        - CL:0000548
                 add_labels:
                     -
                         type: curie
@@ -163,15 +167,35 @@ components:
                         to_column: sex
             tissue_ontology_term_id:
                 type: curie
-                curie_constraints:
-                    ontologies:
-                        - UBERON
-                        - CL
-                    suffixes:
-                        UBERON:
-                            - " (organoid)"
-                        CL:
-                            - " (cell culture)"
+                dependencies:
+                    -
+                        # If tissue_type is tissue OR organoid
+                        rule: "tissue_type == 'tissue' | tissue_type == 'organoid'"
+                        error_message_suffix: >-
+                            When 'tissue_type' is 'tissue' or 'organoid',
+                            'tissue_ontology_term_id' MUST be a child term id of 'UBERON:0001062' (anatomical entity).
+                        type: curie
+                        curie_constraints:
+                            ontologies:
+                                - UBERON
+                            ancestors:
+                                UBERON:
+                                    - UBERON:0001062
+                    -
+                        # If tissue_type is cell culture
+                        rule: "tissue_type == 'cell culture'"
+                        error_message_suffix: >-
+                            When 'tissue_type' is 'cell culture', 'tissue_ontology_term_id' MUST be a CL term
+                            and it can not be 'CL:0000255' (eukaryotic cell), 'CL:0000257' (Eumycetozoan cell),
+                            nor 'CL:0000548' (animal cell).
+                        type: curie
+                        curie_constraints:
+                            ontologies:
+                                - CL
+                            forbidden:
+                                - CL:0000255
+                                - CL:0000257
+                                - CL:0000548
                 add_labels:
                     -
                         type: curie
@@ -527,4 +551,10 @@ components:
                             when 'assay_ontology_term_id' is EFO:0009919
                         enum:
                             - "cell"
-                            - "nucleus"
+                            - "nucleus"
+            tissue_type:
+                type: categorical
+                enum:
+                    - "cell culture"
+                    - "organoid"
+                    - "tissue"
diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py
@@ -43,35 +43,6 @@ def __init__(self, ignore_labels=False):
         # Matrix (e.g., X, raw.X, ...) number non-zero cache
         self.number_non_zero = dict()
 
-    @staticmethod
-    def _curie_remove_suffix(term_id: str, suffix_def: dict) -> Tuple[str, str]:
-        """
-        Remove suffix from a curie term id, if none present return it unmodified
-
-        :param str term_id: the curie term id to validate
-        :param dict{str: list[str], ...} suffix_def: dictionary whose keys are ontology term ids and values
-        are list of allowed suffixes
-
-        :rtype Tuple[str, str]
-        :return the term_id with suffixed stripped, and the suffix
-        """
-
-        id_suffix = ""
-
-        for ontology_name, suffixes in suffix_def.items():
-            for suffix in suffixes:
-                suffix = suffix.replace("(", r"\(")
-                suffix = suffix.replace(")", r"\)")
-                search_results = re.search(r"%s$" % suffix, term_id)
-                if search_results:
-                    stripped_term_id = re.sub(r"%s$" % suffix, "", term_id)
-                    if ONTOLOGY_CHECKER.is_valid_term_id(ontology_name, stripped_term_id):
-                        id_suffix = search_results.group(0)
-
-                        return stripped_term_id, id_suffix
-
-        return term_id, id_suffix
-
     def _validate_encoding_version(self):
         import h5py
 
@@ -228,7 +199,7 @@ def _validate_curie(self, term_id: str, column_name: str, curie_constraints: dic
 
         # If there are forbidden terms
         if "forbidden" in curie_constraints and term_id in curie_constraints["forbidden"]:
-            self.errors.append(f"'{term_id}' in '{column_name}' is not allowed'.")
+            self.errors.append(f"'{term_id}' in '{column_name}' is not allowed.")
             return
 
         # If NA is found in allowed ontologies, it means only exceptions should be found. If no exceptions were found
@@ -237,10 +208,6 @@ def _validate_curie(self, term_id: str, column_name: str, curie_constraints: dic
             self.errors.append(f"'{term_id}' in '{column_name}' is not a valid value of '{column_name}'.")
             return
 
-        # Check if there are any allowed suffixes and remove them if needed
-        if "suffixes" in curie_constraints:
-            term_id, suffix = self._curie_remove_suffix(term_id, curie_constraints["suffixes"])
-
         # Check that term id belongs to allowed ontologies
         self._validate_curie_ontology(term_id, column_name, curie_constraints["ontologies"])
 
@@ -436,15 +403,9 @@ def _validate_column(self, column: pd.Series, column_name: str, df_name: str, co
                 self.errors.append(f"Column '{column_name}' in dataframe '{df_name}' must not contain NaN values.")
                 return
 
-            if "curie_constraints" not in column_def:
-                raise ValueError(f"Corrupt schema definition, no 'curie_constraints' were found for '{column_name}'")
-            if "ontologies" not in column_def["curie_constraints"]:
-                raise ValueError(
-                    f"allowed 'ontologies' must be specified under 'curie constraints' for '{column_name}'"
-                )
-
-            for term_id in column.drop_duplicates():
-                self._validate_curie(term_id, column_name, column_def["curie_constraints"])
+            if "curie_constraints" in column_def:
+                for term_id in column.drop_duplicates():
+                    self._validate_curie(term_id, column_name, column_def["curie_constraints"])
 
         # Add error suffix to errors found here
         if "error_message_suffix" in column_def:
@@ -800,6 +761,13 @@ def _validate_seurat_convertibility(self):
                     )
 
                 self.is_seurat_convertible = False
+        if self.adata.raw and self.adata.raw.X.shape[1] != self.adata.raw.var.shape[0]:
+            self.warnings.append(
+                f"This dataset cannot be converted to the .rds (Seurat v4) format. "
+                f"There is a mismatch in the number of variables in the raw matrix and the raw var key-indexed "
+                f"variables."
+            )
+            self.is_seurat_convertible = False
 
     def _validate_embedding_dict(self):
         """

diff --git a/cellxgene_schema_cli/cellxgene_schema/write_labels.py b/cellxgene_schema_cli/cellxgene_schema/write_labels.py
@@ -122,31 +122,21 @@ def _get_mapping_dict_curie(self, ids: List[str], curie_constraints: dict) -> Di
         mapping_dict = {}
         allowed_ontologies = curie_constraints["ontologies"]
 
-        # Remove any suffixes if any
-        # original_ids will have untouched ids which will be used for mapping
-        # id_suffixes will save suffixes if any, these will be used to append to labels
-        # ids will have the ids without suffixes
-        original_ids = ids.copy()
-        id_suffixes = [""] * len(ids)
-
-        if "suffixes" in curie_constraints:
-            for i in range(len(ids)):
-                ids[i], id_suffixes[i] = Validator._curie_remove_suffix(ids[i], curie_constraints["suffixes"])
-
-        for original_id, id, id_suffix in zip(original_ids, ids, id_suffixes):
+        # Map term_ids to their human-readable ontology labels
+        for term_id in ids:
             # If there are exceptions the label should be the same as the id
-            if "exceptions" in curie_constraints and original_id in curie_constraints["exceptions"]:
-                mapping_dict[original_id] = original_id
+            if "exceptions" in curie_constraints and term_id in curie_constraints["exceptions"]:
+                mapping_dict[term_id] = term_id
                 continue
 
             for ontology_name in allowed_ontologies:
                 if ontology_name == "NA":
                     continue
-                if ONTOLOGY_CHECKER.is_valid_term_id(ontology_name, id):
-                    mapping_dict[original_id] = ONTOLOGY_CHECKER.get_term_label(ontology_name, id) + id_suffix
+                if ONTOLOGY_CHECKER.is_valid_term_id(ontology_name, term_id):
+                    mapping_dict[term_id] = ONTOLOGY_CHECKER.get_term_label(ontology_name, term_id)
 
         # Check that all ids got a mapping. All ids should be found if adata was validated
-        for id in original_ids:
+        for id in ids:
             if id not in mapping_dict:
                 raise ValueError(f"Add labels error: Unable to get label for '{id}'")
 

diff --git a/cellxgene_schema_cli/setup.py b/cellxgene_schema_cli/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="cellxgene-schema",
-    version="4.0.0-rc.0",
+    version="4.0.0-rc.1",
     url="https://github.com/chanzuckerberg/single-cell-curation",
     license="MIT",
     author="Chan Zuckerberg Initiative",

diff --git a/cellxgene_schema_cli/tests/fixtures/examples_validate.py b/cellxgene_schema_cli/tests/fixtures/examples_validate.py
@@ -41,6 +41,7 @@
             "NCBITaxon:9606",
             "PATO:0000383",
             "UBERON:0002048",
+            "tissue",
             True,
             "HANCESTRO:0575",
             "HsapDv:0000003",
@@ -53,7 +54,8 @@
             "PATO:0000461",
             "NCBITaxon:10090",
             "unknown",
-            "CL:0000192 (cell culture)",
+            "CL:0000192",
+            "cell culture",
             False,
             "na",
             "MmusDv:0000003",
@@ -69,6 +71,7 @@
         "organism_ontology_term_id",
         "sex_ontology_term_id",
         "tissue_ontology_term_id",
+        "tissue_type",
         "is_primary_data",
         "self_reported_ethnicity_ontology_term_id",
         "development_stage_ontology_term_id",
@@ -79,6 +82,7 @@
 
 good_obs.loc[:, ["donor_id"]] = good_obs.astype("category")
 good_obs.loc[:, ["suspension_type"]] = good_obs.astype("category")
+good_obs.loc[:, ["tissue_type"]] = good_obs.astype("category")
 
 # Expected obs, this is what the obs above should look like after adding the necessary columns with the validator,
 # these columns are defined in the schema
@@ -100,7 +104,7 @@
             "normal",
             "Mus musculus",
             "unknown",
-            "smooth muscle cell (cell culture)",
+            "smooth muscle cell",
             "na",
             "Theiler stage 01",
         ],

diff --git a/cellxgene_schema_cli/tests/fixtures/h5ads/example_invalid_CL.h5ad b/cellxgene_schema_cli/tests/fixtures/h5ads/example_invalid_CL.h5ad
diff --git a/cellxgene_schema_cli/tests/fixtures/h5ads/example_valid.h5ad b/cellxgene_schema_cli/tests/fixtures/h5ads/example_valid.h5ad
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-__version__ = "4.0.0-rc.0"
		+__version__ = "4.0.0-rc.1"