feat: add observation_joinid to obs during add-labels (#629)

* feat: add observation_joinid to obs during add-labels
chanzuckerberg · Sep 21, 2023 · 9b92401 · 9b92401
1 parent de1e00f
commit 9b92401
Show file tree

Hide file tree

Showing 7 changed files with 56 additions and 4 deletions.
diff --git a/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml b/cellxgene_schema_cli/cellxgene_schema/schema_definitions/schema_definition.yaml
@@ -102,6 +102,8 @@ components:
         deprecated_columns:
             - ethnicity
             - ethnicity_ontology_term_id
+        reserved_columns:
+            - observation_joinid
         columns:
             cell_type_ontology_term_id:
                 type: curie

diff --git a/cellxgene_schema_cli/cellxgene_schema/utils.py b/cellxgene_schema_cli/cellxgene_schema/utils.py
@@ -1,11 +1,13 @@
 import logging
 import os
 import sys
+from base64 import b85encode
 from typing import List, Union
 
 import anndata as ad
 import numpy as np
 from scipy import sparse
+from xxhash import xxh3_64_intdigest
 
 logger = logging.getLogger(__name__)
 
@@ -146,3 +148,16 @@ def _enforce_canonical_format(df):
     if adata.raw:
         logger.info("enforce canonical format in raw.X")
         _enforce_canonical_format(adata.raw)
+
+
+def get_hash_digest_column(dataframe):
+    """
+    Return a Series, indexed like dataframe, with a base85-encoded 64-bit xxh3 digest of each index label.
+    """
+
+    return (
+        dataframe.index.to_series()  # only the index label is hashed, not the row's column values
+        .map(xxh3_64_intdigest)  # one integer digest per label
+        .astype(np.uint64)
+        .apply(lambda v: b85encode(v.to_bytes(8, "big")).decode("ascii"))  # 8 big-endian bytes -> ASCII text
+    )
diff --git a/cellxgene_schema_cli/cellxgene_schema/write_labels.py b/cellxgene_schema_cli/cellxgene_schema/write_labels.py
@@ -8,7 +8,7 @@
 from cellxgene_schema.env import SCHEMA_REFERENCE_BASE_URL, SCHEMA_REFERENCE_FILE_NAME
 from cellxgene_schema.validate import ONTOLOGY_CHECKER, Validator
 
-from .utils import enforce_canonical_format, getattr_anndata
+from .utils import enforce_canonical_format, get_hash_digest_column, getattr_anndata
 
 logger = logging.getLogger(__name__)
 
@@ -350,16 +350,17 @@ def write_labels(self, add_labels_file: str):
         :rtype None
         """
         logger.info("Writing labels")
-        # Add labels in obs
+        # Add columns to dataset dataframes based on values in other columns, as defined in schema definition yaml
         self._add_labels()
 
         # Remove unused categories
         self._remove_categories_with_zero_values()
 
-        # Set version
+        # Annotate Reserved Columns
+
         self.adata.uns["schema_version"] = self.validator.schema_version
-        # Set schema reference URL
         self.adata.uns["schema_reference"] = self._build_schema_reference_url(self.validator.schema_version)
+        self.adata.obs["observation_joinid"] = get_hash_digest_column(self.adata.obs)
 
         enforce_canonical_format(self.adata)
 

diff --git a/cellxgene_schema_cli/requirements.txt b/cellxgene_schema_cli/requirements.txt
@@ -9,3 +9,4 @@ pytest==7.2.2
 PyYaml==6.0
 wheel==0.40.0
 semver==3.0.0
+xxhash==3.3.0
diff --git a/cellxgene_schema_cli/tests/fixtures/examples_validate.py b/cellxgene_schema_cli/tests/fixtures/examples_validate.py
@@ -4,6 +4,7 @@
 import anndata
 import os
 from scipy import sparse
+from cellxgene_schema.utils import get_hash_digest_column
 
 # -----------------------------------------------------------------#
 # General example information
@@ -122,6 +123,8 @@
     ],
 )
 
+obs_expected["observation_joinid"] = get_hash_digest_column(obs_expected)
+
 # ---
 # 2. Creating individual var components: valid object and valid object and with labels
 

diff --git a/cellxgene_schema_cli/tests/test_schema_compliance.py b/cellxgene_schema_cli/tests/test_schema_compliance.py
@@ -224,6 +224,27 @@ def test_column_presence_assay(self):
             ],
         )
 
+    def test_obs_reserved_columns_presence(self):
+        """
+        Reserved columns must NOT be used in obs: each one present must yield exactly one validation error
+        """
+
+        for reserved_column in self.validator.schema_def["components"]["obs"]["reserved_columns"]:
+            with self.subTest(column=reserved_column):
+                # Reset to a fresh copy of examples.adata and an empty error list so subtests share no state
+                self.validator.adata = examples.adata.copy()
+                self.validator.errors = []
+
+                self.validator.adata.obs[reserved_column] = "dummy_value"  # presumably only presence matters
+                self.validator.validate_adata()
+                self.assertEqual(
+                    self.validator.errors,
+                    [
+                        f"ERROR: Column '{reserved_column}' is a reserved column name "
+                        f"of 'obs'. Remove it from h5ad and try again."
+                    ],
+                )
+
     def test_obsolete_term_id(self):
         """
         Terms documented as obsolete in an ontology MUST NOT be used. For example, EFO:0009310

diff --git a/cellxgene_schema_cli/tests/test_utils.py b/cellxgene_schema_cli/tests/test_utils.py
@@ -1,8 +1,10 @@
 import numpy as np
+import pandas as pd
 import pytest
 from anndata import AnnData
 from cellxgene_schema.utils import (
     enforce_canonical_format,
+    get_hash_digest_column,
     map_ontology_term,
     read_h5ad,
     remove_deprecated_features,
@@ -110,6 +112,13 @@ def test_adata_with_canonical_X(self, adata_without_raw):
         assert adata_without_raw.X.has_canonical_format is True
 
 
+class TestGetHashDigestColumn:
+    def test_get_hash_digest_column(self, adata_with_raw):
+        hash_digest_column = get_hash_digest_column(adata_with_raw.obs)
+        expected_column = pd.Series(["ab6yl9v%fZ", "f-dZLjjiRl"], index=["X", "Y"])  # pinned; assumes fixture obs index is X, Y
+        pd.testing.assert_series_equal(hash_digest_column, expected_column)  # compares values and index labels
+
+
 class TestReadH5AD:
     def test_read_h5ad(self):
         h5ad_path = h5ad_valid