Skip to content

Commit

Permalink
feat: add observation_joinid to obs during add-labels (#629)
Browse files Browse the repository at this point in the history
* feat: add observation_joinid to obs during add-labels
  • Loading branch information
nayib-jose-gloria authored Sep 21, 2023
1 parent de1e00f commit 9b92401
Show file tree
Hide file tree
Showing 7 changed files with 56 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ components:
deprecated_columns:
- ethnicity
- ethnicity_ontology_term_id
reserved_columns:
- observation_joinid
columns:
cell_type_ontology_term_id:
type: curie
Expand Down
15 changes: 15 additions & 0 deletions cellxgene_schema_cli/cellxgene_schema/utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import logging
import os
import sys
from base64 import b85encode
from typing import List, Union

import anndata as ad
import numpy as np
from scipy import sparse
from xxhash import xxh3_64_intdigest

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -146,3 +148,16 @@ def _enforce_canonical_format(df):
if adata.raw:
logger.info("enforce canonical format in raw.X")
_enforce_canonical_format(adata.raw)


def get_hash_digest_column(dataframe):
    """
    Build a Series holding one short hash string per row label of dataframe.

    Each index label is hashed with xxh3_64_intdigest, held as an unsigned 64-bit integer,
    serialized to 8 big-endian bytes and base85-encoded into an ASCII string. The returned
    Series is derived from dataframe.index.to_series(), so it carries the same row labels.
    """

    def _encode(hash_value):
        # 8 bytes in, 10 base85 characters out
        return b85encode(hash_value.to_bytes(8, "big")).decode("ascii")

    row_labels = dataframe.index.to_series()
    label_hashes = row_labels.map(xxh3_64_intdigest).astype(np.uint64)
    return label_hashes.apply(_encode)
9 changes: 5 additions & 4 deletions cellxgene_schema_cli/cellxgene_schema/write_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from cellxgene_schema.env import SCHEMA_REFERENCE_BASE_URL, SCHEMA_REFERENCE_FILE_NAME
from cellxgene_schema.validate import ONTOLOGY_CHECKER, Validator

from .utils import enforce_canonical_format, getattr_anndata
from .utils import enforce_canonical_format, get_hash_digest_column, getattr_anndata

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -350,16 +350,17 @@ def write_labels(self, add_labels_file: str):
:rtype None
"""
logger.info("Writing labels")
# Add labels in obs
# Add columns to dataset dataframes based on values in other columns, as defined in schema definition yaml
self._add_labels()

# Remove unused categories
self._remove_categories_with_zero_values()

# Set version
# Annotate Reserved Columns

self.adata.uns["schema_version"] = self.validator.schema_version
# Set schema reference URL
self.adata.uns["schema_reference"] = self._build_schema_reference_url(self.validator.schema_version)
self.adata.obs["observation_joinid"] = get_hash_digest_column(self.adata.obs)

enforce_canonical_format(self.adata)

Expand Down
1 change: 1 addition & 0 deletions cellxgene_schema_cli/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ pytest==7.2.2
PyYaml==6.0
wheel==0.40.0
semver==3.0.0
xxhash==3.3.0
3 changes: 3 additions & 0 deletions cellxgene_schema_cli/tests/fixtures/examples_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import anndata
import os
from scipy import sparse
from cellxgene_schema.utils import get_hash_digest_column

# -----------------------------------------------------------------#
# General example information
Expand Down Expand Up @@ -122,6 +123,8 @@
],
)

obs_expected["observation_joinid"] = get_hash_digest_column(obs_expected)

# ---
# 2. Creating individual var components: valid object and valid object with labels

Expand Down
21 changes: 21 additions & 0 deletions cellxgene_schema_cli/tests/test_schema_compliance.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,27 @@ def test_column_presence_assay(self):
],
)

def test_obs_reserved_columns_presence(self):
    """
    Every column name listed under reserved_columns for obs must be reported as an error
    when it is present in obs
    """

    reserved_names = self.validator.schema_def["components"]["obs"]["reserved_columns"]
    for reserved_column in reserved_names:
        with self.subTest(column=reserved_column):
            # Start each case from a fresh copy of the example data and an empty error list
            self.validator.adata = examples.adata.copy()
            self.validator.errors = []

            # Inject the reserved column, then run validation
            self.validator.adata.obs[reserved_column] = "dummy_value"
            self.validator.validate_adata()

            expected_errors = [
                f"ERROR: Column '{reserved_column}' is a reserved column name "
                f"of 'obs'. Remove it from h5ad and try again."
            ]
            self.assertEqual(self.validator.errors, expected_errors)

def test_obsolete_term_id(self):
"""
Terms documented as obsolete in an ontology MUST NOT be used. For example, EFO:0009310
Expand Down
9 changes: 9 additions & 0 deletions cellxgene_schema_cli/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import numpy as np
import pandas as pd
import pytest
from anndata import AnnData
from cellxgene_schema.utils import (
enforce_canonical_format,
get_hash_digest_column,
map_ontology_term,
read_h5ad,
remove_deprecated_features,
Expand Down Expand Up @@ -110,6 +112,13 @@ def test_adata_with_canonical_X(self, adata_without_raw):
assert adata_without_raw.X.has_canonical_format is True


class TestGetHashDigestColumn:
    def test_get_hash_digest_column(self, adata_with_raw):
        """The digest column computed from the fixture's obs matches the known encoded values."""
        expected = pd.Series(["ab6yl9v%fZ", "f-dZLjjiRl"], index=["X", "Y"])
        actual = get_hash_digest_column(adata_with_raw.obs)
        pd.testing.assert_series_equal(actual, expected)


class TestReadH5AD:
def test_read_h5ad(self):
h5ad_path = h5ad_valid
Expand Down

0 comments on commit 9b92401

Please sign in to comment.