Skip to content

Commit

Permalink
refactor _get_observation_joinid into util function for reuse
Browse files Browse the repository at this point in the history
  • Loading branch information
nayib-jose-gloria committed Sep 18, 2023
1 parent 7b9d998 commit 025bd0d
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 12 deletions.
15 changes: 15 additions & 0 deletions cellxgene_schema_cli/cellxgene_schema/utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import logging
import os
import sys
from base64 import b85encode
from typing import List, Union

import anndata as ad
import numpy as np
from scipy import sparse
from xxhash import xxh3_64_intdigest

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -144,3 +146,16 @@ def _enforce_canonical_format(df):
if adata.raw:
logger.info("enforce canonical format in raw.X")
_enforce_canonical_format(adata.raw)


def get_hash_digest_column(dataframe):
    """
    Build a column of hash digests derived from the index of ``dataframe``.

    Each index label is hashed with ``xxh3_64_intdigest``, the result is cast
    to an unsigned 64-bit integer, serialized as 8 big-endian bytes, and
    base85-encoded into an ASCII string. Only the index labels are hashed;
    the contents of the rows are not read.

    :param dataframe: object exposing a pandas-style ``index``
        (presumably a ``pandas.DataFrame`` -- confirm against callers)

    :return: Series produced from ``dataframe.index.to_series()``, holding
        one encoded digest string per index label
    """

    def _encode_digest(digest):
        # Fixed-width 8-byte big-endian form of the 64-bit digest, then base85 text.
        raw_bytes = digest.to_bytes(8, "big")
        return b85encode(raw_bytes).decode("ascii")

    index_labels = dataframe.index.to_series()
    digests = index_labels.map(xxh3_64_intdigest).astype(np.uint64)
    return digests.apply(_encode_digest)
6 changes: 2 additions & 4 deletions cellxgene_schema_cli/cellxgene_schema/write_labels.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
import logging
import traceback
from base64 import b85encode
from typing import Dict, List, Optional

import numpy as np
import pandas as pd
from xxhash import xxh3_64_intdigest

from cellxgene_schema import ontology
from cellxgene_schema.env import SCHEMA_REFERENCE_BASE_URL, SCHEMA_REFERENCE_FILE_NAME
from cellxgene_schema.validate import ONTOLOGY_CHECKER, Validator

from .utils import enforce_canonical_format, getattr_anndata
from .utils import enforce_canonical_format, getattr_anndata, get_hash_digest_column

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -361,7 +359,7 @@ def write_labels(self, add_labels_file: str):

self.adata.uns["schema_version"] = self.validator.schema_version
self.adata.uns["schema_reference"] = self._build_schema_reference_url(self.validator.schema_version)
self.adata.obs["observation_joinid"] = self._get_observation_joinid_column()
self.adata.obs["observation_joinid"] = get_hash_digest_column(self.adata.obs)

enforce_canonical_format(self.adata)

Expand Down
10 changes: 2 additions & 8 deletions cellxgene_schema_cli/tests/fixtures/examples_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,8 @@
import numpy
import anndata
import os
from base64 import b85encode
from scipy import sparse
from xxhash import xxh3_64_intdigest
from cellxgene_schema.utils import get_hash_digest_column

# -----------------------------------------------------------------#
# General example information
Expand Down Expand Up @@ -120,12 +119,7 @@
],
)

obs_expected["observation_joinid"] = (
obs_expected.index.to_series()
.map(xxh3_64_intdigest)
.astype(numpy.uint64)
.apply(lambda v: b85encode(v.to_bytes(8, "big")).decode("ascii"))
)
obs_expected["observation_joinid"] = get_hash_digest_column(obs_expected)

# ---
# 2. Creating individual var components: valid object and valid object with labels
Expand Down

0 comments on commit 025bd0d

Please sign in to comment.