From 025bd0d60c1de2ac75d648bcdec7d8ab8b65a624 Mon Sep 17 00:00:00 2001 From: nayib-jose-gloria Date: Mon, 18 Sep 2023 17:17:05 -0400 Subject: [PATCH] refactor _get_observation_joinid into util function for reuse --- cellxgene_schema_cli/cellxgene_schema/utils.py | 15 +++++++++++++++ .../cellxgene_schema/write_labels.py | 6 ++---- .../tests/fixtures/examples_validate.py | 10 ++-------- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/cellxgene_schema_cli/cellxgene_schema/utils.py b/cellxgene_schema_cli/cellxgene_schema/utils.py index a552ef57f..1b54450c1 100644 --- a/cellxgene_schema_cli/cellxgene_schema/utils.py +++ b/cellxgene_schema_cli/cellxgene_schema/utils.py @@ -1,11 +1,13 @@ import logging import os import sys +from base64 import b85encode from typing import List, Union import anndata as ad import numpy as np from scipy import sparse +from xxhash import xxh3_64_intdigest logger = logging.getLogger(__name__) @@ -144,3 +146,16 @@ def _enforce_canonical_format(df): if adata.raw: logger.info("enforce canonical format in raw.X") _enforce_canonical_format(adata.raw) + + +def get_hash_digest_column(dataframe): + """ + Get column with hash digest for each row in dataframe. + """ + + return ( + dataframe.index.to_series() + .map(xxh3_64_intdigest) + .astype(np.uint64) + .apply(lambda v: b85encode(v.to_bytes(8, "big")).decode("ascii")) + ) diff --git a/cellxgene_schema_cli/cellxgene_schema/write_labels.py b/cellxgene_schema_cli/cellxgene_schema/write_labels.py index 23a9a6732..6d87ba3dc 100644 --- a/cellxgene_schema_cli/cellxgene_schema/write_labels.py +++ b/cellxgene_schema_cli/cellxgene_schema/write_labels.py @@ -1,17 +1,15 @@ import logging import traceback -from base64 import b85encode from typing import Dict, List, Optional import numpy as np import pandas as pd -from xxhash import xxh3_64_intdigest from cellxgene_schema import ontology from cellxgene_schema.env import SCHEMA_REFERENCE_BASE_URL, SCHEMA_REFERENCE_FILE_NAME from cellxgene_schema.validate import ONTOLOGY_CHECKER, Validator -from .utils import enforce_canonical_format, getattr_anndata +from .utils import enforce_canonical_format, getattr_anndata, get_hash_digest_column logger = logging.getLogger(__name__) @@ -361,7 +359,7 @@ def write_labels(self, add_labels_file: str): self.adata.uns["schema_version"] = self.validator.schema_version self.adata.uns["schema_reference"] = self._build_schema_reference_url(self.validator.schema_version) - self.adata.obs["observation_joinid"] = self._get_observation_joinid_column() + self.adata.obs["observation_joinid"] = get_hash_digest_column(self.adata.obs) enforce_canonical_format(self.adata) diff --git a/cellxgene_schema_cli/tests/fixtures/examples_validate.py b/cellxgene_schema_cli/tests/fixtures/examples_validate.py index 0bd619d7b..4b933bb78 100644 --- a/cellxgene_schema_cli/tests/fixtures/examples_validate.py +++ b/cellxgene_schema_cli/tests/fixtures/examples_validate.py @@ -3,9 +3,8 @@ import numpy import anndata import os -from base64 import b85encode from scipy import sparse -from xxhash import xxh3_64_intdigest +from cellxgene_schema.utils import get_hash_digest_column # -----------------------------------------------------------------# # General example information @@ -120,12 +119,7 @@ ], ) -obs_expected["observation_joinid"] = ( - obs_expected.index.to_series() - .map(xxh3_64_intdigest) - .astype(numpy.uint64) - .apply(lambda v: b85encode(v.to_bytes(8, "big")).decode("ascii")) -) +obs_expected["observation_joinid"] = get_hash_digest_column(obs_expected) # --- # 2. Creating individual var components: valid object and valid object and with labels