[GEN-178] Update to pandas 2.0 and upgraded synapseclient #559

Draft · wants to merge 6 commits into base: develop
153 changes: 125 additions & 28 deletions genie/load.py
@@ -172,50 +172,147 @@ def _update_table(
    to_delete: bool = False,
):
    """
    A helper function to compare a new dataset with existing data
    and store any changes that need to be made to the database
    """
    changes = check_database_changes(database, new_dataset, primary_key_cols, to_delete)
    store_database(
        syn,
        database_synid,
        changes["col_order"],
        changes["allupdates"],
        changes["to_delete_rows"],
    )


def _get_col_order(orig_database_cols: pd.Index) -> List[str]:
    """
    Get column order

    Args:
        orig_database_cols (pd.Index): A list of column names of the original database

    Returns:
        The list of re-ordered column names
    """
    # Columns must be in the same order, with ROW_ID and ROW_VERSION first
    col_order = ["ROW_ID", "ROW_VERSION"]
    col_order.extend(orig_database_cols.tolist())
    return col_order
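
For illustration, a quick sketch of the expected output (column names here are hypothetical):

    >>> _get_col_order(pd.Index(["PATIENT_ID", "SAMPLE_ID"]))
    ['ROW_ID', 'ROW_VERSION', 'PATIENT_ID', 'SAMPLE_ID']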


def _reorder_new_dataset(
    orig_database_cols: pd.Index, new_dataset: pd.DataFrame
) -> pd.DataFrame:
    """
    Reorder new dataset based on the original database

    Args:
        orig_database_cols (pd.Index): A list of column names of the original database
        new_dataset (pd.DataFrame): New Data

    Returns:
        The re-ordered new dataset
    """
    # Columns must be in the same order as the original data
    new_dataset = new_dataset[orig_database_cols]
    return new_dataset
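
A quick sketch with made-up columns:

    df = pd.DataFrame({"B": [2], "A": [1]})
    _reorder_new_dataset(pd.Index(["A", "B"]), df)  # columns are now ordered ['A', 'B']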


def _generate_primary_key(
    dataset: pd.DataFrame, primary_key_cols: List[str], primary_key: str
) -> pd.DataFrame:
    """
    Generate primary key column for a dataframe

    Args:
        dataset (pd.DataFrame): A dataframe
        primary_key_cols (list): Column(s) that make up the primary key
        primary_key (str): The column name of the primary key

    Returns:
        The dataframe with primary_key column added
    """
    # replace NAs with empty string
    dataset = dataset.fillna("")
    # generate the primary key column by joining the key columns as strings
    dataset[primary_key_cols] = dataset[primary_key_cols].applymap(str)
    if dataset.empty:
        dataset[primary_key] = ""
    else:
        dataset[primary_key] = dataset[primary_key_cols].apply(
            lambda x: " ".join(x), axis=1
        )
    return dataset

Member comment on lines +228 to +231: Nit: Make sure arguments match
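
A toy example of the key generation (values are illustrative):

    >>> df = pd.DataFrame({"PATIENT_ID": ["P1"], "SAMPLE_ID": ["S1"]})
    >>> _generate_primary_key(df, ["PATIENT_ID", "SAMPLE_ID"], "UNIQUE_KEY")["UNIQUE_KEY"].tolist()
    ['P1 S1']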


def check_database_changes(
    database: pd.DataFrame,
    new_dataset: pd.DataFrame,
    primary_key_cols: List[str],
    to_delete: bool = False,
) -> Dict[str, Union[pd.DataFrame, List[str]]]:
    """
    Check changes that need to be made, i.e. append/update/delete rows to the database
    based on its comparison with new data

    Args:
        database (pd.DataFrame): Original Data
        new_dataset (pd.DataFrame): New Data
        primary_key_cols (list): Column(s) that make up the primary key
        to_delete (bool, optional): Delete rows. Defaults to False

    Returns:
        A dictionary with the final column order, the rows to append/update,
        and the rows to delete
    """
    # get a list of column names of the original database
    orig_database_cols = database.columns
    # get the final column order
    col_order = _get_col_order(orig_database_cols)
    # reorder new_dataset
    new_dataset = _reorder_new_dataset(orig_database_cols, new_dataset)
    # set the primary_key name
    primary_key = "UNIQUE_KEY"
    # generate primary_key column for dataset comparison
    ori_data = _generate_primary_key(database, primary_key_cols, primary_key)
    new_data = _generate_primary_key(new_dataset, primary_key_cols, primary_key)
    # output dictionary
    changes = {"col_order": col_order, "allupdates": None, "to_delete_rows": None}
    # get rows to be appended or updated
    allupdates = pd.DataFrame(columns=col_order)
    to_append_rows = process_functions._append_rows(new_data, ori_data, primary_key)
    to_update_rows = process_functions._update_rows(new_data, ori_data, primary_key)
    allupdates = pd.concat([allupdates, to_append_rows, to_update_rows], sort=False)
    changes["allupdates"] = allupdates
    # get rows to be deleted
    if to_delete:
        to_delete_rows = process_functions._delete_rows(new_data, ori_data, primary_key)
    else:
        to_delete_rows = pd.DataFrame()
    changes["to_delete_rows"] = to_delete_rows
    return changes
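
A sketch of how the returned dictionary might be consumed (inputs here are assumed, not from the PR):

    changes = check_database_changes(database, new_dataset, ["SAMPLE_ID"], to_delete=True)
    changes["col_order"]       # ['ROW_ID', 'ROW_VERSION', ...original columns]
    changes["allupdates"]      # DataFrame of rows to append or update
    changes["to_delete_rows"]  # DataFrame of rows to delete (empty when to_delete=False)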


def store_database(
    syn: synapseclient.Synapse,
    database_synid: str,
    col_order: List[str],
    allupdates: pd.DataFrame,
    to_delete_rows: pd.DataFrame,
) -> None:

Member comment: Nit: return type and inconsistent parameter naming (all_updates)

    """
    Store changes to the database

    Args:
        syn (synapseclient.Synapse): Synapse object
        database_synid (str): Synapse Id of the Synapse table
        col_order (List[str]): The ordered column names to be saved
        allupdates (pd.DataFrame): rows to be appended and/or updated
        to_delete_rows (pd.DataFrame): rows to be deleted

    Returns:
        None
    """
    storedatabase = False
    update_all_file = tempfile.NamedTemporaryFile(
        dir=process_functions.SCRIPT_DIR, delete=False
    )

    with open(update_all_file.name, "w") as updatefile:
        # Must write out the headers in case there are no appends or updates
        updatefile.write(",".join(col_order) + "\n")
93 changes: 70 additions & 23 deletions genie_registry/clinical.py
@@ -5,7 +5,7 @@
from io import StringIO
import logging
import os
from typing import Optional, Tuple

import pandas as pd
import synapseclient
@@ -392,7 +392,7 @@ def preprocess(self, newpath):
            "sample is True and inClinicalDb is True"
        )
        sample_cols = sample_cols_table.asDataFrame()["fieldName"].tolist()
        clinicalTemplate = pd.DataFrame(columns=list(set(patient_cols + sample_cols)))
        sample = True
        patient = True
@@ -472,6 +472,68 @@ def process_steps(
        newClinicalDf.to_csv(newPath, sep="\t", index=False)
        return newPath

    def _validate_oncotree_code_mapping(
        self: "Clinical", clinicaldf: pd.DataFrame, oncotree_mapping: pd.DataFrame
    ) -> pd.Index:
        """Checks that the oncotree codes in the input clinical
        data are valid oncotree codes from the official oncotree site

        Args:
            clinicaldf (pd.DataFrame): clinical input data to validate
            oncotree_mapping (pd.DataFrame): table of official oncotree
                mappings

        Returns:
            pd.Index: row indices of unmapped oncotree codes in the
                input clinical data
        """
        # Make oncotree codes uppercase (SpCC/SPCC)
        clinicaldf["ONCOTREE_CODE"] = (
            clinicaldf["ONCOTREE_CODE"].astype(str).str.upper()
        )

        unmapped_oncotrees = clinicaldf[
            (clinicaldf["ONCOTREE_CODE"] != "UNKNOWN")
            & ~(clinicaldf["ONCOTREE_CODE"].isin(oncotree_mapping["ONCOTREE_CODE"]))
        ]
        return unmapped_oncotrees.index

Member comment: Nit: these don't use self so you might be able to add these as @classmethod
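
A toy illustration of the mapping check (dataframes are made up; `clinical` stands in for a Clinical instance):

    clin = pd.DataFrame({"ONCOTREE_CODE": ["luad", "FAKE", "UNKNOWN"]})
    mapping = pd.DataFrame({"ONCOTREE_CODE": ["LUAD"]})
    clinical._validate_oncotree_code_mapping(clin, mapping)
    # returns Index([1]): "luad" is uppercased and maps, "UNKNOWN" is skipped, only "FAKE" is unmapped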

    def _validate_oncotree_code_mapping_message(
        self: "Clinical",
        clinicaldf: pd.DataFrame,
        unmapped_oncotree_indices: pd.Index,
    ) -> Tuple[str, str]:
        """This function returns the error and warning messages
        if the input clinical data has row indices with unmapped
        oncotree codes

        Args:
            clinicaldf (pd.DataFrame): input clinical data
            unmapped_oncotree_indices (pd.Index): row indices of the
                input clinical data with unmapped oncotree codes

        Returns:
            Tuple[str, str]: error message that tells you how many
                samples AND the unique unmapped oncotree codes that your
                input clinical data has
        """
        errors = ""
        warnings = ""
        if len(unmapped_oncotree_indices) > 0:
            # sort the unique unmapped oncotree codes
            unmapped_oncotree_codes = sorted(
                set(clinicaldf.loc[unmapped_oncotree_indices]["ONCOTREE_CODE"])
            )
            errors = (
                "Sample Clinical File: Please double check that all your "
                "ONCOTREE CODES exist in the mapping. You have {} samples "
                "that don't map. These are the codes that "
                "don't map: {}\n".format(
                    len(unmapped_oncotree_indices), ",".join(unmapped_oncotree_codes)
                )
            )
        return errors, warnings

Member comment: Same comment about classmethod
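
Continuing the toy example above, the message helper would report the single unmapped code (abridged output):

    errors, warnings = clinical._validate_oncotree_code_mapping_message(clin, pd.Index([1]))
    # errors -> "Sample Clinical File: ... You have 1 samples that don't map. ... don't map: FAKE\n"
    # warnings -> ""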

    # VALIDATION
    def _validate(self, clinicaldf):
        """
@@ -641,28 +703,13 @@ def _validate(self, clinicaldf):
        maleOncoCodes = ["TESTIS", "PROSTATE", "PENIS"]
        womenOncoCodes = ["CERVIX", "VULVA", "UTERUS", "OVARY"]
        if haveColumn:
            unmapped_indices = self._validate_oncotree_code_mapping(
                clinicaldf, oncotree_mapping
            )
            errors, warnings = self._validate_oncotree_code_mapping_message(
                clinicaldf, unmapped_indices
            )
            total_error.write(errors)
        # Should add the SEX mismatch into the dashboard file
        if (
            process_functions.checkColExist(clinicaldf, "SEX")
4 changes: 2 additions & 2 deletions requirements.txt
@@ -2,8 +2,8 @@
chardet>=3.0.4
# known working version 0.20.4
httplib2>=0.11.3
pandas==2.0.0
pyranges==0.0.115
# known working version 6.0
PyYAML>=5.1
synapseclient>=3.0.0,<4.0.0

Member comment: Is there a reason <4.0.0? Would it be worth it to bump it all the way up to the 4 series?

4 changes: 2 additions & 2 deletions setup.cfg
@@ -29,8 +29,8 @@ project_urls =
[options]
packages = find:
install_requires =
    synapseclient>=3.0.0, <4.0.0

Member comment: Same comment here about synapseclient; also, either in this ticket or another ticket, bump the python versions we support.

    pandas==2.0.0
    httplib2>=0.11.3
    PyYAML>=5.1
    chardet>=3.0.4