[GEN-178] Update to pandas 2.0 and upgraded synapseclient #559
base: develop
Changes from all commits: 1d8f1b2, 50212a7, e306ea9, eb18202, e2c2321, 0e81107
```diff
@@ -172,50 +172,147 @@ def _update_table(
     to_delete: bool = False,
 ):
     """
-    Updates synapse tables by a row identifier with another
-    dataset that has the same number and order of columns
+    A helper function to compare new dataset with existing data,
+    and store any changes that need to be made to the database
     """
+    changes = check_database_changes(database, new_dataset, primary_key_cols, to_delete)
+    store_database(
+        syn,
+        database_synid,
+        changes["col_order"],
+        changes["allupdates"],
+        changes["to_delete_rows"],
+    )
+
+
+def _get_col_order(orig_database_cols: pd.Index) -> List[str]:
+    """
+    Get column order
+
     Args:
-        syn (synapseclient.Synaps): Synapse object
-        database (pd.DataFrame): Original Data
-        new_dataset (pd.DataFrame): New Data
-        database_synid (str): Synapse Id of the Synapse table
-        primary_key_cols (list): Column(s) that make up the primary key
-        to_delete (bool, optional): Delete rows. Defaults to False
+        orig_database_cols (pd.Index): A list of column names of the original database
+
+    Returns:
+        The list of re-ordered column names
     """
-    primary_key = "UNIQUE_KEY"
-    database = database.fillna("")
-    orig_database_cols = database.columns
     col_order = ["ROW_ID", "ROW_VERSION"]
     col_order.extend(orig_database_cols.tolist())
-    new_dataset = new_dataset.fillna("")
-    # Columns must be in the same order
+    return col_order
+
+
+def _reorder_new_dataset(
+    orig_database_cols: pd.Index, new_dataset: pd.DataFrame
+) -> pd.DataFrame:
+    """
+    Reorder new dataset based on the original database
+
+    Args:
+        orig_database_cols (pd.Index): A list of column names of the original database
+        new_dataset(pd.DataFrame): New Data
+
+    Returns:
+        The re-ordered new dataset
+    """
+    # Columns must be in the same order as the original data
     new_dataset = new_dataset[orig_database_cols]
-    database[primary_key_cols] = database[primary_key_cols].applymap(str)
-    database[primary_key] = database[primary_key_cols].apply(
-        lambda x: " ".join(x), axis=1
-    )
+    return new_dataset
+
+
+def _generate_primary_key(
+    dataset: pd.DataFrame, primary_key_cols: List[str], primary_key: str
+) -> pd.DataFrame:
+    """
+    Generate primary key column a dataframe
+
+    Args:
+        dataset(pd.DataFrame): A dataframe
+        new_dataset: The re-ordered new dataset
+        primary_key_cols (list): Column(s) that make up the primary key
+        primary_key: The column name of the primary_key
+    Returns:
+        The dataframe with primary_key column added
+    """
+    # replace NAs with emtpy string
+    dataset = dataset.fillna("")
+    # generate primary key column for original database
+    dataset[primary_key_cols] = dataset[primary_key_cols].applymap(str)
+    if dataset.empty:
+        dataset[primary_key] = ""
+    else:
+        dataset[primary_key] = dataset[primary_key_cols].apply(
+            lambda x: " ".join(x), axis=1
+        )
+    return dataset
```
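For readers skimming the refactor: the composite-key logic itself is unchanged, just centralized in `_generate_primary_key`. A minimal standalone sketch of what it produces (toy column names, not from this repo):

```python
import pandas as pd

df = pd.DataFrame({"CENTER": ["JHU", "DFCI"], "SAMPLE_ID": [1, 2]})
primary_key_cols = ["CENTER", "SAMPLE_ID"]

# Stringify the key columns, then join them row-wise into one key,
# mirroring the applymap/apply pattern in the diff above.
df[primary_key_cols] = df[primary_key_cols].applymap(str)
df["UNIQUE_KEY"] = df[primary_key_cols].apply(lambda x: " ".join(x), axis=1)
print(df["UNIQUE_KEY"].tolist())  # ['JHU 1', 'DFCI 2']
```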
```diff
+
-    new_dataset[primary_key_cols] = new_dataset[primary_key_cols].applymap(str)
-    new_dataset[primary_key] = new_dataset[primary_key_cols].apply(
-        lambda x: " ".join(x), axis=1
-    )
+
+def check_database_changes(
+    database: pd.DataFrame,
+    new_dataset: pd.DataFrame,
+    primary_key_cols: List[str],
+    to_delete: bool = False,
+) -> Dict[pd.DataFrame, List[str]]:
+    """
+    Check changes that need to be made, i.e. append/update/delete rows to the database
+    based on its comparison with new data
+
+    Args:
+        database (pd.DataFrame): Original Data
+        new_dataset (pd.DataFrame): New Data
+        primary_key_cols (list): Column(s) that make up the primary key
+        to_delete (bool, optional): Delete rows. Defaults to False
+    """
+    # get a list of column names of the original database
+    orig_database_cols = database.columns
+    # get the final column order
+    col_order = _get_col_order(orig_database_cols)
+    # reorder new_dataset
+    new_dataset = _reorder_new_dataset(orig_database_cols, new_dataset)
+    # set the primary_key name
+    primary_key = "UNIQUE_KEY"
+    # generate primary_key column for dataset comparison
+    ori_data = _generate_primary_key(database, primary_key_cols, primary_key)
+    new_data = _generate_primary_key(new_dataset, primary_key_cols, primary_key)
+    # output dictionary
+    changes = {"col_order": col_order, "allupdates": None, "to_delete_rows": None}
+    # get rows to be appened or updated
     allupdates = pd.DataFrame(columns=col_order)
-    to_append_rows = process_functions._append_rows(new_dataset, database, primary_key)
-    to_update_rows = process_functions._update_rows(new_dataset, database, primary_key)
+    to_append_rows = process_functions._append_rows(new_data, ori_data, primary_key)
+    to_update_rows = process_functions._update_rows(new_data, ori_data, primary_key)
+    allupdates = pd.concat([allupdates, to_append_rows, to_update_rows], sort=False)
+    changes["allupdates"] = allupdates
+    # get rows to be deleted
     if to_delete:
-        to_delete_rows = process_functions._delete_rows(
-            new_dataset, database, primary_key
-        )
+        to_delete_rows = process_functions._delete_rows(new_data, ori_data, primary_key)
     else:
         to_delete_rows = pd.DataFrame()
-    allupdates = pd.concat([allupdates, to_append_rows, to_update_rows], sort=False)
+    changes["to_delete_rows"] = to_delete_rows
+    return changes
```
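Worth noting for the pandas 2.0 theme of this PR: building `allupdates` with `pd.concat` (already the pattern here) is the only option going forward, since `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0. A toy sketch of the replacement pattern with made-up data:

```python
import pandas as pd

allupdates = pd.DataFrame(columns=["ROW_ID", "ROW_VERSION", "ID"])
to_append_rows = pd.DataFrame({"ROW_ID": [1], "ROW_VERSION": [0], "ID": ["a"]})
to_update_rows = pd.DataFrame({"ROW_ID": [2], "ROW_VERSION": [1], "ID": ["b"]})

# pandas < 2.0 allowed: allupdates.append(to_append_rows).append(to_update_rows)
# pandas >= 2.0: DataFrame.append is gone; concat a list of frames instead
allupdates = pd.concat([allupdates, to_append_rows, to_update_rows], sort=False)
print(len(allupdates))  # 2
```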
```diff
+
+
+def store_database(
+    syn: synapseclient.Synapse,
+    database_synid: str,
+    col_order: List[str],
+    allupdates: pd.DataFrame,
+    to_delete_rows: pd.DataFrame,
+):
```

Review comment: Nit: return type and inconsistent parameter naming (all_updates)

```diff
+    """
+    Store changes to the database
+
+    Args:
+        syn (synapseclient.Synaps): Synapse object
+        database_synid (str): Synapse Id of the Synapse table
+        col_order (List[str]): The ordered column names to be saved
+        allupdates (pd.DataFrame): rows to be appended and/or updated
+        to_deleted_rows (pd.DataFrame): rows to be deleted
+
+    Returns:
+        None
+    """
     storedatabase = False
     update_all_file = tempfile.NamedTemporaryFile(
         dir=process_functions.SCRIPT_DIR, delete=False
     )
 
     with open(update_all_file.name, "w") as updatefile:
         # Must write out the headers in case there are no appends or updates
         updatefile.write(",".join(col_order) + "\n")
```
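The unconditional header write above matters even when `allupdates` is empty; a minimal sketch of that edge case (temp-file handling simplified relative to the diff):

```python
import tempfile

col_order = ["ROW_ID", "ROW_VERSION", "CENTER"]  # toy column order

with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
    # Write headers unconditionally so the file carries the expected
    # schema even when there are no appends or updates this run.
    f.write(",".join(col_order) + "\n")
    temp_path = f.name
print(temp_path)
```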
```diff
@@ -5,7 +5,7 @@
 from io import StringIO
 import logging
 import os
-from typing import Optional
+from typing import Optional, Tuple
 
 import pandas as pd
 import synapseclient
```
```diff
@@ -392,7 +392,7 @@ def preprocess(self, newpath):
             "sample is True and inClinicalDb is True"
         )
         sample_cols = sample_cols_table.asDataFrame()["fieldName"].tolist()
-        clinicalTemplate = pd.DataFrame(columns=set(patient_cols + sample_cols))
+        clinicalTemplate = pd.DataFrame(columns=list(set(patient_cols + sample_cols)))
         sample = True
         patient = True
```
```diff
@@ -472,6 +472,68 @@ def process_steps(
         newClinicalDf.to_csv(newPath, sep="\t", index=False)
         return newPath
 
+    def _validate_oncotree_code_mapping(
```

Review comment: Nit: these don't use …

```diff
+        self: "Clinical", clinicaldf: pd.DataFrame, oncotree_mapping: pd.DataFrame
+    ) -> pd.Index:
+        """Checks that the oncotree codes in the input clinical
+        data is a valid oncotree code from the official oncotree site
+
+        Args:
+            clinicaldf (pd.DataFrame): clinical input data to validate
+            oncotree_mapping (pd.DataFrame): table of official oncotree
+                mappings
+
+        Returns:
+            pd.Index: row indices of unmapped oncotree codes in the
+                input clinical data
+        """
+        # Make oncotree codes uppercase (SpCC/SPCC)
+        clinicaldf["ONCOTREE_CODE"] = (
+            clinicaldf["ONCOTREE_CODE"].astype(str).str.upper()
+        )
+
+        unmapped_oncotrees = clinicaldf[
+            (clinicaldf["ONCOTREE_CODE"] != "UNKNOWN")
+            & ~(clinicaldf["ONCOTREE_CODE"].isin(oncotree_mapping["ONCOTREE_CODE"]))
+        ]
+        return unmapped_oncotrees.index
+
+    def _validate_oncotree_code_mapping_message(
```

Review comment: Same comment about …

```diff
+        self: "Clinical",
+        clinicaldf: pd.DataFrame,
+        unmapped_oncotree_indices: pd.DataFrame,
+    ) -> Tuple[str, str]:
+        """This function returns the error and warning messages
+        if the input clinical data has row indices with unmapped
+        oncotree codes
+
+        Args:
+            clinicaldf (pd.DataFrame): input clinical data
+            unmapped_oncotree_indices (pd.DataFrame): row indices of the
+                input clinical data with unmapped oncotree codes
+
+        Returns:
+            Tuple[str, str]: error message that tells you how many
+                samples AND the unique unmapped oncotree codes that your
+                input clinical data has
+        """
+        errors = ""
+        warnings = ""
+        if len(unmapped_oncotree_indices) > 0:
+            # sort the unique unmapped oncotree codes
+            unmapped_oncotree_codes = sorted(
+                set(clinicaldf.loc[unmapped_oncotree_indices]["ONCOTREE_CODE"])
+            )
+            errors = (
+                "Sample Clinical File: Please double check that all your "
+                "ONCOTREE CODES exist in the mapping. You have {} samples "
+                "that don't map. These are the codes that "
+                "don't map: {}\n".format(
+                    len(unmapped_oncotree_indices), ",".join(unmapped_oncotree_codes)
+                )
+            )
+        return errors, warnings
```
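A standalone illustration of the check `_validate_oncotree_code_mapping` performs, using made-up codes (only the `ONCOTREE_CODE` column matters here):

```python
import pandas as pd

clinicaldf = pd.DataFrame({"ONCOTREE_CODE": ["luad", "notacode", "UNKNOWN"]})
oncotree_mapping = pd.DataFrame({"ONCOTREE_CODE": ["LUAD", "BRCA"]})

# Uppercase first, then keep rows that are neither UNKNOWN nor present
# in the official mapping -- those are the unmapped codes.
clinicaldf["ONCOTREE_CODE"] = clinicaldf["ONCOTREE_CODE"].astype(str).str.upper()
unmapped = clinicaldf[
    (clinicaldf["ONCOTREE_CODE"] != "UNKNOWN")
    & ~clinicaldf["ONCOTREE_CODE"].isin(oncotree_mapping["ONCOTREE_CODE"])
]
print(unmapped.index.tolist())  # [1]
```

Returning row indices rather than the codes themselves lets the message helper slice the original frame with `.loc` and report both the sample count and the unique offending codes.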
```diff
@@ -641,28 +703,13 @@ def _validate(self, clinicaldf):
         maleOncoCodes = ["TESTIS", "PROSTATE", "PENIS"]
         womenOncoCodes = ["CERVIX", "VULVA", "UTERUS", "OVARY"]
         if haveColumn:
-            # Make oncotree codes uppercase (SpCC/SPCC)
-            clinicaldf["ONCOTREE_CODE"] = (
-                clinicaldf["ONCOTREE_CODE"].astype(str).str.upper()
+            unmapped_indices = self._validate_oncotree_code_mapping(
+                clinicaldf, oncotree_mapping
             )
-
-            oncotree_codes = clinicaldf["ONCOTREE_CODE"][
-                clinicaldf["ONCOTREE_CODE"] != "UNKNOWN"
-            ]
-
-            if not all(oncotree_codes.isin(oncotree_mapping["ONCOTREE_CODE"])):
-                unmapped_oncotrees = oncotree_codes[
-                    ~oncotree_codes.isin(oncotree_mapping["ONCOTREE_CODE"])
-                ]
-                total_error.write(
-                    "Sample Clinical File: Please double check that all your "
-                    "ONCOTREE CODES exist in the mapping. You have {} samples "
-                    "that don't map. These are the codes that "
-                    "don't map: {}\n".format(
-                        len(unmapped_oncotrees),
-                        ",".join(set(unmapped_oncotrees)),
-                    )
-                )
+            errors, warnings = self._validate_oncotree_code_mapping_message(
+                clinicaldf, unmapped_indices
+            )
+            total_error.write(errors)
         # Should add the SEX mismatch into the dashboard file
         if (
             process_functions.checkColExist(clinicaldf, "SEX")
```
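One behavioral note on this refactor: when every code maps, the message helper returns empty strings, and writing `""` to the error buffer is a no-op, so `_validate` output is unchanged. A tiny sketch:

```python
from io import StringIO

total_error = StringIO()
errors, warnings = "", ""  # what the helper returns when everything maps

# Writing an empty string leaves the buffer untouched, matching the old
# inline code that only wrote inside its `if not all(...)` branch.
total_error.write(errors)
print(repr(total_error.getvalue()))  # ''
```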
```diff
@@ -2,8 +2,8 @@
 chardet>=3.0.4
 # known working version 0.20.4
 httplib2>=0.11.3
-pandas>=1.0,<1.5.0
+pandas==2.0.0
 pyranges==0.0.115
 # known working version 6.0
 PyYAML>=5.1
-synapseclient>=2.7.0,<3.0.0
+synapseclient>=3.0.0,<4.0.0
```

Review comment: Is there a reason <4.0.0? Would it be worth it to bump it all the way up to the 4 series?
```diff
@@ -29,8 +29,8 @@ project_urls =
 [options]
 packages = find:
 install_requires =
-    synapseclient>=2.7.0, <3.0.0
-    pandas>=1.0,<1.5.0
+    synapseclient>=3.0.0, <4.0.0
```

Review comment: Same comment here about synapseclient, also either in this ticket or another ticket, bump the python versions we support.

```diff
+    pandas==2.0.0
     httplib2>=0.11.3
     PyYAML>=5.1
     chardet>=3.0.4
```
Review comment: Nit: Make sure arguments match