[GEN-178] Update to pandas 2.0 and upgraded synapseclient #559

Draft · wants to merge 6 commits into base: develop
153 changes: 125 additions & 28 deletions genie/load.py
@@ -172,50 +172,147 @@ def _update_table(
    to_delete: bool = False,
):
    """
    A helper function to compare a new dataset with existing data
    and store any changes that need to be made to the database
    """
    changes = check_database_changes(database, new_dataset, primary_key_cols, to_delete)
    store_database(
        syn,
        database_synid,
        changes["col_order"],
        changes["allupdates"],
        changes["to_delete_rows"],
    )


def _get_col_order(orig_database_cols: pd.Index) -> List[str]:
    """
    Get column order

    Args:
        orig_database_cols (pd.Index): A list of column names of the original database

    Returns:
        The list of re-ordered column names
    """
    # Columns must be in the same order, with ROW_ID and ROW_VERSION first
    col_order = ["ROW_ID", "ROW_VERSION"]
    col_order.extend(orig_database_cols.tolist())
    return col_order
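
For illustration, a quick sketch of the expected output (column names here are hypothetical):

    >>> _get_col_order(pd.Index(["PATIENT_ID", "SAMPLE_ID"]))
    ['ROW_ID', 'ROW_VERSION', 'PATIENT_ID', 'SAMPLE_ID']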


def _reorder_new_dataset(
    orig_database_cols: pd.Index, new_dataset: pd.DataFrame
) -> pd.DataFrame:
    """
    Reorder new dataset based on the original database

    Args:
        orig_database_cols (pd.Index): A list of column names of the original database
        new_dataset (pd.DataFrame): New Data

    Returns:
        The re-ordered new dataset
    """
    # Columns must be in the same order as the original data
    new_dataset = new_dataset[orig_database_cols]
    return new_dataset
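
A quick sketch with made-up columns:

    df = pd.DataFrame({"B": [2], "A": [1]})
    _reorder_new_dataset(pd.Index(["A", "B"]), df)  # columns are now ordered ['A', 'B']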


def _generate_primary_key(
    dataset: pd.DataFrame, primary_key_cols: List[str], primary_key: str
) -> pd.DataFrame:
    """
    Generate primary key column for a dataframe

    Args:
        dataset (pd.DataFrame): A dataframe
        primary_key_cols (list): Column(s) that make up the primary key
        primary_key (str): The column name of the primary key

    Returns:
        The dataframe with primary_key column added
    """
    # replace NAs with empty string
    dataset = dataset.fillna("")
    # generate the primary key column by joining the key columns as strings
    dataset[primary_key_cols] = dataset[primary_key_cols].applymap(str)
    if dataset.empty:
        dataset[primary_key] = ""
    else:
        dataset[primary_key] = dataset[primary_key_cols].apply(
            lambda x: " ".join(x), axis=1
        )
    return dataset

Member comment on lines +228 to +231: Nit: Make sure arguments match
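
A toy example of the key generation (values are illustrative):

    >>> df = pd.DataFrame({"PATIENT_ID": ["P1"], "SAMPLE_ID": ["S1"]})
    >>> _generate_primary_key(df, ["PATIENT_ID", "SAMPLE_ID"], "UNIQUE_KEY")["UNIQUE_KEY"].tolist()
    ['P1 S1']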


def check_database_changes(
    database: pd.DataFrame,
    new_dataset: pd.DataFrame,
    primary_key_cols: List[str],
    to_delete: bool = False,
) -> Dict[str, Union[pd.DataFrame, List[str]]]:
    """
    Check changes that need to be made, i.e. append/update/delete rows to the database
    based on its comparison with new data

    Args:
        database (pd.DataFrame): Original Data
        new_dataset (pd.DataFrame): New Data
        primary_key_cols (list): Column(s) that make up the primary key
        to_delete (bool, optional): Delete rows. Defaults to False

    Returns:
        A dictionary with the final column order, the rows to append/update,
        and the rows to delete
    """
    # get a list of column names of the original database
    orig_database_cols = database.columns
    # get the final column order
    col_order = _get_col_order(orig_database_cols)
    # reorder new_dataset
    new_dataset = _reorder_new_dataset(orig_database_cols, new_dataset)
    # set the primary_key name
    primary_key = "UNIQUE_KEY"
    # generate primary_key column for dataset comparison
    ori_data = _generate_primary_key(database, primary_key_cols, primary_key)
    new_data = _generate_primary_key(new_dataset, primary_key_cols, primary_key)
    # output dictionary
    changes = {"col_order": col_order, "allupdates": None, "to_delete_rows": None}
    # get rows to be appended or updated
    allupdates = pd.DataFrame(columns=col_order)
    to_append_rows = process_functions._append_rows(new_data, ori_data, primary_key)
    to_update_rows = process_functions._update_rows(new_data, ori_data, primary_key)
    allupdates = pd.concat([allupdates, to_append_rows, to_update_rows], sort=False)
    changes["allupdates"] = allupdates
    # get rows to be deleted
    if to_delete:
        to_delete_rows = process_functions._delete_rows(new_data, ori_data, primary_key)
    else:
        to_delete_rows = pd.DataFrame()
    changes["to_delete_rows"] = to_delete_rows
    return changes
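
A sketch of how the returned dictionary might be consumed (inputs here are assumed, not from the PR):

    changes = check_database_changes(database, new_dataset, ["SAMPLE_ID"], to_delete=True)
    changes["col_order"]       # ['ROW_ID', 'ROW_VERSION', ...original columns]
    changes["allupdates"]      # DataFrame of rows to append or update
    changes["to_delete_rows"]  # DataFrame of rows to delete (empty when to_delete=False)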


def store_database(
    syn: synapseclient.Synapse,
    database_synid: str,
    col_order: List[str],
    allupdates: pd.DataFrame,
    to_delete_rows: pd.DataFrame,
) -> None:

Member comment: Nit: return type and inconsistent parameter naming (all_updates)

    """
    Store changes to the database

    Args:
        syn (synapseclient.Synapse): Synapse object
        database_synid (str): Synapse Id of the Synapse table
        col_order (List[str]): The ordered column names to be saved
        allupdates (pd.DataFrame): rows to be appended and/or updated
        to_delete_rows (pd.DataFrame): rows to be deleted

    Returns:
        None
    """
    storedatabase = False
    update_all_file = tempfile.NamedTemporaryFile(
        dir=process_functions.SCRIPT_DIR, delete=False
    )

    with open(update_all_file.name, "w") as updatefile:
        # Must write out the headers in case there are no appends or updates
        updatefile.write(",".join(col_order) + "\n")
93 changes: 70 additions & 23 deletions genie_registry/clinical.py
@@ -5,7 +5,7 @@
from io import StringIO
import logging
import os
from typing import Optional, Tuple

import pandas as pd
import synapseclient
@@ -392,7 +392,7 @@ def preprocess(self, newpath):
            "sample is True and inClinicalDb is True"
        )
        sample_cols = sample_cols_table.asDataFrame()["fieldName"].tolist()
        clinicalTemplate = pd.DataFrame(columns=list(set(patient_cols + sample_cols)))
        sample = True
        patient = True
@@ -472,6 +472,68 @@ def process_steps(
        newClinicalDf.to_csv(newPath, sep="\t", index=False)
        return newPath

    def _validate_oncotree_code_mapping(
        self: "Clinical", clinicaldf: pd.DataFrame, oncotree_mapping: pd.DataFrame
    ) -> pd.Index:
        """Checks that the oncotree codes in the input clinical
        data are valid oncotree codes from the official oncotree site

        Args:
            clinicaldf (pd.DataFrame): clinical input data to validate
            oncotree_mapping (pd.DataFrame): table of official oncotree
                mappings

        Returns:
            pd.Index: row indices of unmapped oncotree codes in the
                input clinical data
        """
        # Make oncotree codes uppercase (SpCC/SPCC)
        clinicaldf["ONCOTREE_CODE"] = (
            clinicaldf["ONCOTREE_CODE"].astype(str).str.upper()
        )

        unmapped_oncotrees = clinicaldf[
            (clinicaldf["ONCOTREE_CODE"] != "UNKNOWN")
            & ~(clinicaldf["ONCOTREE_CODE"].isin(oncotree_mapping["ONCOTREE_CODE"]))
        ]
        return unmapped_oncotrees.index

Member comment: Nit: these don't use self so you might be able to add these as @classmethod
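
A toy illustration of the mapping check (dataframes are made up; `clinical` stands in for a Clinical instance):

    clin = pd.DataFrame({"ONCOTREE_CODE": ["luad", "FAKE", "UNKNOWN"]})
    mapping = pd.DataFrame({"ONCOTREE_CODE": ["LUAD"]})
    clinical._validate_oncotree_code_mapping(clin, mapping)
    # returns Index([1]): "luad" is uppercased and maps, "UNKNOWN" is skipped, only "FAKE" is unmapped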

    def _validate_oncotree_code_mapping_message(
        self: "Clinical",
        clinicaldf: pd.DataFrame,
        unmapped_oncotree_indices: pd.Index,
    ) -> Tuple[str, str]:
        """This function returns the error and warning messages
        if the input clinical data has row indices with unmapped
        oncotree codes

        Args:
            clinicaldf (pd.DataFrame): input clinical data
            unmapped_oncotree_indices (pd.Index): row indices of the
                input clinical data with unmapped oncotree codes

        Returns:
            Tuple[str, str]: error message that tells you how many
                samples AND the unique unmapped oncotree codes that your
                input clinical data has
        """
        errors = ""
        warnings = ""
        if len(unmapped_oncotree_indices) > 0:
            # sort the unique unmapped oncotree codes
            unmapped_oncotree_codes = sorted(
                set(clinicaldf.loc[unmapped_oncotree_indices]["ONCOTREE_CODE"])
            )
            errors = (
                "Sample Clinical File: Please double check that all your "
                "ONCOTREE CODES exist in the mapping. You have {} samples "
                "that don't map. These are the codes that "
                "don't map: {}\n".format(
                    len(unmapped_oncotree_indices), ",".join(unmapped_oncotree_codes)
                )
            )
        return errors, warnings

Member comment: Same comment about classmethod
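
Continuing the toy example above, the message helper would report the single unmapped code (abridged output):

    errors, warnings = clinical._validate_oncotree_code_mapping_message(clin, pd.Index([1]))
    # errors -> "Sample Clinical File: ... You have 1 samples that don't map. ... don't map: FAKE\n"
    # warnings -> ""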

    # VALIDATION
    def _validate(self, clinicaldf):
        """
@@ -641,28 +703,13 @@ def _validate(self, clinicaldf):
        maleOncoCodes = ["TESTIS", "PROSTATE", "PENIS"]
        womenOncoCodes = ["CERVIX", "VULVA", "UTERUS", "OVARY"]
        if haveColumn:
            unmapped_indices = self._validate_oncotree_code_mapping(
                clinicaldf, oncotree_mapping
            )
            errors, warnings = self._validate_oncotree_code_mapping_message(
                clinicaldf, unmapped_indices
            )
            total_error.write(errors)
        # Should add the SEX mismatch into the dashboard file
        if (
            process_functions.checkColExist(clinicaldf, "SEX")
4 changes: 2 additions & 2 deletions requirements.txt
@@ -2,8 +2,8 @@
chardet>=3.0.4
# known working version 0.20.4
httplib2>=0.11.3
pandas==2.0.0
pyranges==0.0.115
# known working version 6.0
PyYAML>=5.1
synapseclient>=3.0.0,<4.0.0

Member comment: Is there a reason <4.0.0? Would it be worth it to bump it all the way up to the 4 series?

4 changes: 2 additions & 2 deletions setup.cfg
@@ -29,8 +29,8 @@ project_urls =
[options]
packages = find:
install_requires =
    synapseclient>=3.0.0, <4.0.0

Member comment: Same comment here about synapseclient; also, either in this ticket or another ticket, bump the python versions we support.

    pandas==2.0.0
    httplib2>=0.11.3
    PyYAML>=5.1
    chardet>=3.0.4