From fbcf0918e5bf5cc864fac32c2d51f222e646bddb Mon Sep 17 00:00:00 2001 From: Sigve Nakken Date: Sun, 8 Dec 2024 14:29:52 +0100 Subject: [PATCH] avoid floats in MAF int cols --- pcgr/maf.py | 45 +++++++++++++++++++++------------------------ pcgr/main.py | 7 ++++--- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/pcgr/maf.py b/pcgr/maf.py index 320dbd76..04323109 100644 --- a/pcgr/maf.py +++ b/pcgr/maf.py @@ -7,12 +7,12 @@ from pcgr.utils import check_file_exists, remove_file -def update_maf_allelic_support(maf_tmp_fname: str, - maf_fname: str, - allelic_support_tags: dict, - logger = None, - update_allelic_support = False - ): +def update_maf(maf_tmp_fname: str, + maf_fname: str, + allelic_support_tags: dict, + logger = None, + update_allelic_support = False, + debug = False): """ Update MAF file from vcf2maf.pl with allelic support data (t_depth, t_ref_count, t_alt_count etc). @@ -35,7 +35,7 @@ def update_maf_allelic_support(maf_tmp_fname: str, header_line = f.readline().strip('\n') f.close() - raw_maf_data = pd.read_csv(maf_tmp_fname, sep="\t", header=1, na_values=['.'], low_memory=False) + raw_maf_data = pd.read_csv(maf_tmp_fname, sep="\t", header=1, dtype='string',na_values=['.'], low_memory=False) if update_allelic_support is False: # write to file b = 1 @@ -46,50 +46,47 @@ def update_maf_allelic_support(maf_tmp_fname: str, if allelic_support_tags['tumor_dp_tag'] != "_NA_": if {allelic_support_tags['tumor_dp_tag']}.issubset(raw_maf_data.columns): if raw_maf_data[raw_maf_data[allelic_support_tags['tumor_dp_tag']].isna() == True].empty is True: - raw_maf_data = raw_maf_data.astype({allelic_support_tags['tumor_dp_tag']:'int'}) raw_maf_data.loc[:,"t_depth"] = raw_maf_data.loc[:,allelic_support_tags['tumor_dp_tag']] if 'tumor_af_tag' in allelic_support_tags: if allelic_support_tags['tumor_af_tag'] != "_NA_": - if {allelic_support_tags['tumor_af_tag']}.issubset(raw_maf_data.columns): - + if {allelic_support_tags['tumor_af_tag']}.issubset(raw_maf_data.columns): if raw_maf_data[raw_maf_data[allelic_support_tags['tumor_af_tag']].isna() == True].empty is True: + raw_maf_data['t_alt_count'] = None raw_maf_data.loc[:,"t_alt_count"] = \ - raw_maf_data.loc[:,allelic_support_tags['tumor_af_tag']] * \ - raw_maf_data.loc[:,"t_depth"] + raw_maf_data.loc[:,allelic_support_tags['tumor_af_tag']].astype(float) * raw_maf_data.loc[:,"t_depth"].astype(int) - raw_maf_data.loc[:,"t_alt_count"] = raw_maf_data.loc[:,"t_alt_count"].round(0).astype(int) - + raw_maf_data.loc[:,"t_alt_count"] = round(raw_maf_data.loc[:,"t_alt_count"].astype(float),0).astype(int) + raw_maf_data['t_ref_count'] = None raw_maf_data.loc[:,"t_ref_count"] = \ - raw_maf_data.loc[:,"t_depth"] - raw_maf_data.loc[:,"t_alt_count"] + raw_maf_data.loc[:,"t_depth"].astype(int) - raw_maf_data.loc[:,"t_alt_count"] if 'control_dp_tag' in allelic_support_tags: if allelic_support_tags['control_dp_tag'] != "_NA_": if {allelic_support_tags['control_dp_tag']}.issubset(raw_maf_data.columns): if raw_maf_data[raw_maf_data[allelic_support_tags['control_dp_tag']].isna() == True].empty is True: - raw_maf_data = raw_maf_data.astype({allelic_support_tags['control_dp_tag']:'int'}) raw_maf_data.loc[:,"n_depth"] = raw_maf_data.loc[:,allelic_support_tags['control_dp_tag']] if 'control_af_tag' in allelic_support_tags: if allelic_support_tags['control_af_tag'] != "_NA_": if {allelic_support_tags['control_af_tag']}.issubset(raw_maf_data.columns): - if raw_maf_data[raw_maf_data[allelic_support_tags['control_af_tag']].isna() == True].empty is True: + raw_maf_data['n_alt_count'] = None raw_maf_data.loc[:,"n_alt_count"] = \ - raw_maf_data.loc[:,allelic_support_tags['control_af_tag']] * \ - raw_maf_data.loc[:,"n_depth"] - - raw_maf_data.loc[:,"n_alt_count"] = raw_maf_data.loc[:,"n_alt_count"].round(0).astype(int) + raw_maf_data.loc[:,allelic_support_tags['control_af_tag']].astype(float) * raw_maf_data.loc[:,"n_depth"].astype(int) + raw_maf_data.loc[:,"n_alt_count"] = round(raw_maf_data.loc[:,"n_alt_count"].astype(float),0).astype(int) + raw_maf_data['n_ref_count'] = None raw_maf_data.loc[:,"n_ref_count"] = \ - raw_maf_data.loc[:,"n_depth"] - raw_maf_data.loc[:,"n_alt_count"] + raw_maf_data.loc[:,"n_depth"].astype(int) - raw_maf_data.loc[:,"n_alt_count"] - raw_maf_data = raw_maf_data.fillna("") + #raw_maf_data = raw_maf_data.fillna("") with open(maf_fname, 'w') as f: f.write(f'{header_line}\n') f.close() raw_maf_data.to_csv(maf_fname, sep="\t", index=False, mode='a') - remove_file(maf_tmp_fname) + if not debug: + remove_file(maf_tmp_fname) diff --git a/pcgr/main.py b/pcgr/main.py index 168ba5b2..c7701331 100755 --- a/pcgr/main.py +++ b/pcgr/main.py @@ -3,7 +3,7 @@ from pcgr import pcgr_vars, arg_checker, utils, cna from pcgr.utils import getlogger, check_subprocess, remove_file, random_id_generator from pcgr.config import populate_config_data, create_config -from pcgr.maf import update_maf_allelic_support +from pcgr.maf import update_maf from pcgr.vep import get_vep_command from pcgr.expression import parse_expression, integrate_variant_expression, correlate_sample_expression from pcgr.expression import find_expression_outliers, aggregate_tpm_per_cons @@ -382,12 +382,13 @@ def run_pcgr(input_data, output_data, conf_options): ## add information on allelic support in MAF file ## (n_depth, n_ref_count, n_alt_count, t_depth, t_ref_count, t_alt_count) - update_maf_allelic_support( + update_maf( maf_tmp_fname = output_tmp_maf, maf_fname = output_maf, allelic_support_tags = yaml_data["conf"]['somatic_snv']['allelic_support'], logger = logger, - update_allelic_support = update_allelic_support + update_allelic_support = update_allelic_support, + debug = debug ) logger.info('Finished pcgr-vep-vcf2maf') print('----')