Skip to content

Commit

Permalink
avoid floats in MAF int cols
Browse files Browse the repository at this point in the history
  • Loading branch information
sigven committed Dec 8, 2024
1 parent 04c6ab3 commit fbcf091
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 27 deletions.
45 changes: 21 additions & 24 deletions pcgr/maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@

from pcgr.utils import check_file_exists, remove_file

def update_maf_allelic_support(maf_tmp_fname: str,
maf_fname: str,
allelic_support_tags: dict,
logger = None,
update_allelic_support = False
):
def update_maf(maf_tmp_fname: str,
maf_fname: str,
allelic_support_tags: dict,
logger = None,
update_allelic_support = False,
debug = False):

"""
Update the MAF file produced by vcf2maf.pl with allelic support data (t_depth, t_ref_count, t_alt_count, etc.).
Expand All @@ -35,7 +35,7 @@ def update_maf_allelic_support(maf_tmp_fname: str,
header_line = f.readline().strip('\n')
f.close()

raw_maf_data = pd.read_csv(maf_tmp_fname, sep="\t", header=1, na_values=['.'], low_memory=False)
raw_maf_data = pd.read_csv(maf_tmp_fname, sep="\t", header=1, dtype='string',na_values=['.'], low_memory=False)
if update_allelic_support is False:
# write to file
b = 1
Expand All @@ -46,50 +46,47 @@ def update_maf_allelic_support(maf_tmp_fname: str,
if allelic_support_tags['tumor_dp_tag'] != "_NA_":
if {allelic_support_tags['tumor_dp_tag']}.issubset(raw_maf_data.columns):
if raw_maf_data[raw_maf_data[allelic_support_tags['tumor_dp_tag']].isna() == True].empty is True:
raw_maf_data = raw_maf_data.astype({allelic_support_tags['tumor_dp_tag']:'int'})
raw_maf_data.loc[:,"t_depth"] = raw_maf_data.loc[:,allelic_support_tags['tumor_dp_tag']]

if 'tumor_af_tag' in allelic_support_tags:
if allelic_support_tags['tumor_af_tag'] != "_NA_":
if {allelic_support_tags['tumor_af_tag']}.issubset(raw_maf_data.columns):

if {allelic_support_tags['tumor_af_tag']}.issubset(raw_maf_data.columns):
if raw_maf_data[raw_maf_data[allelic_support_tags['tumor_af_tag']].isna() == True].empty is True:
raw_maf_data['t_alt_count'] = None
raw_maf_data.loc[:,"t_alt_count"] = \
raw_maf_data.loc[:,allelic_support_tags['tumor_af_tag']] * \
raw_maf_data.loc[:,"t_depth"]
raw_maf_data.loc[:,allelic_support_tags['tumor_af_tag']].astype(float) * raw_maf_data.loc[:,"t_depth"].astype(int)

raw_maf_data.loc[:,"t_alt_count"] = raw_maf_data.loc[:,"t_alt_count"].round(0).astype(int)

raw_maf_data.loc[:,"t_alt_count"] = round(raw_maf_data.loc[:,"t_alt_count"].astype(float),0).astype(int)
raw_maf_data['t_ref_count'] = None
raw_maf_data.loc[:,"t_ref_count"] = \
raw_maf_data.loc[:,"t_depth"] - raw_maf_data.loc[:,"t_alt_count"]
raw_maf_data.loc[:,"t_depth"].astype(int) - raw_maf_data.loc[:,"t_alt_count"]

if 'control_dp_tag' in allelic_support_tags:
if allelic_support_tags['control_dp_tag'] != "_NA_":
if {allelic_support_tags['control_dp_tag']}.issubset(raw_maf_data.columns):
if raw_maf_data[raw_maf_data[allelic_support_tags['control_dp_tag']].isna() == True].empty is True:
raw_maf_data = raw_maf_data.astype({allelic_support_tags['control_dp_tag']:'int'})
raw_maf_data.loc[:,"n_depth"] = raw_maf_data.loc[:,allelic_support_tags['control_dp_tag']]

if 'control_af_tag' in allelic_support_tags:
if allelic_support_tags['control_af_tag'] != "_NA_":
if {allelic_support_tags['control_af_tag']}.issubset(raw_maf_data.columns):

if raw_maf_data[raw_maf_data[allelic_support_tags['control_af_tag']].isna() == True].empty is True:
raw_maf_data['n_alt_count'] = None
raw_maf_data.loc[:,"n_alt_count"] = \
raw_maf_data.loc[:,allelic_support_tags['control_af_tag']] * \
raw_maf_data.loc[:,"n_depth"]

raw_maf_data.loc[:,"n_alt_count"] = raw_maf_data.loc[:,"n_alt_count"].round(0).astype(int)
raw_maf_data.loc[:,allelic_support_tags['control_af_tag']].astype(float) * raw_maf_data.loc[:,"n_depth"].astype(int)

raw_maf_data.loc[:,"n_alt_count"] = round(raw_maf_data.loc[:,"n_alt_count"].astype(float),0).astype(int)
raw_maf_data['n_ref_count'] = None
raw_maf_data.loc[:,"n_ref_count"] = \
raw_maf_data.loc[:,"n_depth"] - raw_maf_data.loc[:,"n_alt_count"]
raw_maf_data.loc[:,"n_depth"].astype(int) - raw_maf_data.loc[:,"n_alt_count"]

raw_maf_data = raw_maf_data.fillna("")
#raw_maf_data = raw_maf_data.fillna("")
with open(maf_fname, 'w') as f:
f.write(f'{header_line}\n')
f.close()
raw_maf_data.to_csv(maf_fname, sep="\t", index=False, mode='a')
remove_file(maf_tmp_fname)
if not debug:
remove_file(maf_tmp_fname)



Expand Down
7 changes: 4 additions & 3 deletions pcgr/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pcgr import pcgr_vars, arg_checker, utils, cna
from pcgr.utils import getlogger, check_subprocess, remove_file, random_id_generator
from pcgr.config import populate_config_data, create_config
from pcgr.maf import update_maf_allelic_support
from pcgr.maf import update_maf
from pcgr.vep import get_vep_command
from pcgr.expression import parse_expression, integrate_variant_expression, correlate_sample_expression
from pcgr.expression import find_expression_outliers, aggregate_tpm_per_cons
Expand Down Expand Up @@ -382,12 +382,13 @@ def run_pcgr(input_data, output_data, conf_options):

## add information on allelic support in MAF file
## (n_depth, n_ref_count, n_alt_count, t_depth, t_ref_count, t_alt_count)
update_maf_allelic_support(
update_maf(
maf_tmp_fname = output_tmp_maf,
maf_fname = output_maf,
allelic_support_tags = yaml_data["conf"]['somatic_snv']['allelic_support'],
logger = logger,
update_allelic_support = update_allelic_support
update_allelic_support = update_allelic_support,
debug = debug
)
logger.info('Finished pcgr-vep-vcf2maf')
print('----')
Expand Down

0 comments on commit fbcf091

Please sign in to comment.