Skip to content

Commit

Permalink
expression update and bug fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
sigven committed Apr 4, 2024
1 parent 16737fe commit 69229a8
Show file tree
Hide file tree
Showing 50 changed files with 1,931 additions and 1,359 deletions.
1 change: 1 addition & 0 deletions pcgr/annoutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def read_infotag_file(vcf_info_tags_tsv, scope = "vep"):
for row in reader:
if not row['tag'] in info_tag_xref:
if scope in row['category']:
#print(scope + '\t' + str(row['category']) + '\t' + str(row['tag']))
info_tag_xref[row['tag']] = row

return info_tag_xref
Expand Down
201 changes: 146 additions & 55 deletions pcgr/arg_checker.py

Large diffs are not rendered by default.

26 changes: 22 additions & 4 deletions pcgr/cna.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from pcgr.annoutils import nuclear_chromosomes
from pcgr.utils import error_message, warn_message, check_file_exists, remove_file
from pcgr.biomarker import load_biomarkers
from pcgr.expression import integrate_variant_expression

def annotate_cna_segments(output_fname: str,
output_dir: str,
Expand All @@ -21,6 +22,7 @@ def annotate_cna_segments(output_fname: str,
sample_id: str,
n_copy_amplifications: int = 5,
overlap_fraction: float = 0.5,
expression_data: dict = None,
logger: logging.Logger = None) -> int:
"""
Annotate copy number aberrations in a given segment file.
Expand All @@ -31,9 +33,10 @@ def annotate_cna_segments(output_fname: str,
db_assembly_dir (str): Path to the build-specific PCGR database directory.
build (str): Genome assembly build of input data.
sample_id( (str): Sample identifier
output_dir (str): Directory to save the annotated file.
n_copy_amplifications (int, optional): Number of copies to consider as gains/amplifications. Defaults to 5.
expression_data (dict, optional): Expression data. Defaults to None.
overlap_fraction (float, optional): Fraction of overlap required for annotation. Defaults to 0.5.
logger (logging.Logger, optional): Logger. Defaults to None.
Returns:
int: 0 if successful.
"""
Expand Down Expand Up @@ -181,6 +184,14 @@ def annotate_cna_segments(output_fname: str,
cna_query_segment_df['N_MINOR'].astype(str), sep=":")

cna_query_segment_df['SAMPLE_ID'] = sample_id
if not expression_data is None:
cna_query_segment_df = integrate_variant_expression(cna_query_segment_df, expression_data, logger)
else:
logger.info("No expression data provided. Skipping CNA-expression integration")
cna_query_segment_df['TPM_GENE'] = '.'
cna_query_segment_df['TPM'] = '.'


cna_query_segment_df.to_csv(output_fname, sep="\t", header=True, index=False)

return 0
Expand Down Expand Up @@ -234,6 +245,9 @@ def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, db_assembly_dir
cytoband_last_annotations.columns = ['last_cytoband','last_arm','last_arm_length','last_focal_threshold']

cytoband_all = pd.concat([segments_cytoband, cytoband_first_annotations, cytoband_last_annotations], axis = 1)
#cytoband_all = pd.concat([segments_cytoband, cytoband_first_annotations], axis = 1)
#print(str(cytoband_all.head(3)))

cytoband_all['segment_start'] = cytoband_all['segment_start'].astype(int)
cytoband_all['segment_end'] = cytoband_all['segment_end'].astype(int)
cytoband_all.loc[:,'segment_length'] = cytoband_all['segment_end'] - cytoband_all['segment_start']
Expand All @@ -247,6 +261,7 @@ def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, db_assembly_dir
cytoband_all['segment_name'] = cytoband_all['segment_name'].str.cat(cytoband_all['first_arm'], sep = "|").str.cat(
cytoband_all['cytoband'],sep="|").str.cat(cytoband_all['event_type'], sep="|")
cytoband_annotated_segments = cytoband_all[['chromosome','segment_start','segment_end','segment_name']]
#print(str(cytoband_annotated_segments.head(3)))

## remove all temporary files
for fname in temp_files:
Expand All @@ -255,8 +270,10 @@ def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, db_assembly_dir
return cytoband_annotated_segments


def annotate_transcripts(cna_segments_bt: BedTool, output_dir: str,
db_assembly_dir: str, overlap_fraction: float,
def annotate_transcripts(cna_segments_bt: BedTool,
output_dir: str,
db_assembly_dir: str,
overlap_fraction: float,
logger: logging.Logger) -> pd.DataFrame:

"""
Expand Down Expand Up @@ -399,5 +416,6 @@ def is_valid_cna(cna_segment_file, logger):
err_msg = 'Detected wrongly formatted chromosomal segment - \'Start\' or \'End\' is less than or equal to zero (' + \
str(rec['Chromosome']) + ':' + str(rec['Start']) + '-' + str(rec['End']) + ')'
return error_message(err_msg, logger)
logger.info(f'Copy number segment file ({cna_segment_file}) adheres to the correct format')

logger.info(f"Copy number segment file ('{os.path.basename(cna_segment_file)}') adheres to the correct format")
return 0
70 changes: 48 additions & 22 deletions pcgr/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,16 @@ def create_config(arg_dict, workflow = "PCGR"):
'force_overwrite': int(arg_dict['force_overwrite'])
}
conf_options['sample_properties'] = {}


conf_options['molecular_data'] = {}
conf_options['molecular_data']['fname_mut_vcf'] = "None"
conf_options['molecular_data']['fname_mut_tsv'] = "None"

if workflow == 'PCGR':
conf_options['assay_properties'] = {}
conf_options['sample_properties']['purity'] = 'NA'
conf_options['sample_properties']['ploidy'] = 'NA'
conf_options['sample_properties']['site'] = str(pcgr_vars.tsites[arg_dict['tsite']])
conf_options['sample_properties']['site2'] = str(pcgr_vars.tsites[arg_dict['tsite']]).replace(" ","_").replace("/","@")
conf_options['sample_properties']['dp_control_detected'] = 0
conf_options['sample_properties']['vaf_control_detected'] = 0
conf_options['sample_properties']['dp_tumor_detected'] = 0
Expand Down Expand Up @@ -77,8 +80,18 @@ def create_config(arg_dict, workflow = "PCGR"):
'cna_overlap_pct': float(arg_dict['cna_overlap_pct']),
'n_copy_gain': int(arg_dict['n_copy_gain'])
}

conf_options['gene_expression'] = {}
conf_options['gene_expression']['similarity_analysis'] = int(arg_dict['expression_sim'])
conf_options['gene_expression']['similarity_db'] = {}
for db in arg_dict['expression_sim_db'].split(','):
conf_options['gene_expression']['similarity_db'][db] = 1
if db == 'tcga':
conf_options['gene_expression']['similarity_db']['tcga'] = {}
for cohort in pcgr_vars.TCGA_COHORTS:
conf_options['gene_expression']['similarity_db']['tcga'][cohort] = 1

conf_options['somatic_snv'] = {}

conf_options['somatic_snv']['allelic_support'] = {
'tumor_dp_min': int(arg_dict['tumor_dp_min']),
'control_dp_min': int(arg_dict['control_dp_min']),
Expand Down Expand Up @@ -121,6 +134,13 @@ def create_config(arg_dict, workflow = "PCGR"):
'include_artefact_signatures': int(arg_dict['include_artefact_signatures']),
'prevalence_reference_signatures': int(arg_dict['prevalence_reference_signatures'])
}


conf_options['molecular_data']['fname_cna_tsv'] = "None"
conf_options['molecular_data']['fname_expression_tsv'] = "None"
conf_options['molecular_data']['fname_tmb'] = "None"
for source in ['tcga','treehouse','depmap']:
conf_options['molecular_data']['fname_expression_sim_' + source] = "None"


if workflow == "CPSR":
Expand Down Expand Up @@ -162,32 +182,38 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log
conf_data['software'] = {}
conf_data['software']['pcgr_version'] = pcgr_vars.PCGR_VERSION
conf_data['software']['cpsr_version'] = pcgr_vars.PCGR_VERSION
conf_data['molecular_data'] = conf_options['molecular_data']

## add paths to annotated files (VCF/TSV, CNA, TMB)
conf_data['molecular_data'] = {}
conf_data['molecular_data']['fname_mut_vcf'] = conf_options['annotated_vcf']
conf_data['molecular_data']['fname_mut_tsv'] = conf_options['annotated_tsv']
## add paths to annotated files (VCF/TSV, CNA, TMB, EXPRESSION)
# conf_data['molecular_data'] = {}
# conf_data['molecular_data']['fname_mut_vcf'] = conf_options['annotated_vcf']
# conf_data['molecular_data']['fname_mut_tsv'] = conf_options['annotated_tsv']
# conf_data['molecular_data']['fname_cna_tsv'] = "None"
# conf_data['molecular_data']['fname_expression_tsv'] = "None"
# if workflow == "PCGR" and conf_options['annotated_cna'] != "None":
# conf_data['molecular_data']['fname_cna_tsv'] = conf_options['annotated_cna']
# del conf_options['annotated_cna']
# if workflow == "PCGR" and conf_options['annotated_exp'] != "None":
# conf_data['molecular_data']['fname_expression_tsv'] = conf_options['annotated_exp']
# del conf_options['annotated_exp']
# if workflow == "CPSR":
# del conf_data['molecular_data']['fname_cna_tsv']
# del conf_data['molecular_data']['fname_expression_tsv']

conf_data['molecular_data']['fname_cna_tsv'] = "None"
if workflow == "PCGR" and conf_options['annotated_cna'] != "None":
conf_data['molecular_data']['fname_cna_tsv'] = conf_options['annotated_cna']
del conf_options['annotated_cna']
if workflow == "CPSR":
del conf_data['molecular_data']['fname_cna_tsv']

conf_data['molecular_data']['fname_tmb'] = "None"
if workflow == "PCGR" and conf_options['fname_tmb'] != "None":
conf_data['molecular_data']['fname_tmb'] = conf_options['fname_tmb']
del conf_options['fname_tmb']
if workflow == "CPSR":
del conf_data['molecular_data']['fname_tmb']
# conf_data['molecular_data']['fname_tmb'] = "None"
# if workflow == "PCGR" and conf_options['fname_tmb'] != "None":
# conf_data['molecular_data']['fname_tmb'] = conf_options['fname_tmb']
# del conf_options['fname_tmb']
# if workflow == "CPSR":
# del conf_data['molecular_data']['fname_tmb']

genome_assembly = conf_options['genome_assembly']
del conf_options['sample_id']
del conf_options['genome_assembly']
del conf_options['output_dir']
del conf_options['annotated_vcf']
del conf_options['annotated_tsv']
del conf_options['molecular_data']
#del conf_options['annotated_vcf']
#del conf_options['annotated_tsv']
conf_data['conf'] = conf_options

conf_data['reference_data'] = {}
Expand Down
6 changes: 4 additions & 2 deletions pcgr/cpsr.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,8 +185,10 @@ def run_cpsr(conf_options, cpsr_paths):
logger.info(f"Genome assembly: {conf_options['genome_assembly']}")
print('----')

conf_options['annotated_tsv'] = output_pass_tsv_gz
conf_options['annotated_vcf'] = output_vcf
#conf_options['annotated_tsv'] = output_pass_tsv_gz
#conf_options['annotated_vcf'] = output_vcf
conf_options['molecular_data']['fname_mut_vcf'] = output_vcf
conf_options['molecular_data']['fname_mut_tsv'] = output_pass_tsv_gz
conf_options['output_dir'] = output_dir
conf_options['gene_panel']['custom_list_bed'] = "None"
if not input_customlist == 'None':
Expand Down
Loading

0 comments on commit 69229a8

Please sign in to comment.