expression update and bug fixes

sigven · Apr 4, 2024 · 69229a8 · 69229a8
1 parent 16737fe
commit 69229a8
Show file tree

Hide file tree

Showing 50 changed files with 1,931 additions and 1,359 deletions.
diff --git a/pcgr/annoutils.py b/pcgr/annoutils.py
@@ -36,6 +36,7 @@ def read_infotag_file(vcf_info_tags_tsv, scope = "vep"):
     for row in reader:
         if not row['tag'] in info_tag_xref:
             if scope in row['category']:
+                #print(scope + '\t' + str(row['category']) + '\t' + str(row['tag']))
                 info_tag_xref[row['tag']] = row
 
     return info_tag_xref

diff --git a/pcgr/arg_checker.py b/pcgr/arg_checker.py
diff --git a/pcgr/cna.py b/pcgr/cna.py
@@ -12,6 +12,7 @@
 from pcgr.annoutils import nuclear_chromosomes
 from pcgr.utils import error_message, warn_message, check_file_exists, remove_file
 from pcgr.biomarker import load_biomarkers
+from pcgr.expression import integrate_variant_expression
 
 def annotate_cna_segments(output_fname: str, 
                           output_dir: str, 
@@ -21,6 +22,7 @@ def annotate_cna_segments(output_fname: str,
                           sample_id: str,
                           n_copy_amplifications: int = 5,
                           overlap_fraction: float = 0.5,
+                          expression_data: dict = None,
                           logger: logging.Logger = None) -> int:
     """
     Annotate copy number aberrations in a given segment file.
@@ -31,9 +33,10 @@ def annotate_cna_segments(output_fname: str,
         db_assembly_dir (str): Path to the build-specific PCGR database directory.
         build (str): Genome assembly build of input data.
         sample_id( (str): Sample identifier
-        output_dir (str): Directory to save the annotated file.
         n_copy_amplifications (int, optional): Number of copies to consider as gains/amplifications. Defaults to 5.
+        expression_data (dict, optional): Expression data. Defaults to None.
         overlap_fraction (float, optional): Fraction of overlap required for annotation. Defaults to 0.5.
+        logger (logging.Logger, optional): Logger. Defaults to None.
     Returns:
         int: 0 if successful.
     """
@@ -181,6 +184,14 @@ def annotate_cna_segments(output_fname: str,
                 cna_query_segment_df['N_MINOR'].astype(str), sep=":")
 
     cna_query_segment_df['SAMPLE_ID'] = sample_id
+    if not expression_data is None:
+        cna_query_segment_df = integrate_variant_expression(cna_query_segment_df, expression_data, logger)
+    else:
+        logger.info("No expression data provided. Skipping CNA-expression integration")
+        cna_query_segment_df['TPM_GENE'] = '.'
+        cna_query_segment_df['TPM'] = '.'
+
+
     cna_query_segment_df.to_csv(output_fname, sep="\t", header=True, index=False)
 
     return 0
@@ -234,6 +245,9 @@ def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, db_assembly_dir
     cytoband_last_annotations.columns = ['last_cytoband','last_arm','last_arm_length','last_focal_threshold']
 
     cytoband_all = pd.concat([segments_cytoband, cytoband_first_annotations, cytoband_last_annotations], axis = 1)
+    #cytoband_all = pd.concat([segments_cytoband, cytoband_first_annotations], axis = 1)
+    #print(str(cytoband_all.head(3)))
+
     cytoband_all['segment_start'] = cytoband_all['segment_start'].astype(int)
     cytoband_all['segment_end'] = cytoband_all['segment_end'].astype(int)       
     cytoband_all.loc[:,'segment_length'] = cytoband_all['segment_end'] - cytoband_all['segment_start']
@@ -247,6 +261,7 @@ def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, db_assembly_dir
     cytoband_all['segment_name'] = cytoband_all['segment_name'].str.cat(cytoband_all['first_arm'], sep = "|").str.cat(
         cytoband_all['cytoband'],sep="|").str.cat(cytoband_all['event_type'], sep="|")    
     cytoband_annotated_segments = cytoband_all[['chromosome','segment_start','segment_end','segment_name']]
+    #print(str(cytoband_annotated_segments.head(3)))
 
     ## remove all temporary files
     for fname in temp_files:
@@ -255,8 +270,10 @@ def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, db_assembly_dir
     return cytoband_annotated_segments
 
 
-def annotate_transcripts(cna_segments_bt: BedTool, output_dir: str,
-                         db_assembly_dir: str, overlap_fraction: float,
+def annotate_transcripts(cna_segments_bt: BedTool, 
+                         output_dir: str,
+                         db_assembly_dir: str, 
+                         overlap_fraction: float,
                          logger: logging.Logger) -> pd.DataFrame:
 
     """
@@ -399,5 +416,6 @@ def is_valid_cna(cna_segment_file, logger):
             err_msg = 'Detected wrongly formatted chromosomal segment - \'Start\' or \'End\' is less than or equal to zero (' + \
                 str(rec['Chromosome']) + ':' + str(rec['Start']) + '-' + str(rec['End']) + ')'
             return error_message(err_msg, logger)
-    logger.info(f'Copy number segment file ({cna_segment_file}) adheres to the correct format')
+
+    logger.info(f"Copy number segment file ('{os.path.basename(cna_segment_file)}') adheres to the correct format")
     return 0
diff --git a/pcgr/config.py b/pcgr/config.py
@@ -42,13 +42,16 @@ def create_config(arg_dict, workflow = "PCGR"):
             'force_overwrite': int(arg_dict['force_overwrite'])
         }
         conf_options['sample_properties'] = {}
-
-
+        conf_options['molecular_data'] = {}
+        conf_options['molecular_data']['fname_mut_vcf'] = "None"
+        conf_options['molecular_data']['fname_mut_tsv'] = "None"
+
     if workflow == 'PCGR':
         conf_options['assay_properties'] = {}
         conf_options['sample_properties']['purity'] = 'NA'
         conf_options['sample_properties']['ploidy'] = 'NA'
         conf_options['sample_properties']['site'] = str(pcgr_vars.tsites[arg_dict['tsite']])
+        conf_options['sample_properties']['site2'] = str(pcgr_vars.tsites[arg_dict['tsite']]).replace(" ","_").replace("/","@")
         conf_options['sample_properties']['dp_control_detected'] = 0
         conf_options['sample_properties']['vaf_control_detected'] = 0
         conf_options['sample_properties']['dp_tumor_detected'] = 0
@@ -77,8 +80,18 @@ def create_config(arg_dict, workflow = "PCGR"):
             'cna_overlap_pct': float(arg_dict['cna_overlap_pct']),
             'n_copy_gain': int(arg_dict['n_copy_gain'])
         }
+
+        conf_options['gene_expression'] = {}
+        conf_options['gene_expression']['similarity_analysis'] = int(arg_dict['expression_sim'])
+        conf_options['gene_expression']['similarity_db'] = {}
+        for db in arg_dict['expression_sim_db'].split(','):
+            conf_options['gene_expression']['similarity_db'][db] = 1
+            if db == 'tcga':
+                conf_options['gene_expression']['similarity_db']['tcga'] = {}
+                for cohort in pcgr_vars.TCGA_COHORTS:
+                    conf_options['gene_expression']['similarity_db']['tcga'][cohort] = 1
+
         conf_options['somatic_snv'] = {}
-
         conf_options['somatic_snv']['allelic_support'] = {
             'tumor_dp_min': int(arg_dict['tumor_dp_min']),
             'control_dp_min': int(arg_dict['control_dp_min']),
@@ -121,6 +134,13 @@ def create_config(arg_dict, workflow = "PCGR"):
             'include_artefact_signatures': int(arg_dict['include_artefact_signatures']),
             'prevalence_reference_signatures': int(arg_dict['prevalence_reference_signatures'])
         }
+
+
+        conf_options['molecular_data']['fname_cna_tsv'] = "None"
+        conf_options['molecular_data']['fname_expression_tsv'] = "None"
+        conf_options['molecular_data']['fname_tmb'] = "None"
+        for source in ['tcga','treehouse','depmap']:
+            conf_options['molecular_data']['fname_expression_sim_' + source] = "None"
 
 
     if workflow == "CPSR":        
@@ -162,32 +182,38 @@ def populate_config_data(conf_options: dict, db_dir: str, workflow = "PCGR", log
     conf_data['software'] = {}
     conf_data['software']['pcgr_version'] = pcgr_vars.PCGR_VERSION
     conf_data['software']['cpsr_version'] = pcgr_vars.PCGR_VERSION
+    conf_data['molecular_data'] = conf_options['molecular_data']
 
-    ## add paths to annotated files (VCF/TSV, CNA, TMB)
-    conf_data['molecular_data'] = {}
-    conf_data['molecular_data']['fname_mut_vcf'] = conf_options['annotated_vcf']
-    conf_data['molecular_data']['fname_mut_tsv'] = conf_options['annotated_tsv']
+    ## add paths to annotated files (VCF/TSV, CNA, TMB, EXPRESSION)
+    # conf_data['molecular_data'] = {}
+    # conf_data['molecular_data']['fname_mut_vcf'] = conf_options['annotated_vcf']
+    # conf_data['molecular_data']['fname_mut_tsv'] = conf_options['annotated_tsv']
+    # conf_data['molecular_data']['fname_cna_tsv'] = "None"
+    # conf_data['molecular_data']['fname_expression_tsv'] = "None"
+    # if workflow == "PCGR" and conf_options['annotated_cna'] != "None":
+    #     conf_data['molecular_data']['fname_cna_tsv'] = conf_options['annotated_cna']
+    #     del conf_options['annotated_cna']
+    # if workflow == "PCGR" and conf_options['annotated_exp'] != "None":
+    #     conf_data['molecular_data']['fname_expression_tsv'] = conf_options['annotated_exp']
+    #     del conf_options['annotated_exp']
+    # if workflow == "CPSR":
+    #     del conf_data['molecular_data']['fname_cna_tsv']
+    #     del conf_data['molecular_data']['fname_expression_tsv']
 
-    conf_data['molecular_data']['fname_cna_tsv'] = "None"
-    if workflow == "PCGR" and conf_options['annotated_cna'] != "None":
-        conf_data['molecular_data']['fname_cna_tsv'] = conf_options['annotated_cna']
-        del conf_options['annotated_cna']
-    if workflow == "CPSR":
-        del conf_data['molecular_data']['fname_cna_tsv']
-
-    conf_data['molecular_data']['fname_tmb'] = "None"
-    if workflow == "PCGR" and conf_options['fname_tmb'] != "None":
-        conf_data['molecular_data']['fname_tmb'] = conf_options['fname_tmb']
-        del conf_options['fname_tmb']
-    if workflow == "CPSR":
-        del conf_data['molecular_data']['fname_tmb']
+    # conf_data['molecular_data']['fname_tmb'] = "None"
+    # if workflow == "PCGR" and conf_options['fname_tmb'] != "None":
+    #     conf_data['molecular_data']['fname_tmb'] = conf_options['fname_tmb']
+    #     del conf_options['fname_tmb']
+    # if workflow == "CPSR":
+    #     del conf_data['molecular_data']['fname_tmb']
 
     genome_assembly = conf_options['genome_assembly']
     del conf_options['sample_id']
     del conf_options['genome_assembly']
     del conf_options['output_dir']
-    del conf_options['annotated_vcf']
-    del conf_options['annotated_tsv']
+    del conf_options['molecular_data']
+    #del conf_options['annotated_vcf']
+    #del conf_options['annotated_tsv']
     conf_data['conf'] = conf_options
 
     conf_data['reference_data'] = {}

diff --git a/pcgr/cpsr.py b/pcgr/cpsr.py
@@ -185,8 +185,10 @@ def run_cpsr(conf_options, cpsr_paths):
         logger.info(f"Genome assembly: {conf_options['genome_assembly']}")
         print('----')
 
-        conf_options['annotated_tsv'] = output_pass_tsv_gz
-        conf_options['annotated_vcf'] = output_vcf
+        #conf_options['annotated_tsv'] = output_pass_tsv_gz
+        #conf_options['annotated_vcf'] = output_vcf
+        conf_options['molecular_data']['fname_mut_vcf'] = output_vcf
+        conf_options['molecular_data']['fname_mut_tsv'] = output_pass_tsv_gz
         conf_options['output_dir'] = output_dir
         conf_options['gene_panel']['custom_list_bed'] = "None"
         if not input_customlist == 'None':