diff --git a/README.md b/README.md index 3db33068..7220e02a 100755 --- a/README.md +++ b/README.md @@ -24,24 +24,29 @@ Example screenshots from the [quarto](https://quarto.org)-based cancer genome re ![PCGR screenshot 2](pcgrr/pkgdown/assets/img/sc1.png) ![PCGR screenshot 3](pcgrr/pkgdown/assets/img/sc3.png) -PCGR originates from the [Norwegian Cancer Genomics Consortium (NCGC)](http://cancergenomics.no), at the [Institute for Cancer Research, Oslo University Hospital, Norway](http://radium.no). +PCGR originates from the [Norwegian Cancer Genomics Consortium (NCGC)](https://cancergenomics.no), at the [Institute for Cancer Research, Oslo University Hospital, Norway](https://radium.no). ### Top News +- *September 29th 2024*: **2.1.0 release** + - updated bundle, more oncogenic variants, CNA visualization, + improved RNA-seq support, bug fixes, and more + - [CHANGELOG](https://sigven.github.io/pcgr/articles/CHANGELOG.html) + - *August 1st 2024*: **2.0.3 release** - patch to fix purity/ploidy propagation, MAF output for tumor-only runs, and other minor issues - - [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html) + - [CHANGELOG](https://sigven.github.io/pcgr/articles/CHANGELOG.html) - *July 16th 2024*: **2.0.2 release** - patch to ensure correct reference to actionability guidelines - - [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html) + - [CHANGELOG](https://sigven.github.io/pcgr/articles/CHANGELOG.html) - *July 7th 2024*: **2.0.1 release** - patch with bug fix for mitochondrial input variants ([pr245](https://github.com/sigven/pcgr/pull/245)) - - [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html) + - [CHANGELOG](https://sigven.github.io/pcgr/articles/CHANGELOG.html) - *June 2024*: **2.0.0 release** - - Details in [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html) + - Details in [CHANGELOG](https://sigven.github.io/pcgr/articles/CHANGELOG.html) - Massive reference data bundle upgrade, new report layout, 
oncogenicity classification++ - Support for Singularity/Apptainer - Major data/software updates: @@ -52,19 +57,9 @@ PCGR originates from the [Norwegian Cancer Genomics Consortium (NCGC)](http://ca - CancerMine `v50` (2023-03) - UniProt KB `v2024_03` -- *February 2023*: **1.3.0 release** - - Details in [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html) - - prioritize protein-coding BIOTYPE csq ([pr201](https://github.com/sigven/pcgr/pull/201)) - - expose `--pcgrr_conda` option to flexibly activate pcgrr env via a non-default pcgrr name - - `cpsr_validate_input.py`: refactor for efficient custom gene egrep - -- *November 2022*: **1.2.0 release** - - Keep only autosomal, X, Y, M/MT chromosomes - - Import bcftools as dependency - ### Example reports -[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.12752833.svg)](https://doi.org/10.5281/zenodo.12752833) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.13855988.svg)](https://doi.org/10.5281/zenodo.13855988) ### Why use PCGR? @@ -94,10 +89,12 @@ PCGR integrates a [comprehensive set of knowledge resources](https://sigven.gith ### Citation -If you use PCGR, please cite our publication: +If you use PCGR or CPSR, please cite our publications: Sigve Nakken, Ghislain Fournous, Daniel Vodák, Lars Birger Aaasheim, Ola Myklebost, and Eivind Hovig. **Personal Cancer Genome Reporter: variant interpretation report for precision oncology** (2017). *Bioinformatics*. 34(10):1778--1780. [doi.org/10.1093/bioinformatics/btx817](https://doi.org/10.1093/bioinformatics/btx817) +Sigve Nakken, Vladislav Saveliev, Oliver Hofmann, Pål Møller, Ola Myklebost, and Eivind Hovig. **Cancer Predisposition Sequencing Reporter (CPSR): a flexible variant report engine for high-throughput germline screening in cancer** (2021). *Int J Cancer*. 
[doi:10.1002/ijc.33749](https://doi.org/10.1002/ijc.33749) + ## Contact sigven AT ifi.uio.no diff --git a/pcgr/annoutils.py b/pcgr/annoutils.py index acf8126b..15d03b5c 100755 --- a/pcgr/annoutils.py +++ b/pcgr/annoutils.py @@ -10,6 +10,7 @@ from cyvcf2 import VCF, Writer from pcgr import utils, pcgr_vars from pcgr.utils import check_subprocess, error_message +from pcgr.variant import reverse_complement_dna csv.field_size_limit(500 * 1024 * 1024) threeLettertoOneLetterAA = {'Ala': 'A', 'Arg': 'R', 'Asn': 'N', 'Asp': 'D', 'Cys': 'C', 'Glu': 'E', 'Gln': 'Q', 'Gly': 'G', 'His': 'H', @@ -211,21 +212,30 @@ def assign_cds_exon_intron_annotations(csq_record, logger): csq_record['CDS_CHANGE'] = '.' csq_record['HGVSp_short'] = '.' csq_record['PROTEIN_CHANGE'] = '.' + csq_record['ALTERATION'] = '.' csq_record['EXON_AFFECTED'] = '.' csq_record['CDS_RELATIVE_POSITION'] = '.' csq_record['LOSS_OF_FUNCTION'] = False csq_record['LOF_FILTER'] = '.' splice_variant = False - + #print(csq_record.keys()) if re.search(pcgr_vars.CSQ_SPLICE_REGION_PATTERN, str(csq_record['Consequence'])) is not None: splice_variant = True if re.search(pcgr_vars.CSQ_CODING_PATTERN, str(csq_record['Consequence'])) is not None: csq_record['CODING_STATUS'] = 'coding' + + if re.search(pcgr_vars.CSQ_CODING_PATTERN2, str(csq_record['Consequence'])) is not None and \ + (csq_record['IMPACT'] == 'HIGH' or csq_record['IMPACT'] == 'MODERATE'): + csq_record['CODING_STATUS'] = 'coding' if re.search(pcgr_vars.CSQ_CODING_SILENT_PATTERN, str(csq_record['Consequence'])) is not None: csq_record['EXONIC_STATUS'] = 'exonic' + + if re.search(pcgr_vars.CSQ_CODING_SILENT_PATTERN2, str(csq_record['Consequence'])) is not None and \ + csq_record['IMPACT'] != 'MODIFIER': + csq_record['EXONIC_STATUS'] = 'exonic' if re.search(pcgr_vars.CSQ_LOF_PATTERN, str(csq_record['Consequence'])) is not None: csq_record['LOSS_OF_FUNCTION'] = True @@ -257,73 +267,119 @@ def assign_cds_exon_intron_annotations(csq_record, 
logger): csq_record['EXON_POSITION'] = int(exon_pos_info[1]) ## filter putative LOF variants if they occur too close to the CDS end (less than 5% of the CDS length remains after the variant) - if not csq_record['CDS_position'] is None and csq_record['LOSS_OF_FUNCTION'] is True and splice_variant is False: - if csq_record['CDS_position'] != '.': - if '/' in csq_record['CDS_position']: - cds_length = str(csq_record['CDS_position']).split('/')[1] - if cds_length.isdigit(): - cds_length = int(cds_length) - else: - cds_length = -1 + if 'CDS_position' in csq_record.keys(): + if not csq_record['CDS_position'] is None and csq_record['LOSS_OF_FUNCTION'] is True and splice_variant is False: + if csq_record['CDS_position'] != '.': + if '/' in csq_record['CDS_position']: + cds_length = str(csq_record['CDS_position']).split('/')[1] + if cds_length.isdigit(): + cds_length = int(cds_length) + else: + cds_length = -1 + + cds_pos = -1 + cds_pos_full = str(csq_record['CDS_position']).split('/')[0] + + ## Frameshift variants are listed with a range (separated by '-'), choose start position + if '-' in cds_pos_full and not '?' 
in cds_pos_full: + cds_pos = cds_pos_full.split('-')[0] + if cds_pos.isdigit(): + cds_pos = int(cds_pos) + #else: + # logger.warning(f'Could not determine variant CDS position from VEP annotation - ({csq_record["CDS_position"]})') + else: + if cds_pos_full.isdigit(): + cds_pos = int(cds_pos_full) + #else: + # logger.warning(f'Could not determine variant CDS position from VEP annotation - ({csq_record["CDS_position"]})') + + if int(cds_pos) > -1 and int(cds_pos) <= int(cds_length): + csq_record['CDS_RELATIVE_POSITION'] = float(cds_pos/cds_length) + + ## conservative filter: if putative loss-of-function variant is in the last 5% of the CDS, + ## it is considered a non-LoF variant + if csq_record['CDS_RELATIVE_POSITION'] >= 0.95: + csq_record['LOSS_OF_FUNCTION'] = False + csq_record['LOF_FILTER'] = "END_TRUNCATION" + + if 'HGVSc' in csq_record.keys() and 'Consequence' in csq_record.keys(): + if not csq_record['HGVSc'] is None: + if csq_record['HGVSc'] != '.': - cds_pos = -1 - cds_pos_full = str(csq_record['CDS_position']).split('/')[0] + if len(str(csq_record['HGVSc']).split(':')) == 2: + csq_record['ALTERATION'] = str(csq_record['HGVSc'].split(':')[1]) - ## Frameshift variants are listed with a range (separated by '-'), choose start position - if '-' in cds_pos_full and not '?' in cds_pos_full: - cds_pos = cds_pos_full.split('-')[0] - if cds_pos.isdigit(): - cds_pos = int(cds_pos) - #else: - # logger.warning(f'Could not determine variant CDS position from VEP annotation - ({csq_record["CDS_position"]})') - else: - if cds_pos_full.isdigit(): - cds_pos = int(cds_pos_full) - #else: - # logger.warning(f'Could not determine variant CDS position from VEP annotation - ({csq_record["CDS_position"]})') + ## Use RefSeq transcript ID (MANE SELECT) for HGVSc if available + csq_record['HGVSc_RefSeq'] = '.:.' 
+ if not csq_record['MANE_SELECT'] is None: + if ":" in csq_record['HGVSc']: + hgvsc_data = csq_record['HGVSc'].split(':') + if len(hgvsc_data) == 2: + csq_record['HGVSc_RefSeq'] = str(csq_record['MANE_SELECT']) + ':' + str(hgvsc_data[1]) - if int(cds_pos) > -1 and int(cds_pos) <= int(cds_length): - csq_record['CDS_RELATIVE_POSITION'] = float(cds_pos/cds_length) + ## GRCh37 - MANE_SELECT not provided by VEP for GRCh37, so use MANE_SELECT2 (customly provided through geneOncoX) + else: + if 'MANE_SELECT2' in csq_record.keys(): + if not csq_record['MANE_SELECT2'] is None: + if ":" in csq_record['HGVSc']: + hgvsc_data = csq_record['HGVSc'].split(':') + if len(hgvsc_data) == 2: + csq_record['HGVSc_RefSeq'] = str(csq_record['MANE_SELECT2']) + ':' + str(hgvsc_data[1]) + else: + if 'REFSEQ_SELECT' in csq_record.keys(): + if not csq_record['REFSEQ_SELECT'] is None: + csq_record['REFSEQ_SELECT'] = str(csq_record['REFSEQ_SELECT'].split('&')[0]) + if ":" in csq_record['HGVSc']: + hgvsc_data = csq_record['HGVSc'].split(':') + if len(hgvsc_data) == 2: + csq_record['HGVSc_RefSeq'] = str(csq_record['REFSEQ_SELECT']) + ':' + str(hgvsc_data[1]) + else: + if 'REFSEQ_TRANSCRIPT_ID' in csq_record.keys(): + if not csq_record['REFSEQ_TRANSCRIPT_ID'] is None: + csq_record['REFSEQ_TRANSCRIPT_ID'] = str(csq_record['REFSEQ_TRANSCRIPT_ID'].split('&')[0]) + if ":" in csq_record['HGVSc']: + hgvsc_data = csq_record['HGVSc'].split(':') + if len(hgvsc_data) == 2: + csq_record['HGVSc_RefSeq'] = str(csq_record['REFSEQ_TRANSCRIPT_ID']) + ':' + str(hgvsc_data[1]) + + if 'splice_acceptor_variant' in csq_record['Consequence'] or 'splice_donor_variant' in csq_record['Consequence'] \ + or 'splice_donor_5th_base_variant' in csq_record['Consequence'] or 'splice_region_variant' in csq_record['Consequence'] \ + or 'splice_polypyrimidine_tract_variant' in csq_record['Consequence']: + key = str(csq_record['Consequence']) + \ + ':' + str(csq_record['HGVSc']) + csq_record['CDS_CHANGE'] = key - ## conservative 
filter: if putative loss-of-function variant is in the last 5% of the CDS, - ## it is considered a non-LoF variant - if csq_record['CDS_RELATIVE_POSITION'] >= 0.95: + ## GC to GT donor splice site variants are not considered loss-of-function + if 'splice_donor_variant' in str(csq_record['Consequence']) and csq_record['HGVSc'].endswith('+2C>T'): + csq_record['LOF_FILTER'] = "GC_TO_GT_DONOR" csq_record['LOSS_OF_FUNCTION'] = False - csq_record['LOF_FILTER'] = "END_TRUNCATION" - - if not csq_record['HGVSc'] is None: - if csq_record['HGVSc'] != '.': - if 'splice_acceptor_variant' in csq_record['Consequence'] or 'splice_donor_variant' in csq_record['Consequence'] \ - or 'splice_donor_5th_base_variant' in csq_record['Consequence'] or 'splice_region_variant' in csq_record['Consequence'] \ - or 'splice_polypyrimidine_tract_variant' in csq_record['Consequence']: - key = str(csq_record['Consequence']) + \ - ':' + str(csq_record['HGVSc']) - csq_record['CDS_CHANGE'] = key - - ## GC to GT donor splice site variants are not considered loss-of-function - if 'splice_donor_variant' in str(csq_record['Consequence']) and csq_record['HGVSc'].endswith('+2C>T'): - csq_record['LOF_FILTER'] = "GC_TO_GT_DONOR" - csq_record['LOSS_OF_FUNCTION'] = False - - if csq_record['Amino_acids'] is None or csq_record['Protein_position'] is None or csq_record['Consequence'] is None: - return(csq_record) - - if not csq_record['Protein_position'] is None: - if csq_record['Protein_position'].startswith('-'): - return(csq_record) protein_change = '.' 
- if '/' in csq_record['Protein_position']: - protein_position = str(csq_record['Protein_position'].split('/')[0]) - if '-' in protein_position: - if protein_position.split('-')[0].isdigit(): - csq_record['AMINO_ACID_START'] = protein_position.split('-')[0] - if protein_position.split('-')[1].isdigit(): - csq_record['AMINO_ACID_END'] = protein_position.split('-')[1] - else: - if protein_position.isdigit(): - csq_record['AMINO_ACID_START'] = protein_position - csq_record['AMINO_ACID_END'] = protein_position + + if 'Protein_position' in csq_record.keys(): + if not csq_record['Protein_position'] is None: + if not csq_record['Protein_position'].startswith('-') and csq_record['Protein_position'] != '.': + if '/' in csq_record['Protein_position']: + protein_position = str(csq_record['Protein_position'].split('/')[0]) + if '-' in protein_position: + if protein_position.split('-')[0].isdigit(): + csq_record['AMINO_ACID_START'] = protein_position.split('-')[0] + if protein_position.split('-')[1].isdigit(): + csq_record['AMINO_ACID_END'] = protein_position.split('-')[1] + else: + if protein_position.isdigit(): + csq_record['AMINO_ACID_START'] = protein_position + csq_record['AMINO_ACID_END'] = protein_position + + if 'synonymous_variant' in csq_record['Consequence'] and 'Amino_acids' in csq_record.keys(): + if not csq_record['Amino_acids'] is None: + protein_change = 'p.' 
+ \ + str(csq_record['Amino_acids']) + \ + str(protein_position) + str(csq_record['Amino_acids']) + if 'stop_lost' in str(csq_record['Consequence']) and '/' in str(csq_record['Amino_acids']): + protein_change = 'p.X' + \ + str(protein_position) + \ + str(csq_record['Amino_acids']).split('/')[1] if not csq_record['HGVSp'] is None: if csq_record['HGVSp'] != '.': @@ -333,15 +389,26 @@ def assign_cds_exon_intron_annotations(csq_record, logger): protein_change_VEP = str(csq_record['HGVSp'].split(':')[1]) protein_change = threeToOneAA(protein_change_VEP) csq_record['PROTEIN_CHANGE'] = protein_change_VEP - - if 'synonymous_variant' in csq_record['Consequence']: - protein_change = 'p.' + \ - str(csq_record['Amino_acids']) + \ - str(protein_position) + str(csq_record['Amino_acids']) - if 'stop_lost' in str(csq_record['Consequence']) and '/' in str(csq_record['Amino_acids']): - protein_change = 'p.X' + \ - str(protein_position) + \ - str(csq_record['Amino_acids']).split('/')[1] + csq_record['ALTERATION'] = protein_change_VEP + + if 'Consequence' in csq_record.keys(): + if 'upstream_gene_variant' in csq_record['Consequence'] and \ + 'CDS_START' in csq_record.keys() and \ + 'VARKEY' in csq_record.keys() and \ + 'STRAND' in csq_record.keys() and \ + 'ENSEMBL_TRANSCRIPT_ID' in csq_record.keys() and \ + 'MANE_SELECT' in csq_record.keys(): + varkey_info = str(csq_record['VARKEY']).split('_') + if not csq_record['CDS_START'] is None: + if len(varkey_info) == 4: + csq_record['CDS_DISTANCE'] = abs(int(csq_record['CDS_START']) - int(csq_record['VARKEY'].split('_')[1])) + cds_dna_alteration = str(varkey_info[2]) + '>' + str(varkey_info[3]) + if csq_record['STRAND'] == '-1': + cds_dna_alteration = reverse_complement_dna(str(varkey_info[2])) + '>' + reverse_complement_dna(str(varkey_info[3])) + csq_record['ALTERATION'] = str('c.-' + str(csq_record['CDS_DISTANCE']) + str(cds_dna_alteration)) + csq_record['HGVSc'] = csq_record['ENSEMBL_TRANSCRIPT_ID'] + ':' + csq_record['ALTERATION'] + if 
not csq_record['MANE_SELECT'] is None: + csq_record['HGVSc_RefSeq'] = str(csq_record['MANE_SELECT']) + ':' + str(csq_record['ALTERATION']) csq_record['HGVSp_short'] = protein_change exon_number = 'NA' @@ -356,20 +423,25 @@ def assign_cds_exon_intron_annotations(csq_record, logger): if exon_number == num_exons: csq_record['LAST_EXON'] = True - if not csq_record['INTRON'] is None: - if csq_record['INTRON'] != '.': - if '/' in csq_record['INTRON']: - intron_number = str(csq_record['INTRON']).split('/')[0] - num_introns = str(csq_record['INTRON']).split('/')[1] - if intron_number == num_introns: - csq_record['LAST_INTRON'] = True + if 'INTRON' in csq_record.keys(): + if not csq_record['INTRON'] is None: + if csq_record['INTRON'] != '.': + if '/' in csq_record['INTRON']: + intron_number = str(csq_record['INTRON']).split('/')[0] + num_introns = str(csq_record['INTRON']).split('/')[1] + if intron_number == num_introns: + csq_record['LAST_INTRON'] = True if not csq_record['HGVSc'] is None: if csq_record['HGVSc'] != '.': if protein_change != '.': - key = str(csq_record['Consequence']) + ':' + str(csq_record['HGVSc']) + ':exon' + str(exon_number) + ':' + str(protein_change) + key = str(csq_record['Consequence']) + ':' + \ + str(csq_record['HGVSc']) + \ + ':exon' + str(exon_number) + \ + ':' + str(protein_change) csq_record['CDS_CHANGE'] = key + return(csq_record) diff --git a/pcgr/arg_checker.py b/pcgr/arg_checker.py index f2dc325b..8b87d92b 100644 --- a/pcgr/arg_checker.py +++ b/pcgr/arg_checker.py @@ -181,8 +181,8 @@ def verify_args(arg_dict): warn_message(warn_msg, logger) # Check that threshold for gains/amplifications are properly set, and that segment overlap with transcripts are set appropriately - if arg_dict['n_copy_gain'] <= 0: - err_msg = f"Totaly copy number threshold for gains/amplifications ('--n_copy_gain' = {arg_dict['n_copy_gain']}) should be > 0" + if arg_dict['n_copy_gain'] <= 2: + err_msg = f"Total copy number threshold for gains/amplifications 
('--n_copy_gain' = {arg_dict['n_copy_gain']}) should be > 2" error_message(err_msg, logger) if arg_dict['cna_overlap_pct'] > 100 or arg_dict['cna_overlap_pct'] <= 0: err_msg = f"Minimum percent overlap between copy number segment and gene transcript ('--cna_overlap_pct' = {arg_dict['cna_overlap_pct']}) must be within (0, 100]" @@ -220,15 +220,11 @@ def define_output_files(arg_dict, cpsr = False): output_data['yaml']= f"{output_prefix}.conf.yaml" if not cpsr: - output_data['cna'] = f"{output_prefix}.cna_segments.tsv.gz" - output_data['expression'] = f"{output_prefix}.expression.tsv.gz" - output_data['csq_expression'] = f"{output_prefix}.csq_expression.tsv.gz" - output_data['expression_outliers'] = f"{output_prefix}.expression_outliers.tsv.gz" - output_data['expression_similarity'] = f"{output_prefix}.expression_similarity.tsv.gz" - output_data['snv_indel_ann'] = f"{output_prefix}.snv_indel_ann.tsv.gz" + for otype in ['cna_gene','cna_segment','expression','expression_outliers', + 'expression_similarity','snv_indel_ann','msigs']: + output_data[otype] = f"{output_prefix}.{otype}.tsv.gz" output_data['maf'] = f"{output_prefix}.maf" output_data['tmb'] = f"{output_prefix}.tmb.tsv" - output_data['msigs'] = f"{output_prefix}.msigs.tsv.gz" else: output_data['classification'] = f"{output_prefix}.classification.tsv.gz" @@ -240,7 +236,7 @@ def define_output_files(arg_dict, cpsr = False): error_message(err_msg, logger) if not cpsr: - for otype in ['cna', 'expression', 'expression_outliers', 'snv_indel_ann', + for otype in ['cna_gene', 'cna_segment','expression', 'expression_outliers', 'snv_indel_ann', 'expression_similarity','maf','tmb','msigs']: # if annotated output cna segments exist and overwrite not set if os.path.exists(output_data[otype]) and arg_dict["force_overwrite"] is False: @@ -267,6 +263,7 @@ def verify_input_files(arg_dict): input_cna_dir = 'NA' input_rna_fusion_dir = 'NA' input_germline_dir = 'NA' + input_germline_yaml_dir = 'NA' input_rna_expression_dir = 'NA' 
pon_vcf_dir = 'NA' db_dir = 'NA' @@ -277,6 +274,7 @@ def verify_input_files(arg_dict): input_rna_fusion_basename = 'NA' input_rna_expression_basename = 'NA' input_germline_basename = 'NA' + input_germline_yaml_basename = 'NA' arg_dict['rna_fusion_tumor'] = None # create output folder (if not already exists) @@ -366,19 +364,30 @@ def verify_input_files(arg_dict): os.path.abspath(arg_dict["input_rna_exp"])) # check if input germline calls (CPSR) exist - #if not arg_dict["input_germline"] is None: - # if not os.path.exists(os.path.abspath(arg_dict["input_germline"])): - # err_msg = "Input file (" + \ - # str(arg_dict["input_germline"]) + ") does not exist" - # error_message(err_msg, logger) - # if not (os.path.abspath(arg_dict["input_germline"]).endswith(".tsv.gz")): - # err_msg = "File with CPSR-classified germline calls (" + os.path.abspath( - # arg_dict["input_germline"]) + ") does not have the correct file extension (.json.gz)" - # error_message(err_msg, logger) - # input_germline_basename = os.path.basename( - # str(arg_dict["input_germline"])) - # input_germline_dir = os.path.dirname( - # os.path.abspath(arg_dict["input_germline"])) + if not arg_dict["input_cpsr"] is None: + if not os.path.exists(os.path.abspath(arg_dict["input_cpsr"])): + err_msg = "Input file (" + \ + str(arg_dict["input_cpsr"]) + ") does not exist" + error_message(err_msg, logger) + if not (os.path.abspath(arg_dict["input_cpsr"]).endswith(".tsv.gz")): + err_msg = "File with CPSR-classified germline calls (" + os.path.abspath( + arg_dict["input_cpsr"]) + ") does not have the correct file extension (.tsv.gz)" + error_message(err_msg, logger) + + if arg_dict["input_cpsr_yaml"] is None: + err_msg = "Input file with CPSR configuration settings (--input_cpsr_yaml) is missing" + error_message(err_msg, logger) + else: + check_file_exists(os.path.abspath(arg_dict["input_cpsr_yaml"]), strict = True, logger = logger) + input_germline_yaml_basename = os.path.basename( + str(arg_dict["input_cpsr_yaml"])) 
+ input_germline_yaml_dir = os.path.dirname( + os.path.abspath(arg_dict["input_cpsr_yaml"])) + + input_germline_basename = os.path.basename( + str(arg_dict["input_cpsr"])) + input_germline_dir = os.path.dirname( + os.path.abspath(arg_dict["input_cpsr"])) vep_dir = verify_vep_cache(arg_dict, logger) refdata_assembly_dir = verify_refdata(arg_dict, logger, cpsr = False) @@ -391,6 +400,8 @@ def verify_input_files(arg_dict): "rna_expression_dir": input_rna_expression_dir, "germline_dir": input_germline_dir, "germline_basename": input_germline_basename, + "germline_yaml_dir": input_germline_yaml_dir, + "germline_yaml_basename": input_germline_yaml_basename, "pon_vcf_dir": pon_vcf_dir, "refdata_assembly_dir": refdata_assembly_dir, "vep_dir": vep_dir, diff --git a/pcgr/biomarker.py b/pcgr/biomarker.py index 31e3ea02..e7f1888a 100644 --- a/pcgr/biomarker.py +++ b/pcgr/biomarker.py @@ -184,10 +184,12 @@ def match_csq_biomarker(transcript_csq_elements, variant_biomarkers, rec, princi principal_csq_hgvsc = False for csq_elem in transcript_csq_elements: (consequence, symbol, entrezgene, hgvsc, hgvsp, exon, feature_type, feature, biotype) = csq_elem.split(':') - #print(csq_elem) - if bool(re.search(r'^(missense|stop|start|inframe|splice_donor|protein|splice_acceptor|frameshift)', consequence)) is True: + #if bool(re.search(r'^(missense|stop|start|inframe|protein|splice_donor|splice_acceptor|frameshift)', consequence)) is True: + # mut_protein = True + if bool(re.search(r'^(missense|stop|start|inframe|protein|frameshift)', consequence)) is True: mut_protein = True + hgvsp_short = threeToOneAA(hgvsp) @@ -228,12 +230,10 @@ def match_csq_biomarker(transcript_csq_elements, variant_biomarkers, rec, princi if len(codon_match) > 0: biomarker_key_codon = str(entrezgene) + '_' + str(codon_match[0]) - #print("CODON\t" + str(biomarker_key_codon)) ## match biomarkers annotated as "CODON" only for a given gene if biomarker_key_codon in variant_biomarkers['hgvsp']: hits_codon = 
variant_biomarkers['hgvsp'][biomarker_key_codon] - #print("CODON\t" + str(hits_codon)) for chit in hits_codon: if not chit['alteration_type'] == "CODON": continue @@ -273,8 +273,8 @@ def match_csq_biomarker(transcript_csq_elements, variant_biomarkers, rec, princi if entrezgene != "." and not rec.INFO.get('HGVSc') is None: hgvsc_elements = str(rec.INFO.get('HGVSc')).split(':') if len(hgvsc_elements) == 2: - hgvsc_biomarker_key = str(entrezgene) + '_' + str(hgvsc_elements[1]) - if hgvsc_biomarker_key in variant_biomarkers['hgvsc'].keys(): + hgvsc_biomarker_key = str(entrezgene) + '_' + str(hgvsc_elements[1]) + if hgvsc_biomarker_key in variant_biomarkers['hgvsc'].keys(): hits_hgvsc = variant_biomarkers['hgvsc'][hgvsc_biomarker_key] for hit_hgvsc in hits_hgvsc: hgvsc_hit = f"{hit_hgvsc['biomarker_source']}|{hit_hgvsc['variant_id']}|{hit_hgvsc['clinical_evidence_items']}" @@ -289,7 +289,6 @@ def match_csq_biomarker(transcript_csq_elements, variant_biomarkers, rec, princi ## Match biomarkers indicated by exon number (and consequence) - "exon level" resolution if entrezgene != "." and principal_csq_entrezgene is True and exon != ".": exon_biomarker_key = str(entrezgene) + '_' + str(exon) - #print("EXON\t" + str(exon_biomarker_key)) if exon_biomarker_key in variant_biomarkers['exon'].keys(): hits_exon = variant_biomarkers['exon'][exon_biomarker_key] diff --git a/pcgr/cna.py b/pcgr/cna.py index dcd4fd63..c12b707f 100644 --- a/pcgr/cna.py +++ b/pcgr/cna.py @@ -14,7 +14,8 @@ from pcgr.biomarker import load_biomarkers from pcgr.expression import integrate_variant_expression -def annotate_cna_segments(output_fname: str, +def annotate_cna_segments(output_segment_gene_fname: str, + output_segment_fname: str, output_dir: str, cna_segment_file: str, refdata_assembly_dir: str, @@ -27,7 +28,7 @@ def annotate_cna_segments(output_fname: str, """ Annotate copy number aberrations in a given segment file. Args: - output_fname (str): File name of the annotated output file. 
+ output_segment_gene_fname (str): File name of the annotated output file. output_dir (str): Directory to save the annotated file. cna_segment_file (str): Path to the user-provided copy number aberrations segment file. refdata_assembly_dir (str): Path to the build-specific PCGR database directory. @@ -57,6 +58,10 @@ def annotate_cna_segments(output_fname: str, cna_query_segment_df = cna_query_segment_df[['Chromosome', 'Start','End','nMajor','nMinor']] + ## round nMajor and nMinor to integers + cna_query_segment_df['nMajor'] = cna_query_segment_df['nMajor'].round(0).astype(int) + cna_query_segment_df['nMinor'] = cna_query_segment_df['nMinor'].round(0).astype(int) + ## Remove 'chr' prefix from chromosome names for elem in ['Chromosome','nMajor','nMinor']: cna_query_segment_df = cna_query_segment_df.astype({elem:'string'}) @@ -68,7 +73,7 @@ def annotate_cna_segments(output_fname: str, if cna_query_segment_df.empty is True: warn_msg = f"Could not find any CNA query segments listed on nuclear chromosomes: {nuclear_chromosomes} - returning." warn_message(warn_msg, logger) - return 0 + return -1 ## Create segment identifier column cna_query_segment_df['segment_id'] = \ @@ -89,7 +94,8 @@ def annotate_cna_segments(output_fname: str, os.path.join(refdata_assembly_dir, 'chromsize.' 
+ build + '.tsv') check_file_exists(chromsizes_fname, logger) - chromsizes = pd.read_csv(chromsizes_fname, sep="\t", header=None, names=['Chromosome', 'ChromLength']) + chromsizes = pd.read_csv(chromsizes_fname, sep="\t", skiprows=1, names=['Chromosome', 'ChromLength','CentromereStart','CentromereEnd']) + chromsizes = chromsizes[['Chromosome','ChromLength']] cna_query_segment_df = cna_query_segment_df.merge( chromsizes, left_on=["Chromosome"], right_on=["Chromosome"], how="left") segments_beyond_chromlength = \ @@ -97,13 +103,17 @@ def annotate_cna_segments(output_fname: str, ## Issue warning if segments exceed chromosome lengths if not segments_beyond_chromlength.empty is True: - warn_msg = f"Ignoring n = {len(segments_beyond_chromlength)} copy number segments that " + \ + warn_msg = f"Ignoring parts of n = {len(segments_beyond_chromlength)} copy number segments that " + \ f"exceed the chromosomal lengths of {build}" warn_message(warn_msg, logger) + + cna_query_segment_df.loc[cna_query_segment_df['End'] > cna_query_segment_df['ChromLength'],"End"] = \ + cna_query_segment_df.loc[cna_query_segment_df['End'] > cna_query_segment_df['ChromLength'],'ChromLength'] + cna_query_segment_df = \ - cna_query_segment_df[cna_query_segment_df['End'] <= cna_query_segment_df['ChromLength']] + cna_query_segment_df[cna_query_segment_df['Start'] <= cna_query_segment_df['ChromLength']] - cna_query_segment_df = cna_query_segment_df[['Chromosome','Start','End','Name']] + cna_query_segment_df = cna_query_segment_df[['Chromosome','Start','End','Name']] ## transform cna segments to pybedtools object cna_query_segment_bed = pybedtools.BedTool.from_dataframe(cna_query_segment_df) @@ -112,18 +122,31 @@ def annotate_cna_segments(output_fname: str, ## annotate segments with cytobands cna_query_segment_df = annotate_cytoband(cna_query_segment_bed, output_dir, refdata_assembly_dir, logger) + + cna_query_segment_df['chromosome2'] = cna_query_segment_df['chromosome'] + 
cna_query_segment_df.loc[cna_query_segment_df['chromosome2'] == "X","chromosome2"] = 23 + cna_query_segment_df.loc[cna_query_segment_df['chromosome2'] == "Y","chromosome2"] = 24 + cna_query_segment_df['chromosome2'] = cna_query_segment_df['chromosome2'].astype(int) + + cna_query_segment_df = cna_query_segment_df.sort_values(['chromosome2','segment_start'], ascending=True) + + cna_query_segment_df = cna_query_segment_df.drop(columns=['chromosome2']) + segments_out = cna_query_segment_df[['chromosome','segment_start','segment_end','segment_name']] + segments_out.columns = map(str.upper, segments_out.columns) + segments_out.rename(columns = {'CHROMOSOME':'CHROM'}, inplace = True) + segments_out.to_csv(output_segment_fname, sep="\t", header=True, index=False) + ## annotate with protein-coding transcripts cna_query_segment_bed = pybedtools.BedTool.from_dataframe(cna_query_segment_df) temp_files.append(cna_query_segment_bed.fn) - + cna_query_segment_df = annotate_transcripts( cna_query_segment_bed, output_dir, refdata_assembly_dir, overlap_fraction=overlap_fraction, logger=logger) - + ## load copy-number biomarker evidence + cna_query_segment_df['segment_length_mb'] = \ ((cna_query_segment_df['segment_end'] - cna_query_segment_df['segment_start']) / 1e6).astype(float).round(5) - ## load copy-number biomarker evidence - biomarkers = {} cna_actionable_dict = {} @@ -192,7 +215,7 @@ def annotate_cna_segments(output_fname: str, cna_query_segment_df['TPM'] = '.' 
- cna_query_segment_df.to_csv(output_fname, sep="\t", header=True, index=False) + cna_query_segment_df.to_csv(output_segment_gene_fname, sep="\t", header=True, index=False) return 0 @@ -245,8 +268,6 @@ def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, refdata_assembl cytoband_last_annotations.columns = ['last_cytoband','last_arm','last_arm_length','last_focal_threshold'] cytoband_all = pd.concat([segments_cytoband, cytoband_first_annotations, cytoband_last_annotations], axis = 1) - #cytoband_all = pd.concat([segments_cytoband, cytoband_first_annotations], axis = 1) - #print(str(cytoband_all.head(3))) cytoband_all['segment_start'] = cytoband_all['segment_start'].astype(int) cytoband_all['segment_end'] = cytoband_all['segment_end'].astype(int) @@ -261,7 +282,6 @@ def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, refdata_assembl cytoband_all['segment_name'] = cytoband_all['segment_name'].str.cat(cytoband_all['first_arm'], sep = "|").str.cat( cytoband_all['cytoband'],sep="|").str.cat(cytoband_all['event_type'], sep="|") cytoband_annotated_segments = cytoband_all[['chromosome','segment_start','segment_end','segment_name']] - #print(str(cytoband_annotated_segments.head(3))) ## remove all temporary files for fname in temp_files: @@ -402,10 +422,15 @@ def is_valid_cna(cna_segment_file, logger): err_msg = 'Copy number segment file is empty - contains NO segments' return error_message(err_msg, logger) - for elem in ['Start','End','nMajor','nMinor']: + for elem in ['Start','End']: if not cna_dataframe[elem].dtype.kind in 'i': err_msg = 'Copy number segment file contains non-integer values for column: "' + elem + '"' return error_message(err_msg, logger) + + for elem in ['nMajor','nMinor']: + if not cna_dataframe[elem].dtype.kind in 'if': + err_msg = 'Copy number segment file contains non-float/integer values for column: "' + elem + '"' + return error_message(err_msg, logger) for rec in cna_reader: if int(rec['End']) < int(rec['Start']): ## check 
that 'End' is always greather than 'Start' diff --git a/pcgr/config.py b/pcgr/config.py index 3b314cc9..d139f62c 100644 --- a/pcgr/config.py +++ b/pcgr/config.py @@ -33,6 +33,7 @@ def create_config(arg_dict, workflow = "PCGR"): conf_options['other'] = { 'vcfanno_n_proc': int(arg_dict['vcfanno_n_proc']), 'no_reporting': int(arg_dict['no_reporting']), + 'no_html': int(arg_dict['no_html']), 'retained_vcf_info_tags': str(arg_dict['retained_info_tags']), 'show_noncoding': not int(arg_dict['ignore_noncoding']), 'force_overwrite': int(arg_dict['force_overwrite']) @@ -69,11 +70,19 @@ def create_config(arg_dict, workflow = "PCGR"): # 'run': int(arg_dict['include_trials']) #} conf_options['other']['vcf2maf'] = int(arg_dict['vcf2maf']) + conf_options['somatic_cna'] = { 'cna_overlap_pct': float(arg_dict['cna_overlap_pct']), 'n_copy_gain': int(arg_dict['n_copy_gain']) } + conf_options['germline'] = { + 'show': 0, + 'ignore_vus': int(arg_dict['cpsr_ignore_vus']) + } + if not arg_dict['input_cpsr'] is None: + conf_options['germline']['show'] = 1 + conf_options['expression'] = {} conf_options['expression']['run'] = int(not arg_dict['input_rna_exp'] is None) conf_options['expression']['similarity_analysis'] = int(arg_dict['expression_sim']) @@ -129,11 +138,13 @@ def create_config(arg_dict, workflow = "PCGR"): 'prevalence_reference_signatures': float(arg_dict['prevalence_reference_signatures']) } - - conf_options['molecular_data']['fname_cna_tsv'] = "None" + conf_options['molecular_data']['fname_cna_gene_tsv'] = "None" + conf_options['molecular_data']['fname_cna_segment_tsv'] = "None" conf_options['molecular_data']['fname_expression_tsv'] = "None" conf_options['molecular_data']['fname_expression_outliers_tsv'] = "None" conf_options['molecular_data']['fname_maf_tsv'] = "None" + conf_options['molecular_data']['fname_germline_tsv'] = "None" + conf_options['molecular_data']['fname_germline_yaml'] = "None" #conf_options['molecular_data']['fname_expression_csq_tsv'] = "None" 
conf_options['molecular_data']['fname_expression_similarity_tsv'] = "None" conf_options['molecular_data']['fname_tmb_tsv'] = "None" @@ -149,6 +160,8 @@ def create_config(arg_dict, workflow = "PCGR"): conf_options['gene_panel'] = { 'panel_id': str(arg_dict['virtual_panel_id']), 'description': 'Exploratory virtual gene panel (panel 0)', + 'description_trait': 'None', + 'url': 'None', 'custom_list_tsv': str(arg_dict['custom_list']), 'custom_list_name': str(arg_dict['custom_list_name']), 'custom_list_bed': 'None', @@ -241,10 +254,13 @@ def populate_config_data(conf_options: dict, refdata_assembly_dir: str, workflow if conf_data['conf']['gene_panel']['panel_id'] == "-1": conf_data['conf']['gene_panel']['description'] = 'User-defined panel (custom geneset from panel 0)' + conf_data['conf']['gene_panel']['description_trait'] = 'User-defined panel (custom geneset from panel 0)' else: if ',' in conf_data['conf']['gene_panel']['panel_id']: conf_data['conf']['gene_panel']['description'] = \ 'Genomics England PanelApp - multiple panels (' + conf_data['conf']['gene_panel']['panel_id'] + ')' + conf_data['conf']['gene_panel']['description2'] = \ + 'Genomics England PanelApp - multiple panels (' + conf_data['conf']['gene_panel']['panel_id'] + ')' else: if conf_data['conf']['gene_panel']['panel_id'] != "0": conf_data['conf']['gene_panel']['description'] = \ @@ -257,6 +273,13 @@ def populate_config_data(conf_options: dict, refdata_assembly_dir: str, workflow conf_data['conf']['gene_panel']['custom_list_tsv'], bool(conf_data['conf']['variant_classification']['secondary_findings']), logger) + + if conf_data['conf']['gene_panel']['panel_id'] != "-1": + if not ',' in conf_data['conf']['gene_panel']['panel_id']: + conf_data['conf']['gene_panel']['url'] = str(conf_data['conf']['gene_panel']['panel_genes'][0]['panel_url']) + conf_data['conf']['gene_panel']['description_trait'] = str(conf_data['conf']['gene_panel']['panel_genes'][0]['panel_name']) + + return(conf_data) diff --git 
a/pcgr/cpsr.py b/pcgr/cpsr.py index 96f8b656..e307b80b 100755 --- a/pcgr/cpsr.py +++ b/pcgr/cpsr.py @@ -45,6 +45,7 @@ def get_args(): optional_other.add_argument('--force_overwrite', action = "store_true", help='By default, the script will fail with an error if any output file already exists.\n You can force the overwrite of existing result files by using this flag, default: %(default)s') optional_other.add_argument('--version', action='version', version=str(utils.get_cpsr_version())) optional_other.add_argument('--no_reporting',action="store_true",help="Run functional variant annotation on VCF through VEP/vcfanno, omit classification/report generation (STEP 4), default: %(default)s") + optional_other.add_argument("--no_html", action="store_true", help="Do not generate HTML report (default: %(default)s)") optional_other.add_argument('--retained_info_tags', dest ='retained_info_tags', default='None', help='Comma-separated string of VCF INFO tags from query VCF that should be kept in CPSR output TSV') optional_other.add_argument('--ignore_noncoding', action='store_true',dest='ignore_noncoding',default=False,help='Ignore non-coding (i.e. 
non protein-altering) variants in report, default: %(default)s') optional_other.add_argument("--debug", action="store_true", help="Print full commands to log") @@ -63,7 +64,7 @@ def get_args(): optional_vep.add_argument('--vep_buffer_size', default = 500, type = int, help="Variant buffer size (variants read into memory simultaneously, option '--buffer_size' in VEP) " + \ "\n- set lower to reduce memory usage, default: %(default)s") optional_vep.add_argument("--vep_gencode_basic", action="store_true", help = "Consider basic GENCODE transcript set only with Variant Effect Predictor (VEP) (option '--gencode_basic' in VEP).") - optional_vep.add_argument('--vep_pick_order', default = "mane_select,mane_plus_clinical,canonical,appris,tsl,biotype,ccds,rank,length", help="Comma-separated string " + \ + optional_vep.add_argument('--vep_pick_order', default = "mane_select,mane_plus_clinical,canonical,biotype,ccds,rank,tsl,appris,length", help="Comma-separated string " + \ "of ordered transcript properties for primary variant pick\n ( option '--pick_order' in VEP), default: %(default)s") optional_vep.add_argument('--vep_no_intergenic', action = "store_true", help="Skip intergenic variants during processing (option '--no_intergenic' in VEP), default: %(default)s") @@ -99,10 +100,7 @@ def run_cpsr(conf_options, input_data, output_data): """ debug = conf_options['debug'] - vep_skip_intergenic_set = 'ON' if conf_options['vep']['vep_no_intergenic'] == 1 else 'OFF' - #output_vcf = 'None' - #output_pass_vcf = 'None' - #output_pass_tsv = 'None' + vep_skip_intergenic_set = 'ON' if conf_options['vep']['vep_no_intergenic'] == 1 else 'OFF' uid = '' genome_assembly = str(conf_options['genome_assembly']) input_vcf = 'None' @@ -245,7 +243,7 @@ def run_cpsr(conf_options, input_data, output_data): cpsr_summarise_command = ( f'pcgr_summarise.py {vep_vcfanno_vcf}.gz {vep_vcfanno_summarised_vcf} 0 ' f'{yaml_data["conf"]["vep"]["vep_regulatory"]} 0 ' - f'Any 
{yaml_data["conf"]["vep"]["vep_pick_order"]} ' + f'Any {yaml_data["genome_assembly"]} {yaml_data["conf"]["vep"]["vep_pick_order"]} ' f'{input_data["refdata_assembly_dir"]} --compress_output_vcf ' f'--cpsr_yaml {yaml_fname} ' f'--cpsr {"--debug" if debug else ""}' diff --git a/pcgr/dbnsfp.py b/pcgr/dbnsfp.py index 97c662b8..0fdd0055 100755 --- a/pcgr/dbnsfp.py +++ b/pcgr/dbnsfp.py @@ -44,7 +44,11 @@ def map_variant_effect_predictors(rec, algorithms): 'aloft': 'DBNSFP_ALOFTPRED', 'bayesdel_addaf': 'DBNSFP_BAYESDEL_ADDAF', 'splice_site_ada': 'DBNSFP_SPLICE_SITE_ADA', - 'splice_site_rf': 'DBNSFP_SPLICE_SITE_RF' + 'splice_site_rf': 'DBNSFP_SPLICE_SITE_RF', + 'esm1b': 'DBNSFP_ESM1B', + 'alphamissense': 'DBNSFP_ALPHA_MISSENSE', + 'mutformer': 'DBNSFP_MUTFORMER', + 'phactboost': 'DBNSFP_PHACTBOOST' } if dbnsfp_key != '': @@ -58,6 +62,7 @@ def map_variant_effect_predictors(rec, algorithms): def map_dbnsfp_predictions(dbnsfp_tag, algorithms): effect_predictions = {} + for v in dbnsfp_tag.split(','): dbnsfp_info = v.split('|') if len(dbnsfp_info) == 1: @@ -85,8 +90,8 @@ def map_dbnsfp_predictions(dbnsfp_tag, algorithms): i = 6 v = 0 - - if len(algorithms) != len(dbnsfp_info[6:]): + + if len(algorithms) != len(dbnsfp_info[6:]): return effect_predictions while i < len(dbnsfp_info): diff --git a/pcgr/expression.py b/pcgr/expression.py index 94fb960e..e6478c81 100644 --- a/pcgr/expression.py +++ b/pcgr/expression.py @@ -37,7 +37,7 @@ def parse_expression(expression_fname_tsv: str, sample_gene_expression.sort_values(by=['ID','TPM'], ascending=[True, False], inplace = True) dup_ids = len(sample_gene_expression['ID']) - len(sample_gene_expression['ID'].drop_duplicates()) if dup_ids > 0: - logger.warn(f"Found N = {dup_ids} duplicate identifiers - resolving duplicates by keeping the highest TPM value") + logger.warning(f"Found N = {dup_ids} duplicate identifiers - resolving duplicates by keeping the highest TPM value") sample_gene_expression = 
sample_gene_expression.drop_duplicates(subset = ['ID']) ## Read the gene identifier index - maps transcript identifiers (Ensembl/Refseq), @@ -111,9 +111,9 @@ def parse_expression(expression_fname_tsv: str, ## Emit warning if more than 5% of gene/transcript identifiers are not properly verified sample_identifiers_found = len(exp_map_verified) percent_verified = round((len(exp_map_verified) / len(exp_map)) * 100, 2) - percent_missing = 100 - percent_verified + percent_missing = round(100 - percent_verified, 2) if percent_missing > 5: - logger.warn("Failed to map " + str(percent_missing) + \ + logger.warning("Failed to map " + str(percent_missing) + \ "% of gene/transcript identifiers in input TSV file - use proper ENST/RefSeq identifiers") logger.info("Verified N = " + str(sample_identifiers_found) + " (" + str(percent_verified) + \ "%) of gene/transcript identifiers in input gene expression file - using " + str(identifiers_used_in_input)) @@ -122,7 +122,7 @@ def parse_expression(expression_fname_tsv: str, ## remove them from the analysis (write them to a separate file?) 
n_ambig = len(exp_map_verified[exp_map_verified.AMBIGUOUS_ID == True]) if n_ambig > 0: - logger.warn("Detected N = " + str(n_ambig) + " ambiguous gene/transcript identifiers in input gene expression file") + logger.warning("Detected N = " + str(n_ambig) + " ambiguous gene/transcript identifiers in input gene expression file") else: logger.info("NO ambiguous gene/transcript identifiers were detected in input gene expression file") transcript_expression_map = exp_map_verified[exp_map_verified.AMBIGUOUS_ID == False] @@ -160,9 +160,10 @@ def parse_expression(expression_fname_tsv: str, ## make gene level TPM summary expression_map['gene'] = transcript_expression_map.groupby( ['ENSEMBL_GENE_ID','SYMBOL','ENTREZGENE','GENENAME','BIOTYPE']).agg({'TPM':'sum'}).reset_index() - expression_map['gene'].columns = ['ENSEMBL_GENE_ID','SYMBOL','ENTREZGENE','GENENAME','BIOTYPE','TPM_GENE'] - - + ## add log2(TPM + 0.001) for gene-level TPM values (all reference TPM values are in log2(TPM + 0.001)) + expression_map['gene']['TPM_LOG2_GENE'] = np.log2(expression_map['gene']['TPM'] + 0.001) + expression_map['gene'].columns = ['ENSEMBL_GENE_ID','SYMBOL','ENTREZGENE','GENENAME','BIOTYPE','TPM_GENE','TPM_LOG2_GENE'] + expression_map['gene'] = expression_map['gene'].drop_duplicates().sort_values(by='TPM_GENE', ascending=False) return(expression_map) @@ -184,7 +185,7 @@ def integrate_variant_expression(variant_set: pd.DataFrame, if s == 'gene': logger.info("Integrating gene-level expression data from tumor into somatic variant set") if expression_data[s].empty: - logger.warn('Expression file does not contain any gene-level expression data') + logger.warning('Expression file does not contain any gene-level expression data') else: v = 'TPM_MIN' if s == 'gene': @@ -200,7 +201,7 @@ def integrate_variant_expression(variant_set: pd.DataFrame, if not expression_data['transcript'] is None: logger.info("Integrating transcript-level expression data from tumor into somatic variant set") if 
expression_data['transcript'].empty: - logger.warn('Expression file does not contain any transcript-level expression data') + logger.warning('Expression file does not contain any transcript-level expression data') if 'TPM_GENE' in variant_set.columns: variant_set = variant_set.assign(TPM = variant_set['TPM_GENE']) else: @@ -221,9 +222,9 @@ def integrate_variant_expression(variant_set: pd.DataFrame, else: variant_set['TPM'] = np.nan - logger.warn('Variant file does not contain any entries with valid transcript identifiers') + logger.warning('Variant file does not contain any entries with valid transcript identifiers') else: - logger.warn('Expression file does not contain any transcript-level expression data') + logger.warning('Expression file does not contain any transcript-level expression data') if 'TPM_GENE' in variant_set.columns: variant_set = variant_set.assign(TPM = variant_set['TPM_GENE']) @@ -242,10 +243,10 @@ def aggregate_tpm_per_cons(variant_set: pd.DataFrame, if 'transcript' in expression_data.keys(): if expression_data['transcript'].empty: - logger.warn('Expression file does not contain any transcript-level expression data') + logger.warning('Expression file does not contain any transcript-level expression data') return(cons2exp) if variant_set.empty: - logger.warn('Variant file does not contain any entries with valid transcript identifiers') + logger.warning('Variant file does not contain any entries with valid transcript identifiers') return(cons2exp) if {'VAR_ID','VEP_ALL_CSQ'}.issubset(variant_set.columns) and \ {'ENSEMBL_TRANSCRIPT_ID','TPM'}.issubset(expression_data['transcript'].columns): @@ -259,7 +260,7 @@ def aggregate_tpm_per_cons(variant_set: pd.DataFrame, varset['VEP_ALL_CSQ'].str.split(':', expand=True) varset = varset[varset["ENSEMBL_TRANSCRIPT_ID"].str.contains("ENST")] if varset.empty: - logger.warn('Variant file does not contain any entries with valid transcript identifiers') + logger.warning('Variant file does not contain any entries 
with valid transcript identifiers') return(cons2exp) varset = pd.merge(varset, trans_expression, on = 'ENSEMBL_TRANSCRIPT_ID', how = 'left') varset = varset.loc[~varset['TPM'].isna(), :] @@ -296,19 +297,19 @@ def correlate_sample_expression(sample_expression: dict, for k in yaml_data['conf']['expression']['similarity_db'].keys(): exp_sim[k] = pd.DataFrame() sample_id = yaml_data['sample_id'] - drop_columns = ['SYMBOL','BIOTYPE', 'GENENAME','ENTREZGENE'] + drop_columns = ['SYMBOL','BIOTYPE', 'GENENAME','ENTREZGENE','TPM_GENE'] exp_data_sample = sample_expression.copy() if 'gene' in exp_data_sample.keys(): if not exp_data_sample['gene'] is None: - if {'ENSEMBL_GENE_ID','TPM_GENE','BIOTYPE'}.issubset(exp_data_sample['gene'].columns): + if {'ENSEMBL_GENE_ID','TPM_LOG2_GENE','BIOTYPE'}.issubset(exp_data_sample['gene'].columns): if protein_coding_only is True: logger.info("Filtering out non-protein coding genes from expression data") exp_data_sample['gene'] = \ exp_data_sample['gene'][exp_data_sample['gene']['BIOTYPE'] == 'protein_coding'] if len(exp_data_sample['gene']) < 10: - logger.warn( + logger.warning( 'Expression file contains limited protein-coding gene expression records (N = ' + \ str(len(exp_data_sample['gene'])) + ') - skipping correlation analysis') return(exp_sim) @@ -317,7 +318,7 @@ def correlate_sample_expression(sample_expression: dict, if col in exp_data_sample['gene'].columns: exp_data_sample['gene'] = exp_data_sample['gene'].drop(col, axis = 1) exp_data_sample['gene'] = exp_data_sample['gene'].rename( - columns = {'TPM_GENE':sample_id}) + columns = {'TPM_LOG2_GENE':sample_id}) if 'tcga' in yaml_data['conf']['expression']['similarity_db'].keys(): for cohort in yaml_data['conf']['expression']['similarity_db']['tcga'].keys(): @@ -393,11 +394,12 @@ def correlate_sample_expression(sample_expression: dict, def find_expression_outliers(sample_expression: dict, yaml_data: dict, refdata_assembly_dir: str, + protein_coding_only: bool, logger: logging.Logger) 
-> pd.DataFrame: sample_id = yaml_data['sample_id'] primary_site = yaml_data['conf']['sample_properties']['site'] - required_cols = ['ENSEMBL_GENE_ID','TPM_GENE'] + required_cols = ['ENSEMBL_GENE_ID','TPM_LOG2_GENE'] drop_columns = ['SYMBOL','BIOTYPE', 'GENENAME','ENTREZGENE'] exp_data_sample = sample_expression.copy() @@ -409,7 +411,7 @@ def find_expression_outliers(sample_expression: dict, ## Future - allow user to specify TCGA cohort to compare with ## if primary_site == "Any": - logger.warn("Primary site not specified in configuration file - skipping expression outlier analysis") + logger.warning("Primary site not specified in configuration file - skipping expression outlier analysis") return(pd.DataFrame()) else: if primary_site in pcgr_vars.SITE_TO_DISEASE.keys(): @@ -419,32 +421,39 @@ def find_expression_outliers(sample_expression: dict, "tcga", str(comparison_disease_cohort).lower() + "_tpm.tsv.gz") if check_file_exists(exp_fname, strict = False, logger = logger): exp_data_refcohort = pd.read_csv(exp_fname, sep = "\t", na_values = ".", low_memory = False) + + ## Filter for protein coding genes when doing outlier analysis + if protein_coding_only: + #logger.info("Filtering out non-protein coding genes from expression data") + exp_data_refcohort = \ + exp_data_refcohort[exp_data_refcohort['BIOTYPE'] == 'protein_coding'] for col in exp_data_refcohort.columns: if col in drop_columns: exp_data_refcohort = exp_data_refcohort.drop(col, axis = 1) else: - logger.warn(f'{exp_fname} not found - skipping expression outlier analysis') + logger.warning(f'{exp_fname} not found - skipping expression outlier analysis') return(pd.DataFrame()) else: - logger.warn("Primary tumor site not specified in configuration file - skipping expression outlier analysis") + logger.warning("Primary tumor site not specified in configuration file - skipping expression outlier analysis") return(pd.DataFrame()) if 'gene' in exp_data_sample.keys(): if not exp_data_sample['gene'] is None: - if 
{'ENSEMBL_GENE_ID','TPM_GENE'}.issubset(exp_data_sample['gene'].columns): + if {'ENSEMBL_GENE_ID','TPM_LOG2_GENE'}.issubset(exp_data_sample['gene'].columns): for col in exp_data_sample['gene'].columns: if col not in required_cols: exp_data_sample['gene'] = exp_data_sample['gene'].drop(col, axis = 1) exp_data_sample['gene'] = exp_data_sample['gene'].rename( - columns = {'TPM_GENE':sample_id}) + columns = {'TPM_LOG2_GENE':sample_id}) if 'ENSEMBL_GENE_ID' in exp_data_refcohort.columns and \ 'ENSEMBL_GENE_ID' in exp_data_sample['gene'].columns: ref_sample_mat = exp_data_refcohort.merge( exp_data_sample['gene'], on = 'ENSEMBL_GENE_ID', how = 'left') + ref_sample_mat = ref_sample_mat.set_index('ENSEMBL_GENE_ID') - percentiles = ref_sample_mat.rank(1, pct=True, numeric_only=True).apply(lambda x: round(x * 100)) + percentiles = ref_sample_mat.rank(1, pct=True, numeric_only=True).apply(lambda x: round(x * 100, 1)) sample_percentiles = percentiles[[sample_id]].reset_index().rename(columns={sample_id: 'PERCENTILE'}) quantiles = pd.concat([ round(ref_sample_mat[sample_id], ndigits = 5), @@ -455,10 +464,12 @@ def find_expression_outliers(sample_expression: dict, round(ref_sample_mat.std(1), ndigits = 9), ], axis=1).rename( columns={0.25: 'Q1', 0.5: 'Q2',0.75: 'Q3', - sample_id: 'TPM_GENE', 0: 'MEAN', 1: 'STD'}).reset_index() + sample_id: 'TPM_LOG2_GENE', 0: 'MEAN', 1: 'STD'}).reset_index() quantiles['IQR'] = round(quantiles.Q3 - quantiles.Q1, ndigits = 5) quantiles['Z_SCORE'] = round(quantiles[quantiles.STD > 0].apply( - lambda row: (row.TPM_GENE - row.MEAN) / row.STD, axis=1), ndigits = 5) + lambda row: (row.TPM_LOG2_GENE - row.MEAN) / row.STD, axis=1), ndigits = 5) + #ref_sample_mat = exp_data_refcohort.merge( + # exp_data_sample['gene'], on = 'ENSEMBL_GENE_ID', how = 'left') #quantiles['kIQR'] = round(quantiles[quantiles.IQR > 0].apply( # lambda row: (row.TPM_GENE - row.Q2) / row.IQR, axis=1), ndigits = 5) @@ -474,11 +485,17 @@ def find_expression_outliers(sample_expression: 
dict, 'REF_COHORT', 'REF_COHORT_SIZE', 'ENSEMBL_GENE_ID', - 'TPM_GENE', 'MEAN','STD', - 'Z_SCORE','Q1', 'Q2', - 'Q3', 'IQR', 'PERCENTILE']] + 'TPM_LOG2_GENE', + 'MEAN', + 'STD', + 'Z_SCORE', + 'Q1', + 'Q2', + 'Q3', + 'IQR', + 'PERCENTILE']] - mask_valid_ensembl = pd.notna(outlier_metrics['TPM_GENE']) + mask_valid_ensembl = pd.notna(outlier_metrics['TPM_LOG2_GENE']) outlier_metrics_valid = outlier_metrics[mask_valid_ensembl] @@ -493,7 +510,7 @@ def correlate_samples(exp_data_sample: dict, corr_similarity = pd.DataFrame() if not 'gene' in exp_data_sample: - logger.warn("No 'gene' entry in expression data dictionary for sample " + sample_id + " - skipping correlation analysis") + logger.warning("No 'gene' entry in expression data dictionary for sample " + sample_id + " - skipping correlation analysis") return(corr_similarity) if 'ENSEMBL_GENE_ID' in exp_data_refcohort.columns and \ 'ENSEMBL_GENE_ID' in exp_data_sample['gene'].columns: diff --git a/pcgr/main.py b/pcgr/main.py index 3134f058..168ba5b2 100755 --- a/pcgr/main.py +++ b/pcgr/main.py @@ -39,10 +39,9 @@ def cli(): optional_signatures = parser.add_argument_group("Mutational signature options") optional_cna = parser.add_argument_group("Somatic copy number alteration (CNA) data options") optional_rna = parser.add_argument_group("Bulk RNA-seq and RNA fusion data options") - #optional_germline = parser.add_argument_group("Germline variant options") + optional_germline = parser.add_argument_group("Germline variant options") optional_other = parser.add_argument_group("Other options") - required.add_argument("--input_vcf", dest="input_vcf", help="VCF input file with somatic variants in tumor sample, SNVs/InDels", required=True) required.add_argument("--vep_dir", dest="vep_dir", help="Directory of VEP cache, e.g. 
$HOME/.vep", required=True) required.add_argument("--refdata_dir", dest="refdata_dir", help="Directory where PCGR reference data bundle was downloaded and unpacked", required=True) @@ -57,7 +56,6 @@ def cli(): optional_sample.add_argument("--tumor_purity", type=float, dest="tumor_purity", help="Estimated tumor purity (between 0 and 1) (default: %(default)s)") optional_sample.add_argument("--tumor_ploidy", type=float, dest="tumor_ploidy", help="Estimated tumor ploidy (default: %(default)s)") - optional_allelic_support.add_argument("--tumor_dp_tag", dest="tumor_dp_tag", default="_NA_", help="Specify VCF INFO tag for sequencing depth (tumor, must be Type=Integer, default: %(default)s") optional_allelic_support.add_argument("--tumor_af_tag", dest="tumor_af_tag", default="_NA_", help="Specify VCF INFO tag for variant allelic fraction (tumor, must be Type=Float, default: %(default)s") optional_allelic_support.add_argument("--control_dp_tag", dest="control_dp_tag", default="_NA_", help="Specify VCF INFO tag for sequencing depth (control, must be Type=Integer, default: %(default)s") @@ -88,7 +86,7 @@ def cli(): optional_vep.add_argument("--vep_n_forks", default=4, type=int, help="Number of forks (VEP option '--fork'), default: %(default)s") optional_vep.add_argument("--vep_buffer_size", default=500, type=int, help=f"Variant buffer size (variants read into memory simultaneously, VEP option '--buffer_size')\n- set lower to reduce memory usage, default: %(default)s") - optional_vep.add_argument("--vep_pick_order", default="mane_select,mane_plus_clinical,canonical,appris,tsl,biotype,ccds,rank,length", help=f"Comma-separated string " + \ + optional_vep.add_argument("--vep_pick_order", default="mane_select,mane_plus_clinical,canonical,biotype,ccds,rank,tsl,appris,length", help=f"Comma-separated string " + \ "of ordered transcript/variant properties for selection of primary variant consequence\n(option '--pick_order' in VEP), default: %(default)s") 
optional_vep.add_argument("--vep_no_intergenic", action="store_true", help="Skip intergenic variants during variant annotation (VEP option '--no_intergenic' in VEP), default: %(default)s") optional_vep.add_argument("--vep_regulatory", action="store_true", help="Add VEP regulatory annotations (VEP option '--regulatory') or non-coding interpretation, default: %(default)s") @@ -111,13 +109,13 @@ def cli(): optional_cna.add_argument("--cna_overlap_pct", type=float, default=50, dest="cna_overlap_pct", help="Mean percent overlap between copy number segment and gene transcripts for reporting of gains/losses in tumor suppressor genes/oncogenes, (default: %(default)s)") #optional_rna.add_argument("--input_rna_fusion", dest = "input_rna_fusion", help = "File with RNA fusion transcripts detected in tumor (tab-separated values)") optional_rna.add_argument("--input_rna_expression", dest = "input_rna_exp", help = "File with bulk RNA expression counts (TPM) of transcripts in tumor (tab-separated values)") - optional_rna.add_argument('--expression_sim', action='store_true', help="Compare expression profile of tumor sample to known expression profiles (default: %(default)s)") + optional_rna.add_argument('--expression_sim', action='store_true', help="Compare expression profile of tumor sample to expression profiles of other tumor samples (default: %(default)s)") optional_rna.add_argument("--expression_sim_db", dest = "expression_sim_db", default="tcga,depmap,treehouse", help=f"Comma-separated string " + \ "of databases for used in RNA expression similarity analysis, default: %(default)s") - - #optional_germline.add_argument("--input_germline", dest="input_germline", help="CPSR-classified germline calls (file '.cpsr..classification.tsv.gz')") - #optional_germline.add_argument("--sample_id_germline", dest="sample_id_germline", help="Sample identifier for germline calls - used for verification of input_germline file") + optional_germline.add_argument("--input_cpsr", dest="input_cpsr", 
help="CPSR-classified germline calls (file '.cpsr..classification.tsv.gz')") + optional_germline.add_argument("--input_cpsr_yaml", dest="input_cpsr_yaml", help="CPSR YAML configuration file (file '.cpsr..conf.yaml')") + optional_germline.add_argument("--cpsr_ignore_vus", action="store_true", help="Do not show variants of uncertain significance (VUS) in the germline section of the HTML report (default: %(default)s)") optional_other.add_argument("--vcf2maf", action="store_true", help="Generate a MAF file for input VCF using https://github.com/mskcc/vcf2maf (default: %(default)s)") optional_other.add_argument("--vcfanno_n_proc", default=4, type=int, help="Number of vcfanno processes (option '-p' in vcfanno), default: %(default)s") @@ -127,6 +125,7 @@ def cli(): optional_other.add_argument("--force_overwrite", action="store_true", help="By default, the script will fail with an error if any output file already exists. You can force the overwrite of existing result files by using this flag, default: %(default)s") optional_other.add_argument("--version", action="version", version="%(prog)s " + str(pcgr_vars.PCGR_VERSION)) optional_other.add_argument("--no_reporting", action="store_true", help="Run functional variant annotation on VCF through VEP/vcfanno, omit other analyses (i.e. Tier assignment/MSI/TMB/Signatures etc. 
and report generation (STEP 4), default: %(default)s") + optional_other.add_argument("--no_html", action="store_true", help="Do not generate HTML report (default: %(default)s)") optional_other.add_argument("--debug", action="store_true", help="Print full commands to log") optional_other.add_argument("--pcgrr_conda", default="pcgrr", help="pcgrr conda env name (default: %(default)s)") @@ -144,7 +143,7 @@ def cli(): # Run PCGR workflow run_pcgr(input_data, output_data, conf_options) -def run_pcgr(input_data, output_data,conf_options): +def run_pcgr(input_data, output_data, conf_options): """ Main function to run the PCGR workflow """ @@ -173,7 +172,8 @@ def run_pcgr(input_data, output_data,conf_options): input_cna = 'None' input_rna_fusion = 'None' input_rna_expression = 'None' - input_germline_cpsr = 'None' + input_cpsr_calls = 'None' + input_cpsr_yaml = 'None' pon_vcf = 'None' pon_annotation = 0 variant_set = pd.DataFrame @@ -189,8 +189,10 @@ def run_pcgr(input_data, output_data,conf_options): input_rna_fusion = os.path.join(input_data['rna_fusion_dir'], input_data['rna_fusion_basename']) if input_data['rna_expression_basename'] != 'NA': input_rna_expression = os.path.join(input_data['rna_expression_dir'], input_data['rna_expression_basename']) - #if input_data['germline_basename'] != 'NA': - # input_germline_cpsr = os.path.join(input_data['germline_dir'], input_data['germline_basename']) + if input_data['germline_basename'] != 'NA': + input_cpsr_calls = os.path.join(input_data['germline_dir'], input_data['germline_basename']) + if input_data['germline_yaml_basename'] != 'NA': + input_cpsr_yaml = os.path.join(input_data['germline_yaml_dir'], input_data['germline_yaml_basename']) if input_data['pon_vcf_basename'] != 'NA': pon_vcf = os.path.join(input_data['pon_vcf_dir'], input_data['pon_vcf_basename']) @@ -233,9 +235,11 @@ def run_pcgr(input_data, output_data,conf_options): f'{input_cna} ' f'{input_rna_fusion} ' f'{input_rna_expression} ' + f'{input_cpsr_calls} ' 
f'{pon_vcf} ' f'{conf_options["assay_properties"]["vcf_tumor_only"]} ' f'{conf_options["sample_id"]} ' + f'{conf_options["genome_assembly"]} ' f'{conf_options["other"]["retained_vcf_info_tags"]} ' f'{conf_options["somatic_snv"]["allelic_support"]["tumor_dp_tag"]} ' f'{conf_options["somatic_snv"]["allelic_support"]["tumor_af_tag"]} ' @@ -284,9 +288,10 @@ def run_pcgr(input_data, output_data,conf_options): logger.info(f'RNA expression similarity analysis: {rnaseq_sim_analysis_set}') #logger.info(f'Include molecularly targeted clinical trials (beta): {clinical_trials_set}') - # PCGR|Generate YAML file - containing configuration options and paths to annotated molecular profile datasets - # - VCF/TSV files (SNVs/InDels) - # - TSV files (copy number aberrations) + # PCGR|Generate YAML file - containing configuration options and paths to first-pass annotation of molecular profile datasets + # - VCF/TSV files (somatic SNVs/InDels) + # - TSV file (germline SNVs/InDels - CPSR) + # - TSV files (somatic copy number aberrations) # - TSV files (TMB) # - TSV files (RNA expression) # - TSV files (RNA fusion) - COMING @@ -302,7 +307,12 @@ def run_pcgr(input_data, output_data,conf_options): if conf_options['somatic_snv']['tmb']['run'] == 1: conf_options['molecular_data']['fname_tmb_tsv'] = tmb_fname if not input_cna == 'None': - conf_options['molecular_data']['fname_cna_tsv'] = output_data['cna'] + conf_options['molecular_data']['fname_cna_gene_tsv'] = output_data['cna_gene'] + conf_options['molecular_data']['fname_cna_segment_tsv'] = output_data['cna_segment'] + if not input_cpsr_calls == 'None': + conf_options['molecular_data']['fname_germline_tsv'] = input_cpsr_calls + if not input_cpsr_yaml == 'None': + conf_options['molecular_data']['fname_germline_yaml'] = input_cpsr_yaml if not input_rna_expression == 'None': conf_options['molecular_data']['fname_expression_tsv'] = output_data['expression'] conf_options['molecular_data']['fname_expression_outliers_tsv'] = 
output_data['expression_outliers'] @@ -412,7 +422,7 @@ def run_pcgr(input_data, output_data,conf_options): pcgr_summarise_command = ( f'pcgr_summarise.py {vep_vcfanno_vcf}.gz {vep_vcfanno_summarised_vcf} {pon_annotation} ' f'{yaml_data["conf"]["vep"]["vep_regulatory"]} {oncogenicity_annotation} ' - f'{yaml_data["conf"]["sample_properties"]["site2"]} {yaml_data["conf"]["vep"]["vep_pick_order"]} ' + f'{yaml_data["conf"]["sample_properties"]["site2"]} {yaml_data["genome_assembly"]} {yaml_data["conf"]["vep"]["vep_pick_order"]} ' f'{input_data["refdata_assembly_dir"]} --compress_output_vcf ' f'{"--debug" if debug else ""}' ) @@ -425,7 +435,7 @@ def run_pcgr(input_data, output_data,conf_options): logger.info(summarise_db_src_msg1) logger.info(summarise_db_src_msg2) - logger.info('Variant oncogenicity classification according to ClinGen/VICC recommendations (Horak et al., Genet Med, 2022)') + logger.info('Variant oncogenicity classification according to ClinGen/CGC/VICC standard operating procedures (Horak et al., Genet Med, 2022)') logger.info('Variant biomarker matching (CIViC, CGI) at multiple resolutions (genes, exons, amino acid positions, hgvsp/hgvsc, genomic)') logger.info('Tumor suppressor/oncogene annotations based on multiple sources (NCG, CGC, CancerMine)') check_subprocess(logger, pcgr_summarise_command, debug) @@ -483,7 +493,7 @@ def run_pcgr(input_data, output_data,conf_options): input_data["refdata_assembly_dir"], logger = logger) ## Write transcript-level expression data to TSV if 'transcript' in expression_data.keys(): - if not expression_data['transcript'] is None: + if not expression_data['transcript'] is None: expression_data['transcript'].fillna('.').to_csv( yaml_data['molecular_data']['fname_expression_tsv'], sep = "\t", compression = "gzip", index = False) @@ -492,12 +502,12 @@ def run_pcgr(input_data, output_data,conf_options): #exp_to_cons.fillna('.').to_csv( # yaml_data['molecular_data']['fname_csq_expression_tsv'], sep = "\t", # compression = 
"gzip", index = False) - else: - if 'gene' in expression_data.keys(): - if not expression_data['gene'] is None: - expression_data['gene'].fillna('.').to_csv( - yaml_data['molecular_data']['fname_expression_tsv'], sep = "\t", - compression = "gzip", index = False) + else: + if 'gene' in expression_data.keys(): + if not expression_data['gene'] is None: + expression_data['gene'].fillna('.').to_csv( + yaml_data['molecular_data']['fname_expression_tsv'], sep = "\t", + compression = "gzip", index = False) ## Merge expression data with somatic SNV/InDel variant set variant_set = integrate_variant_expression( @@ -570,6 +580,7 @@ def run_pcgr(input_data, output_data,conf_options): expression_data, yaml_data, input_data["refdata_assembly_dir"], + protein_coding_only = True, logger = logger ) if not expression_outliers.empty: @@ -609,7 +620,8 @@ def run_pcgr(input_data, output_data,conf_options): logger = getlogger("pcgr-annotate-cna-segments") logger.info('PCGR - STEP 5: Annotation of copy number segments - cytobands, overlapping transcripts, and biomarkers') cna_annotation = cna.annotate_cna_segments( - output_fname = output_data['cna'], + output_segment_gene_fname = output_data['cna_gene'], + output_segment_fname = output_data['cna_segment'], output_dir = output_data['dir'], cna_segment_file = input_cna, build = yaml_data['genome_assembly'], @@ -621,6 +633,9 @@ def run_pcgr(input_data, output_data,conf_options): logger = logger) if cna_annotation == 0: logger.info('Finished pcgr-annotate-cna-segments') + else: + yaml_data['molecular_data']['fname_cna_gene_tsv'] = "None" + yaml_data['molecular_data']['fname_cna_segment_tsv'] = "None" print('----') else: logger = getlogger("pcgr-annotate-cna-segments") diff --git a/pcgr/mutation_hotspot.py b/pcgr/mutation_hotspot.py index 9b706e82..4a05caa7 100644 --- a/pcgr/mutation_hotspot.py +++ b/pcgr/mutation_hotspot.py @@ -39,12 +39,11 @@ def load_mutation_hotspots(hotspots_fname: str, logger: Logger) -> Dict[str, Dic with 
gzip.open(hotspots_fname, mode='rt') as f: reader = csv.DictReader(f, delimiter='\t') - for row in reader: + for row in reader: gene = str(row['entrezgene']) hgvsp2 = row['hgvsp2'] codon = row['codon'] hgvsc = row['hgvsc'] - hotspots['mutation'][gene + '-' + hgvsp2] = row hotspots['codon'][gene + '-' + codon] = row if hgvsc != '.': @@ -122,10 +121,12 @@ def match_csq_mutation_hotspot(transcript_csq_elements, cancer_hotspots, rec, pr hgvsp_candidate = 'p.' + str(gene_mutation_key.split('|')[3]) + str(gene_mutation_key.split('|')[4]) if hgvsp_candidate == principal_hgvsp: rec.INFO['MUTATION_HOTSPOT_MATCH'] = 'by_hgvsp_principal' + + ## mutation hotspot at splice site else: rec.INFO['MUTATION_HOTSPOT_MATCH'] = 'by_hgvsc_nonprincipal' - hgvsc_candidate = re.sub(r'>(A|G|C|T){1,}$', '' , str(gene_mutation_key.split('|')[4])) - if hgvsc_candidate == principal_hgvsc: + hgvsc_candidate = str(hotspot_info.split('|')[4]) + if hgvsc_candidate == re.sub(r'>(A|G|C|T){1,}$', '' ,principal_hgvsc): rec.INFO['MUTATION_HOTSPOT_MATCH'] = 'by_hgvsc_principal' else: ## multiple hotspot matches for alternative hgvsp keys @@ -138,10 +139,9 @@ def match_csq_mutation_hotspot(transcript_csq_elements, cancer_hotspots, rec, pr rec.INFO['MUTATION_HOTSPOT'] = hotspot_info rec.INFO['MUTATION_HOTSPOT_CANCERTYPE'] = unique_hotspot_mutations[hotspot_info] rec.INFO['MUTATION_HOTSPOT_MATCH'] = 'by_hgvsp_principal' - else: - hgvsc_candidate = re.sub(r'>(A|G|C|T){1,}$', '' , str(hotspot_info.split('|')[4])) - - if hgvsc_candidate == principal_hgvsc: + else: + hgvsc_candidate = str(hotspot_info.split('|')[4]) + if hgvsc_candidate == re.sub(r'>(A|G|C|T){1,}$', '' ,principal_hgvsc): rec.INFO['MUTATION_HOTSPOT'] = hotspot_info rec.INFO['MUTATION_HOTSPOT_CANCERTYPE'] = unique_hotspot_mutations[hotspot_info] rec.INFO['MUTATION_HOTSPOT_MATCH'] = 'by_hgvsc_principal' @@ -152,9 +152,7 @@ def match_csq_mutation_hotspot(transcript_csq_elements, cancer_hotspots, rec, pr for gene_codon_key in 
unique_hotspot_codons.keys(): if '|' in gene_codon_key: - codon = str(gene_codon_key.split('|')[3]) - if codon == principal_codon: rec.INFO['MUTATION_HOTSPOT'] = gene_codon_key rec.INFO['MUTATION_HOTSPOT_CANCERTYPE'] = unique_hotspot_codons[gene_codon_key] diff --git a/pcgr/oncogenicity.py b/pcgr/oncogenicity.py index cf029293..bab64028 100644 --- a/pcgr/oncogenicity.py +++ b/pcgr/oncogenicity.py @@ -2,14 +2,22 @@ import os,re,sys from cyvcf2 import VCF, Writer +import csv -def assign_oncogenicity_evidence(rec = None, tumortype = "Any"): +from typing import Dict +from logging import Logger +import gzip + +from pcgr.annoutils import threeToOneAA + +def assign_oncogenicity_evidence(rec = None, oncogenic_variants = None, tumortype = "Any"): clingen_vicc_ev_codes = [ "CLINGEN_VICC_SBVS1", "CLINGEN_VICC_SBS1", "CLINGEN_VICC_SBP1", "CLINGEN_VICC_SBP2", + "CLINGEN_VICC_OS1", "CLINGEN_VICC_OS3", "CLINGEN_VICC_OM1", "CLINGEN_VICC_OM2", @@ -114,6 +122,7 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"): "Consequence", "MUTATION_HOTSPOT", "MUTATION_HOTSPOT_CANCERTYPE", + "CLINVAR_KNOWN_ONCOGENIC", "SYMBOL", "BIOMARKER_MATCH", "ONCOGENE", @@ -162,6 +171,10 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"): for code in clingen_vicc_ev_codes: variant_data[code] = False + if "CLINVAR_KNOWN_ONCOGENIC" in variant_data.keys(): + if not variant_data['CLINVAR_KNOWN_ONCOGENIC'] is None: + variant_data['CLINGEN_VICC_OS1'] = True + dbnsfp_minimum_majority = 6 dbnsfp_maximum_minority = 2 dbnsfp_minimum_algos_called = dbnsfp_minimum_majority @@ -260,8 +273,12 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"): ## Split all biomarker evidence into a list biomarker_evidence = variant_data['BIOMARKER_MATCH'].split(',') + for eitem in biomarker_evidence: + + #print(variant_data['SYMBOL'] + '\t' + variant_data['Consequence'] + '\t' + str(eitem)) + ## Example 'eitem' element: ## 
cgi|659|CGI1077:Pancreas:Sensitivity/Response:C:Predictive:Somatic|by_hgvsp_principal if ('Predictive' in eitem or 'Oncogenic' in eitem) and \ @@ -272,7 +289,7 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"): 'by_codon_principal' in eitem or \ 'by_aa_region_principal' in eitem): ## only applicable if OS3 is not set - if variant_data['CLINGEN_VICC_OS3'] is False: + if variant_data['CLINGEN_VICC_OS3'] is False and variant_data['CLINGEN_VICC_OS1'] is False: variant_data['CLINGEN_VICC_OM1'] = True ## Catch prognostic/diagnostic non-coding variants (e.g. TERT) - these will rank at the top @@ -283,7 +300,7 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"): ('by_genomic_coord' in eitem or \ 'by_hgvsc_principal' in eitem): ## only applicable if OS3 is not set - if variant_data['CLINGEN_VICC_OS3'] is False: + if variant_data['CLINGEN_VICC_OS3'] is False and variant_data['CLINGEN_VICC_OS1'] is False: variant_data['CLINGEN_VICC_OM1'] = True if "gnomADe_EAS_AF" in variant_data.keys() and \ @@ -319,7 +336,7 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"): if approx_zero_pop_freq == 5: variant_data["CLINGEN_VICC_OP4"] = True - + ## check if variant is a loss-of-function variant in a tumor suppressor gene (Cancer Gene Census/CancerMine) if "TSG" in variant_data.keys() and \ "ONCOGENE" in variant_data.keys() and \ @@ -360,6 +377,7 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"): 'CLINGEN_VICC_SBP1', 'CLINGEN_VICC_SBP2', 'CLINGEN_VICC_OVS1', + 'CLINGEN_VICC_OS1', 'CLINGEN_VICC_OS3', 'CLINGEN_VICC_OM1', 'CLINGEN_VICC_OM2', @@ -379,6 +397,7 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"): 'funcvar', 'funcvar', 'funcvar', + 'funcvar', 'funccomp'] og_score_data['pole'] = \ ['B','B', @@ -386,7 +405,8 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"): 'B','P', 'P','P', 'P','P', - 'P','P'] + 'P','P', + 'P'] og_score_data['description'] = \ ['Very high MAF (> 0.05 in gnomAD - any 
five major continental pops)', @@ -395,6 +415,7 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"): 'Multiple lines (>=6) of computational evidence support a benign effect on the gene or gene product - from dbNSFP', 'Silent and intronic changes outside of the consensus splice site', 'Null variant - predicted as LoF - in bona fide tumor suppressor gene', + 'Same amino acid change as previously established oncogenic variant - regardless of nucleotide change (ClinVar oncogenicity records)', 'Located in a mutation hotspot (cancerhotspots.org). >= 50 samples with a variant at AA position, >= 10 samples with same AA change', 'Presumably critical site of functional domain - based on indirect evidence from overlap with predictive biomarkers', 'Protein length changes from in-frame dels/ins in known oncogene/tumor suppressor genes or stop-loss variants in a tumor suppressor gene', @@ -404,7 +425,7 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"): ] og_score_data['score'] = \ - [-8, -4, 1, -1, -1, 8, 4, 2, 2, 2, 1, 1] + [-8, -4, 1, -1, -1, 8, 4, 4, 2, 2, 2, 1, 1] i = 0 oncogenicity_scores = {} @@ -473,3 +494,95 @@ def assign_oncogenicity_evidence(rec = None, tumortype = "Any"): rec.INFO[e] = variant_data[e] return(rec) + +def load_oncogenic_variants(oncogenic_variants_fname: str, logger: Logger): + """ + Load oncogenic variants from a file and create a dictionary of variants. 
+ """ + + oncogenic_variants = {} + if not os.path.exists(oncogenic_variants_fname): + logger.info(f"ERROR: File '{oncogenic_variants_fname}' does not exist - exiting") + exit(1) + + with gzip.open(oncogenic_variants_fname, mode='rt') as f: + reader = csv.DictReader(f, delimiter='\t') + for row in reader: + gene = str(row['entrezgene']) + oncogenic_variants[str(gene) + '-' + str(row['var_id'])] = row + if not len(row['hgvsp']) == 0: + oncogenic_variants[str(gene) + '-' + str(row['hgvsp'])] = row + if not len(row['hgvs_c']) == 0: + oncogenic_variants[str(gene) + '-' + str(row['hgvs_c'])] = row + + return oncogenic_variants + + +def match_oncogenic_variants(transcript_csq_elements, oncogenic_variants, rec, principal_csq_properties): + + """ + Function that matches consequence entries from VEP (transcript_csq_elements) agains known oncogenic variants from ClinVar, + using both genomic coordinate information, HGVSp and HGVSc information. + """ + + principal_hgvsp = principal_csq_properties['hgvsp'] + principal_hgvsc = principal_csq_properties['hgvsc'] + + known_oncogenic_matches = {} + + for csq in transcript_csq_elements: + (consequence, symbol, entrezgene, hgvsc, hgvsp, exon, feature_type, feature, biotype) = csq.split(':') + + if not bool(re.search(r'^(missense|stop|start|inframe|splice_donor|intron|splice_acceptor|frameshift|upstream)', consequence)) is True: + continue + + var_id = str(rec.CHROM) + '_' + str(rec.POS) + '_' + str(rec.REF) + '_' + str(','.join(rec.ALT)) + oncogenic_varkey = '.' 
+ if entrezgene != ".": + oncogenic_varkey = str(entrezgene) + '-' + str(var_id) + if oncogenic_varkey in oncogenic_variants and 'oncogenicity' in oncogenic_variants[oncogenic_varkey]: + oncogenic_info = oncogenic_variants[oncogenic_varkey]['symbol'] + '|' + \ + str(oncogenic_variants[oncogenic_varkey]['hgvsp']) + '|' + \ + str(oncogenic_variants[oncogenic_varkey]['hgvs_c']) + '|' + \ + str(oncogenic_variants[oncogenic_varkey]['oncogenicity']) + if not oncogenic_info in known_oncogenic_matches: + known_oncogenic_matches[oncogenic_info] = [] + known_oncogenic_matches[oncogenic_info].append('by_genomic_coord') + + if entrezgene != "." and hgvsp != ".": + hgvsp_short = threeToOneAA(hgvsp) + oncogenic_varkey = str(entrezgene) + '-' + str(hgvsp_short) + if oncogenic_varkey in oncogenic_variants and 'oncogenicity' in oncogenic_variants[oncogenic_varkey]: + oncogenic_info = oncogenic_variants[oncogenic_varkey]['symbol'] + '|' + \ + str(oncogenic_variants[oncogenic_varkey]['hgvsp']) + '|' + \ + str(oncogenic_variants[oncogenic_varkey]['hgvs_c']) + '|' + \ + str(oncogenic_variants[oncogenic_varkey]['oncogenicity']) + if not oncogenic_info in known_oncogenic_matches: + known_oncogenic_matches[oncogenic_info] = [] + if hgvsp_short == principal_hgvsp: + known_oncogenic_matches[oncogenic_info].append('by_hgvsp_principal') + else: + known_oncogenic_matches[oncogenic_info].append('by_hgvsp_nonprincipal') + + if entrezgene != "." 
and hgvsc != ".": + oncogenic_varkey = str(entrezgene) + '-' + str(hgvsc) + if oncogenic_varkey in oncogenic_variants and 'oncogenicity' in oncogenic_variants[oncogenic_varkey]: + oncogenic_info = oncogenic_variants[oncogenic_varkey]['symbol'] + '|' + \ + str(oncogenic_variants[oncogenic_varkey]['hgvsp']) + '|' + \ + str(oncogenic_variants[oncogenic_varkey]['hgvs_c']) + '|' + \ + str(oncogenic_variants[oncogenic_varkey]['oncogenicity']) + if not oncogenic_info in known_oncogenic_matches: + known_oncogenic_matches[oncogenic_info] = [] + if hgvsc == principal_hgvsc: + known_oncogenic_matches[oncogenic_info].append('by_hgvsc_principal') + else: + known_oncogenic_matches[oncogenic_info].append('by_hgvsc_nonprincipal') + if oncogenic_varkey == ".": + continue + + if len(list(known_oncogenic_matches.keys())) == 1: + oncogenic_variant_info = list(known_oncogenic_matches.keys())[0] + rec.INFO['CLINVAR_KNOWN_ONCOGENIC'] = oncogenic_variant_info + '|' + \ + '&'.join(sorted(set(known_oncogenic_matches[oncogenic_variant_info]))) + + return \ No newline at end of file diff --git a/pcgr/pcgr_vars.py b/pcgr/pcgr_vars.py index 9724bb9d..3c5fc7f2 100644 --- a/pcgr/pcgr_vars.py +++ b/pcgr/pcgr_vars.py @@ -4,7 +4,7 @@ ## Version - software and bundle PCGR_VERSION = __version__ -DB_VERSION = '20240621' +DB_VERSION = '20240927' ## Miscellaneous settings NCBI_BUILD_MAF = 'GRCh38' @@ -29,7 +29,7 @@ VEP_MAX_FORKS = 8 VEP_MIN_BUFFER_SIZE = 50 VEP_MAX_BUFFER_SIZE = 30000 -VEP_PICK_CRITERIA = ['mane_select','mane_plus_clinical','canonical','appris','tsl','biotype','ccds','rank','length'] +VEP_PICK_CRITERIA = ['mane_select','mane_plus_clinical','canonical','biotype','ccds','rank','tsl','appris','length'] ## Gene expression comparative analysis resources EXPRESSION_DB_SOURCES = ['tcga','depmap','treehouse'] @@ -38,6 +38,45 @@ SAMPLE_ID_MAX_LENGTH = 40 SAMPLE_ID_MIN_LENGTH = 3 +## Classified germline variant input (from CPSR) - required columns +germline_input_required_cols = [ + 'SAMPLE_ID', + 
'VAR_ID', + 'GENOMIC_CHANGE', + 'VARIANT_CLASS', + 'GENOTYPE', + 'ALTERATION', + 'DP_CONTROL', + 'CPSR_CLASSIFICATION_SOURCE', + 'GENENAME', + 'ENTREZGENE', + 'ENSEMBL_GENE_ID', + 'ENSEMBL_TRANSCRIPT_ID', + 'HGVSc', + 'HGVSc_RefSeq', + 'HGVSp', + 'CONSEQUENCE', + 'CDS_CHANGE', + 'SYMBOL', + 'CODING_STATUS', + 'PFAM_DOMAIN', + 'PFAM_DOMAIN_NAME', + 'PROTEIN_CHANGE', + 'LOSS_OF_FUNCTION', + 'NULL_VARIANT', + 'DBSNP_RSID', + 'CLINVAR_MSID', + 'CLINVAR_CLASSIFICATION', + 'CLINVAR_VARIANT_ORIGIN', + 'CLINVAR_PHENOTYPE', + 'CLINVAR_CONFLICTED', + 'CLINVAR_REVIEW_STATUS_STARS', + 'CPSR_CLASSIFICATION', + 'CPSR_PATHOGENICITY_SCORE', + 'CPSR_CLASSIFICATION_CODE', + 'FINAL_CLASSIFICATION' +] + ## Primary tumor sites - PCGR tsites = { 0: 'Any', @@ -177,6 +216,10 @@ r"^(stop_(lost|gained)|start_lost|frameshift_|missense_|splice_(donor|acceptor)|protein_altering|inframe_)" CSQ_CODING_SILENT_PATTERN = \ r"^(stop_(lost|gained)|start_lost|frameshift_|missense_|splice_(donor|acceptor)|protein_altering|inframe_|synonymous|(start|stop)_retained)" +CSQ_CODING_PATTERN2 = \ + r"(stop_(lost|gained)|start_lost|frameshift_|missense_|splice_(donor|acceptor)|protein_altering|inframe_)" +CSQ_CODING_SILENT_PATTERN2 = \ + r"(stop_(lost|gained)|start_lost|frameshift_|missense_|splice_(donor|acceptor)|protein_altering|inframe_|synonymous|(start|stop)_retained)" CSQ_NULL_PATTERN = r"^(stop_gained|frameshift_)" CSQ_SPLICE_REGION_PATTERN = r"(splice_|intron_variant)" CSQ_SPLICE_DONOR_PATTERN = \ diff --git a/pcgr/variant.py b/pcgr/variant.py index 1c3ec15b..fe76cd27 100644 --- a/pcgr/variant.py +++ b/pcgr/variant.py @@ -56,10 +56,14 @@ def append_annotations(vcf2tsv_gz_fname: str, refdata_assembly_dir: str, logger) if os.path.exists(clinvar_tsv_fname): clinvar_data_df = pd.read_csv( clinvar_tsv_fname, sep="\t", - usecols=["variation_id","origin_simple","VAR_ID","trait"], + usecols=["variation_id","origin_simple","VAR_ID","trait","classification","conflicted"], low_memory = False) - 
clinvar_data_df['CLINVAR_TRAITS_ALL'] = clinvar_data_df['origin_simple'].str.capitalize().str.cat( - clinvar_data_df['trait'], sep = " - ") + clinvar_data_df['classification'] = clinvar_data_df['classification'].str.replace("_", " ", regex = True) + clinvar_data_df.loc[(clinvar_data_df['classification'] == "VUS") & (clinvar_data_df['conflicted'] == 1),"classification"] = \ + "VUS/Conflicting evidence" + clinvar_data_df['CLINVAR_TRAITS_ALL'] = clinvar_data_df['classification'].str.cat( + clinvar_data_df['origin_simple'].str.capitalize().str.cat( + clinvar_data_df['trait'], sep = " - "), sep = " - ") clinvar_data_df['CLINVAR_MSID'] = clinvar_data_df['variation_id'] clinvar_data_df = clinvar_data_df.astype({'CLINVAR_MSID':'string'}) clinvar_data_df['CLINVAR_MSID'] = clinvar_data_df['CLINVAR_MSID'].str.replace("\\.[0-9]{1,}$", "", regex = True) @@ -275,4 +279,24 @@ def clean_annotations(variant_set: pd.DataFrame, yaml_data: dict, logger) -> pd. variant_set[vcf_info_tag] = variant_set[vcf_info_tag].apply(lambda x: str(int(x)) ) variant_set.loc[variant_set[vcf_info_tag] == "-123456789", vcf_info_tag] = np.nan - return variant_set \ No newline at end of file + return variant_set + +def reverse_complement_dna(dna_string = "C"): + pairs = { + "A":"T", + "C":"G", + "G":"C", + "T":"A", + } + reverse_complement = "" + i = len(dna_string) - 1 + while i >= 0: + base = str(dna_string[i]).upper() + if base in pairs: + complement = pairs[base] + else: + complement = base + reverse_complement += complement + i = i - 1 + return reverse_complement + \ No newline at end of file diff --git a/pcgr/vep.py b/pcgr/vep.py index 0d93f281..f1fa5744 100644 --- a/pcgr/vep.py +++ b/pcgr/vep.py @@ -152,7 +152,8 @@ def get_csq_record_annotations(csq_fields, varkey, logger, vep_csq_fields_map, t def pick_single_gene_csq(vep_csq_results, pick_criteria_ordered = "mane_select,mane_plus_clinical,canonical,appris,tsl,biotype,ccds,rank,length", - logger = None): + logger = None, + debug = 0): 
csq_candidates = [] @@ -166,7 +167,7 @@ def pick_single_gene_csq(vep_csq_results, csq_candidate['mane_select'] = 1 csq_candidate['mane_plus_clinical'] = 1 csq_candidate['canonical'] = 1 - csq_candidate['appris'] = 8 + csq_candidate['appris'] = 18 csq_candidate['biotype'] = 1 csq_candidate['tsl'] = 6 csq_candidate['ccds'] = 1 @@ -175,6 +176,7 @@ def pick_single_gene_csq(vep_csq_results, ## set to picked as default csq_candidate['PICKED'] = True csq_candidate['varkey'] = csq_elem['VARKEY'] + csq_candidate['conskey'] = str(csq_elem['SYMBOL']) + ':' + str(csq_elem['Consequence']) ## MANE select status - lower value prioritized if not csq_elem['MANE_SELECT'] is None: @@ -185,16 +187,16 @@ def pick_single_gene_csq(vep_csq_results, csq_candidate['mane_plus_clinical'] = 0 ## CANONICAL status - lower value prioritized - if not csq_elem['CANONICAL'] is None: - if csq_elem['CANONICAL'] is True: + if not csq_elem['CANONICAL'] is None: + if csq_elem['CANONICAL'] == 'YES': csq_candidate['canonical'] = 0 ## APPRIS level - lower value prioritized - if not csq_elem['APPRIS'] is None: + if not csq_elem['APPRIS'] is None: if not 'ALTERNATIVE' in csq_elem['APPRIS']: csq_candidate['appris'] = int(re.sub(r'[A-Z]{1,}:?', '', csq_elem['APPRIS'])) else: - csq_candidate['appris'] = int(re.sub(r'ALTERNATIVE:','', csq_elem['APPRIS'])) + 5 + csq_candidate['appris'] = int(re.sub(r'ALTERNATIVE:','', csq_elem['APPRIS'])) + 10 ## Biotype - lower value prioritized if not csq_elem['BIOTYPE'] is None: @@ -216,11 +218,24 @@ def pick_single_gene_csq(vep_csq_results, logger.warn(warn_msg) ## TSL - lower value prioritized - if not csq_elem['TSL'] is None: + if not csq_elem['TSL'] is None: csq_candidate['tsl'] = int(csq_elem['TSL']) csq_candidates.append(csq_candidate) + # if debug: + # print() + # for c in csq_candidates: + # all_rank_criterions = [] + # all_rank_criterions.append('PICKED:' + str(c['PICKED'])) + # all_rank_criterions.append(c['varkey']) + # all_rank_criterions.append(c['conskey']) + # 
for rank_criterion in pick_criteria_ordered.split(','): + # if rank_criterion in c: + # all_rank_criterions.append(rank_criterion + ':' + str(c[rank_criterion])) + # rank_str = ' - '.join(map(str, all_rank_criterions)) + # print(rank_str) + # print() ## Go through pick criteria in pre-defined order ## - set 'PICKED' = False for all csq elements with a score above the minimum value for a given criterion @@ -248,6 +263,20 @@ def pick_single_gene_csq(vep_csq_results, j = j + 1 if num_picked == 1: + # if debug: + # print() + # for c in csq_candidates: + # all_rank_criterions = [] + # all_rank_criterions.append('PICKED:' + str(c['PICKED'])) + # all_rank_criterions.append(c['varkey']) + # all_rank_criterions.append(c['conskey']) + # for rank_criterion in pick_criteria_ordered.split(','): + # if rank_criterion in c: + # all_rank_criterions.append(rank_criterion + ':' + str(c[rank_criterion])) + + # rank_str = ' - '.join(map(str, all_rank_criterions)) + # print(rank_str) + # print() break return(chosen_csq_index) @@ -340,6 +369,12 @@ def parse_vep_csq(rec, transcript_xref_map, vep_csq_fields_map, vep_pick_order, symbol = str(csq_fields[vep_csq_fields_map['field2index']['SYMBOL']]) if csq_fields[vep_csq_fields_map['field2index']['HGVSc']] != "": hgvsc = str(csq_fields[vep_csq_fields_map['field2index']['HGVSc']].split(':')[1]) + else: + if len(all_csq_pick) == 1: + if 'HGVSc' in all_csq_pick[0]: + if not all_csq_pick[0]['HGVSc'] is None: + if ':' in all_csq_pick[0]['HGVSc']: + hgvsc = str(all_csq_pick[0]['HGVSc'].split(':')[1]) if csq_fields[vep_csq_fields_map['field2index']['HGVSp']] != "": hgvsp = str(csq_fields[vep_csq_fields_map['field2index']['HGVSp']].split(':')[1]) consequence_entry = (str(csq_fields[vep_csq_fields_map['field2index']['Consequence']]) + ":" + @@ -378,7 +413,7 @@ def parse_vep_csq(rec, transcript_xref_map, vep_csq_fields_map, vep_pick_order, ## If multiple transcript-specific variant consequences highlighted by --pick_allele_gene, ## prioritize/choose 
block of consequence according to 'vep_pick_order' if len(vep_csq_results['picked_gene_csq']) > 1: - vep_chosen_csq_idx = pick_single_gene_csq(vep_csq_results, pick_criteria_ordered = vep_pick_order, logger = logger) + vep_chosen_csq_idx = pick_single_gene_csq(vep_csq_results, pick_criteria_ordered = vep_pick_order, logger = logger, debug = debug) vep_csq_results['picked_csq'] = vep_csq_results['picked_gene_csq'][vep_chosen_csq_idx] else: ## check that size if 1, otherwise prompt error below diff --git a/pcgrr/NAMESPACE b/pcgrr/NAMESPACE index 5381731a..1b26c969 100644 --- a/pcgrr/NAMESPACE +++ b/pcgrr/NAMESPACE @@ -46,6 +46,7 @@ export(get_excel_sheets) export(get_genome_obj) export(get_prevalent_site_signatures) export(get_valid_chromosomes) +export(get_variant_statistics) export(het_af_germline_status) export(hom_af_status) export(init_cna_vstats) @@ -63,6 +64,7 @@ export(init_var_content) export(kataegis_detect) export(kataegis_input) export(load_all_eitems) +export(load_cpsr_classified_variants) export(load_dna_variants) export(load_eitems) export(load_expression_csq) @@ -82,6 +84,7 @@ export(mkdir) export(msi_indel_fraction_plot) export(msi_indel_load_plot) export(order_variants) +export(plot_cna_segments) export(plot_tmb_primary_site_tcga) export(plot_value_boxes) export(pon_status) diff --git a/pcgrr/R/cna.R b/pcgrr/R/cna.R new file mode 100644 index 00000000..f13f9dc2 --- /dev/null +++ b/pcgrr/R/cna.R @@ -0,0 +1,309 @@ +#' Plot allele-specific copy number segments +#' +#' Function that plots allele-specific copy number segments +#' (minor + total allele copies) +#' +#' @param chrom_coordinates data frame with assembly-specific chromosome coordinate data (length etc) +#' @param cna_segment data frame with annotated copy number segments +#' @param cna_gene data frame with gene-level copy number data +#' +#' @export +#' +plot_cna_segments <- function(chrom_coordinates = NULL, + cna_segment = NULL, + cna_gene = NULL){ + + ## Validate input + 
invisible(assertthat::assert_that( + !is.null(chrom_coordinates), + !is.null(cna_segment), + !is.null(cna_gene) + )) + + invisible(assertthat::assert_that( + base::is.data.frame(cna_segment), + base::is.data.frame(cna_gene), + base::is.data.frame(chrom_coordinates) + )) + + ## Check required column names + assertable::assert_colnames( + chrom_coordinates, + c("chrom", + "genome_start", + "genome_end", + "length", + "centromere_left", + "centromere_right"), + only_colnames = F, + quiet = T + ) + + ## Add centromere and midpoint coordinates to chrom_coordinates + reference_coordinates <- chrom_coordinates |> + dplyr::mutate(chrom = paste0("chr", .data$chrom)) |> + dplyr::mutate( + centromere_genome_start = + .data$genome_start + .data$centromere_left, + centromere_genome_end = + .data$genome_start + .data$centromere_right, + midpoint = round( + (.data$genome_start + .data$genome_end) / 2)) + + + ## Check required column names of cna_segments data frame + assertable::assert_colnames( + cna_segment, + c("CHROM", + "SEGMENT_START", + "SEGMENT_END", + "CN_MAJOR", + "CN_MINOR", + "CN_TOTAL", + "EVENT_TYPE"), + only_colnames = F, + quiet = T + ) + + assertable::assert_colnames( + cna_gene, + c("CHROM", + "SEGMENT_START", + "SEGMENT_END", + "CN_MAJOR", + "CN_MINOR", + "CN_TOTAL", + "ONCOGENE", + "ONCOGENE_RANK", + "TUMOR_SUPPRESSOR", + "TUMOR_SUPPRESSOR_RANK", + "SYMBOL", + "VARIANT_CLASS"), + only_colnames = F, + quiet = T + ) + + + ## Identify segments that involve oncogene gain or tumor suppressor loss + onc_gain_tsg_loss <- cna_gene |> + dplyr::select( + c("CHROM", "SEGMENT_START", "SEGMENT_END", + "VARIANT_CLASS","ONCOGENE", "ONCOGENE_RANK", + "TUMOR_SUPPRESSOR","TUMOR_SUPPRESSOR_RANK", "SYMBOL")) |> + dplyr::mutate(CHROM = paste0("chr", .data$CHROM)) |> + dplyr::filter( + (.data$ONCOGENE == TRUE & + .data$VARIANT_CLASS == "gain") | + (.data$TUMOR_SUPPRESSOR == TRUE & + .data$VARIANT_CLASS == "homdel")) + + tsg_loss <- data.frame() + onc_gain <- data.frame() + + ## If 
there are oncogene gains or tumor suppressor losses, prepare data for plotting + if(NROW(onc_gain_tsg_loss) > 0){ + onc_gain <- onc_gain_tsg_loss |> + dplyr::filter( + .data$ONCOGENE == TRUE & + .data$VARIANT_CLASS == "gain") + + ## For now, if multiple oncogenes are involved in an amplified segment, we will only + ## show the top three in the plot (hover) + if(NROW(onc_gain) > 0){ + onc_gain <- onc_gain |> + dplyr::arrange(dplyr::desc(.data$ONCOGENE_RANK)) |> + dplyr::group_by( + .data$CHROM, + .data$SEGMENT_START, + .data$SEGMENT_END) |> + dplyr::summarise( + ONC_GAIN = paste( + utils::head(.data$SYMBOL, 3), collapse = ", "), + .groups = "drop") + } + + tsg_loss <- onc_gain_tsg_loss |> + dplyr::filter( + .data$TUMOR_SUPPRESSOR == TRUE & + .data$VARIANT_CLASS == "homdel") + + ## For now, if multiple TSGs are involved in a lost segment, we will only + ## show the top three in the plot (hover) + if(NROW(tsg_loss) > 0){ + tsg_loss <- tsg_loss |> + dplyr::arrange( + dplyr::desc(.data$TUMOR_SUPPRESSOR_RANK)) |> + dplyr::group_by( + .data$CHROM, + .data$SEGMENT_START, + .data$SEGMENT_END) |> + dplyr::summarise( + TSG_LOSS = paste( + utils::head(.data$SYMBOL, 3), collapse = ", "), + .groups = "drop") + } + + } + + ## Prepare data for plotting + ## - pull out core segment elements - ignoring gene/transcript annotations + ## - add segment size in Mb/Kb + ## - add segment start and end positions in genome coordinates + cna_segments_global <- cna_segment |> + dplyr::select( + c("CHROM", "SEGMENT_START", "SEGMENT_END", + "CYTOBAND","EVENT_TYPE","CN_MINOR", + "CN_TOTAL","EVENT_TYPE")) |> + dplyr::mutate(CHROM = paste0("chr", .data$CHROM)) |> + dplyr::left_join( + dplyr::select( + reference_coordinates, + c("chrom", "genome_start","genome_end")), + by = c("CHROM" = "chrom"), + ) |> + dplyr::mutate( + segsize = round((.data$SEGMENT_END - + .data$SEGMENT_START) / 1000000, + digits = 3)) |> + dplyr::mutate(segsize = dplyr::if_else( + .data$segsize >= 1, + 
paste0(round(.data$segsize, digits = 1), "Mb"), + paste0(.data$segsize * 1000, "Kb"))) |> + dplyr::mutate( + SegmentStart = .data$genome_start + .data$SEGMENT_START, + SegmentEnd = .data$genome_start + .data$SEGMENT_END, + SegmentInfo = paste0(paste( + .data$CHROM, paste( + scales::comma(.data$SEGMENT_START), + scales::comma(.data$SEGMENT_END), + sep = "-"), + sep = ":"), " (",.data$segsize,")
- ", + .data$CYTOBAND, " (", .data$EVENT_TYPE,")")) |> + dplyr::select( + -c("genome_start","EVENT_TYPE","segsize")) |> + dplyr::distinct() + + + ## Add information about lost tumor suppressors and gained oncogenes + ## to SegmentInfo column + if(NROW(tsg_loss) > 0){ + cna_segments_global <- cna_segments_global |> + dplyr::left_join( + tsg_loss, + by = c("CHROM", "SEGMENT_START", "SEGMENT_END") + ) |> + dplyr::mutate(SegmentInfo = dplyr::if_else( + !is.na(.data$TSG_LOSS), + paste0( + .data$SegmentInfo, + "
- Tumor suppressor loss: ", + .data$TSG_LOSS), + .data$SegmentInfo)) + }else{ + cna_segments_global$TSG_LOSS <- as.character(NA) + } + + if(NROW(onc_gain) > 0){ + cna_segments_global <- cna_segments_global |> + dplyr::left_join( + onc_gain, + by = c("CHROM", "SEGMENT_START", "SEGMENT_END") + ) |> + dplyr::mutate(SegmentInfo = dplyr::if_else( + !is.na(.data$ONC_GAIN), + paste0(.data$SegmentInfo, + "
- Oncogene gain: ", + .data$ONC_GAIN), + .data$SegmentInfo)) + }else{ + cna_segments_global$ONC_GAIN <- as.character(NA) + } + + + low = min(reference_coordinates$genome_start) + upp = max(reference_coordinates$genome_end) + y_max <- max(unique(cna_segments_global$CN_TOTAL)) + + y_max_display <- y_max + 1 + if(y_max %% 2 == 0){ + y_max_display <- y_max + 2 + } + if(y_max > 15){ + y_max_display <- ceiling(y_max/5)*5 + } + + y_axis_interval <- 1 + y_breaks <- seq(0, y_max_display, by = y_axis_interval) + if(y_max > 5 & y_max <= 10){ + y_axis_interval <- 2 + y_breaks <- c(0,1, seq(2, y_max_display, by = y_axis_interval)) + }else{ + if(y_max > 10 & y_max <= 15){ + y_axis_interval <- 2 + y_breaks <- c( + 0,1, seq(2, y_max_display, by = y_axis_interval)) + }else{ + if(y_max > 15 & y_max <= 30){ + y_axis_interval <- 5 + y_breaks <- c( + 0,1,2,4,seq(5, y_max_display, by = y_axis_interval)) + }else{ + if(y_max > 30){ + y_axis_interval <- 10 + y_breaks <- c( + 0,1,2,5, seq(10, y_max_display, by = y_axis_interval)) + } + } + } + } + + ## Make plot - total copy number + minor copy number + cna_plot <- ggplot2::ggplot( + cna_segments_global, + ggplot2::aes( + x = .data$SegmentStart, + y = .data$CN_TOTAL, z = .data$SegmentInfo)) + + ggplot2::geom_segment( + data = cna_segments_global |> + dplyr::mutate(Track = "Minor copy number"), + ggplot2::aes( + x = .data$SegmentStart, + xend = .data$SegmentEnd, + y = .data$CN_MINOR, + yend = .data$CN_MINOR, + colour = .data$Track), linewidth = 1) + + ggplot2::geom_segment( + data = cna_segments_global |> + dplyr::mutate(Track = "Total copy number"), + ggplot2::aes( + x = .data$SegmentStart, + xend = .data$SegmentEnd, + y = .data$CN_TOTAL, + yend = .data$CN_TOTAL, + colour = .data$Track), linewidth = 1.6) + + ggplot2::scale_color_manual( + values = c(`Total copy number` = "black", + `Minor copy number` = "firebrick3")) + + ggplot2::geom_vline( + xintercept = unique( + c(0,as.vector(cna_segments_global$genome_end),upp)), + linetype="dotted", 
colour = "gray") + + ggplot2::scale_x_continuous( + breaks = c(0, reference_coordinates$midpoint, upp), + labels = + c("", gsub(pattern = 'chr', + replacement = '', + reference_coordinates$chr), "")) + + ggplot2::theme_bw() + + ggplot2::xlab("Chromosome") + + ggplot2::ylab("Absolute allele counts") + + ggplot2::scale_y_continuous( + breaks = y_breaks) + + ggplot2::theme( + legend.position = "bottom", + legend.title = ggplot2::element_blank(), + legend.margin = ggplot2::margin( + 0, 0, 0, 0)) + +} + diff --git a/pcgrr/R/expression.R b/pcgrr/R/expression.R index 786739ef..1d02e901 100644 --- a/pcgrr/R/expression.R +++ b/pcgrr/R/expression.R @@ -38,14 +38,19 @@ generate_report_data_expression <- pcg_report_expression[["expression"]] <- exp_data if("SYMBOL" %in% colnames(exp_data) == FALSE | - "TPM" %in% colnames(exp_data) == FALSE | + ("TPM" %in% colnames(exp_data) == FALSE & + "TPM_GENE" %in% colnames(exp_data) == FALSE) | "BIOTYPE" %in% colnames(exp_data) == FALSE){ pcgrr::log4r_warn( - "Missing a required column in expression file: SYMBOL, TPM, BIOTYPE") + "Missing a required column in expression file: SYMBOL, TPM/TPM_GENE, BIOTYPE") }else{ n_pc <- sum(exp_data$BIOTYPE == "protein_coding") + if("TPM_GENE" %in% colnames(exp_data)){ + exp_data$TPM <- as.numeric(exp_data$TPM_GENE) + } + if(n_pc > 0){ pcgrr::log4r_info( "Estimating immune contexture of tumor sample from RNA-seq data") diff --git a/pcgrr/R/input_data.R b/pcgrr/R/input_data.R index 5910db58..d1c374a5 100644 --- a/pcgrr/R/input_data.R +++ b/pcgrr/R/input_data.R @@ -1,13 +1,15 @@ -#' Function that reads and validates a fully annotated CNA file from PCGR -#' pre-report pipeline +#' Function that reads and validates fully annotated CNA data (segments and genes) +#' from PCGR pre-reporting pipeline #' -#' @param fname Path to file with pre-processed CNA segments +#' @param fname_cna_segment Path to file with pre-processed CNA segments +#' @param fname_cna_gene Path to file with pre-processed CNA gene-level 
data #' @param ref_data PCGR reference data object #' @param settings PCGR run/configuration settings #' #' @export load_somatic_cna <- function( - fname, + fname_cna_segment = NULL, + fname_cna_gene = NULL, ref_data = NULL, settings = NULL) { @@ -23,9 +25,63 @@ load_somatic_cna <- function( tumor_site <- settings[['conf']][['sample_properties']][['site']] + ## read segments + pcgrr::check_file_exists(fname_cna_segment) + pcgrr::check_file_exists(fname_cna_gene) + segments_raw <- suppressWarnings( + as.data.frame( + readr::read_tsv( + file = fname_cna_segment, + na = c(".","NA"), + show_col_types = F, + progress = F + ) + ) + ) + + compulsary_cols <- + names(pcgrr::data_coltype_defs$cna_somatic_segment_raw$cols) + + raw_col_check <- + rlang::has_name(segments_raw, compulsary_cols) + if (FALSE %in% raw_col_check) { + missing_cols <- + compulsary_cols[!raw_col_check] + log4r_fatal( + paste0("Missing required columns in input file ", + fname_cna_segment, " - ", + paste(missing_cols, collapse=", "))) + } + + segments <- suppressWarnings( + as.data.frame( + readr::read_tsv( + file = fname_cna_segment, + col_types = + pcgrr::data_coltype_defs$cna_somatic_segment_raw$cols, + na = c(".","NA"), + progress = F + ) + )) |> + tidyr::separate( + col = "SEGMENT_NAME", + into = c("SEGMENT_ID", "N_MAJOR","N_MINOR","ARM","CYTOBAND","EVENT_TYPE"), + sep = "\\|", + remove = T + ) |> + dplyr::mutate( + CN_TOTAL = as.integer( + as.integer(.data$N_MAJOR) + as.integer(.data$N_MINOR)) + ) |> + dplyr::rename(CN_MINOR = "N_MINOR", + CN_MAJOR = "N_MAJOR") |> + dplyr::mutate(CN_MINOR = as.integer(.data$CN_MINOR), + CN_MAJOR = as.integer(.data$CN_MAJOR)) + + callset_cna <- pcgrr::load_dna_variants( - fname = fname, - cols = pcgrr::data_coltype_defs$cna_somatic_raw, + fname = fname_cna_gene, + cols = pcgrr::data_coltype_defs$cna_somatic_gene_raw, ref_data = ref_data, vartype = 'cna', primary_site = @@ -33,6 +89,7 @@ load_somatic_cna <- function( retained_info_tags = "None", variant_origin = 
"Somatic") + callset_cna[['segment']] <- segments if (NROW(callset_cna$variant) > 0) { callset_cna[['variant']] <- callset_cna[['variant']] |> @@ -162,8 +219,6 @@ load_somatic_snv_indel <- function( pcgrr::append_targeted_drug_annotations( ref_data = ref_data, primary_site = tumor_site) |> - tidyr::separate(.data$HGVSc, c("ENST", "tmp_HGVSc"), - sep = ":", remove = F) |> dplyr::mutate( MOLECULAR_ALTERATION = dplyr::case_when( is.na(.data$SYMBOL) & @@ -173,19 +228,25 @@ load_somatic_snv_indel <- function( stringr::str_detect( .data$CONSEQUENCE, "^(splice_acceptor|splice_donor)") & !is.na(.data$SYMBOL) & - !is.na(.data$tmp_HGVSc) ~ + !is.na(.data$HGVSc) ~ paste0(.data$SYMBOL," ", stringr::str_replace_all( .data$CONSEQUENCE,"&",", "), " - ", - .data$tmp_HGVSc), + .data$HGVSc), .data$EXONIC_STATUS == "exonic" & + !stringr::str_detect( + .data$CONSEQUENCE, "^(splice_acceptor|splice_donor)") & !is.na(.data$CONSEQUENCE) & + !is.na(.data$HGVSc) & + !is.na(.data$SYMBOL) & !is.na(.data$HGVSP) ~ paste0(.data$SYMBOL," ", stringr::str_replace_all( .data$CONSEQUENCE, "&",", "), " - ", + .data$HGVSc, + " - ", .data$HGVSP), TRUE ~ as.character(paste0( .data$SYMBOL," ", @@ -193,7 +254,6 @@ load_somatic_snv_indel <- function( .data$CONSEQUENCE,"&",", ")) ) )) |> - dplyr::select(-c("tmp_HGVSc","ENST")) |> pcgrr::order_variants(pos_var = 'POS') |> pcgrr::exclude_non_chrom_variants() @@ -221,11 +281,6 @@ load_somatic_snv_indel <- function( dplyr::filter( .data$SOMATIC_CLASSIFICATION == "SOMATIC") - ## filter also MAF file if provided - pcgrr::filter_maf_file( - callset = callset, - settings = settings) - ## Issue warning if clinically actionable variants are filtered ## with current filtering settings n_actionable_filtered <- @@ -275,6 +330,11 @@ load_somatic_snv_indel <- function( } } + ## filter also MAF file if provided + pcgrr::filter_maf_file( + callset = callset, + settings = settings) + }else{ pcgrr::log4r_fatal( "Variant data.frame is lacking a 'SOMATIC_CLASSIFICATION' column") 
@@ -285,9 +345,10 @@ load_somatic_snv_indel <- function( callset[['variant']] <- callset[['variant']] |> dplyr::arrange( .data$ACTIONABILITY_TIER, + dplyr::desc(.data$ONCOGENICITY_SCORE), dplyr::desc(.data$GLOBAL_ASSOC_RANK), - dplyr::desc(.data$TISSUE_ASSOC_RANK), - dplyr::desc(.data$ONCOGENICITY_SCORE)) + dplyr::desc(.data$TISSUE_ASSOC_RANK)) + ## Make data frame with columns for display ## in HTML output @@ -324,6 +385,141 @@ load_somatic_snv_indel <- function( return(callset) +} + +#' Function that reads CPSR-classified variants from a TSV file +#' +#' @param fname_cpsr_tsv Path to raw input file with CPSR-classified SNVs/InDels +#' @param fname_cpsr_yaml Path to YAML configuration file for CPSR analysis +#' @param cols column type definitions of raw input file +#' @param ignore_vus logical indicating if VUS should be ignored in report +#' @param ref_data PCGR reference data object +#' +#' @export +#' +load_cpsr_classified_variants <- function( + fname_cpsr_tsv = NA, + fname_cpsr_yaml = NA, + cols = NULL, + ignore_vus = FALSE, + ref_data = NULL){ + + pcgrr::log4r_info("------") + pcgrr::log4r_info(paste0( + "Reading annotated molecular dataset (DNA) - germline SNV/InDels (CPSR-classified)")) + + pcgrr::check_file_exists(fname_cpsr_tsv) + pcgrr::check_file_exists(fname_cpsr_yaml) + + if (!file.exists(fname_cpsr_yaml)) { + log4r_fatal( + paste0("YAML file '", fname_cpsr_yaml, "' does not exist - exiting")) + } + cpsr_yaml <- yaml::read_yaml(fname_cpsr_yaml) + if("conf" %in% names(cpsr_yaml) == FALSE){ + pcgrr::log4r_fatal( + paste0( + "YAML file '", fname_cpsr_yaml, + "' does not contain a 'conf' section - exiting")) + } + if("sample_id" %in% names(cpsr_yaml) == FALSE){ + pcgrr::log4r_fatal( + paste0( + "YAML file '", fname_cpsr_yaml, + "' does not contain a 'sample_id' variable - exiting")) + } + if("gene_panel" %in% names(cpsr_yaml$conf) == FALSE){ + pcgrr::log4r_fatal( + paste0( + "YAML file '", fname_cpsr_yaml, + "' does not contain a 'conf->gene_panel' 
section - exiting")) + } + panel_info <- list() + panel_info[['description']] <- + cpsr_yaml$conf$gene_panel[['description']] + panel_info[['description_trait']] <- + cpsr_yaml$conf$gene_panel[['description_trait']] + panel_info[['url']] <- cpsr_yaml$conf$gene_panel[['url']] + panel_info[['panel_id']] <- cpsr_yaml$conf$gene_panel[['panel_id']] + sample_id <- cpsr_yaml$sample_id + + callset <- pcgrr::load_dna_variants( + fname = fname_cpsr_tsv, + cols = cols, + ref_data = ref_data, + vartype = 'snv_indel', + variant_origin = 'Germline') + + callset[['variant_display']] <- callset[['variant']] |> + pcgrr::append_cancer_gene_evidence( + ref_data = ref_data) |> + dplyr::mutate( + CLINVAR_TRAITS_ALL = paste( + stringr::str_to_title(.data$CLINVAR_VARIANT_ORIGIN), + .data$CLINVAR_PHENOTYPE, + sep = " - ")) |> + pcgrr::append_annotation_links() |> + dplyr::select( + -dplyr::contains("_RAW") + ) |> + dplyr::mutate( + CONSEQUENCE = stringr::str_replace_all( + .data$CONSEQUENCE,"&",", ")) |> + dplyr::rename( + SOURCE = .data$CPSR_CLASSIFICATION_SOURCE, + CLINICAL_SIGNIFICANCE = .data$FINAL_CLASSIFICATION + ) |> + dplyr::select( + -dplyr::any_of( + c("CLINVAR_TRAITS_ALL", + "CLINVAR_VARIANT_ORIGIN", + "CLINVAR_PHENOTYPE", + "PFAM_DOMAIN", + "CANCERGENE_EVIDENCE", + "PFAM_DOMAIN_NAME", + "PROTEIN_CHANGE", + "CLINVAR_MSID", + "VAR_ID", + "ENTREZGENE"))) |> + dplyr::select( + dplyr::any_of( + c("SYMBOL","ALTERATION","GENOTYPE","CONSEQUENCE", + "CLINICAL_SIGNIFICANCE","SOURCE","PROTEIN_DOMAIN", + "HGVSc", "HGVSc_RefSeq", "HGVSp", "CDS_CHANGE", + "CODING_STATUS", + "LOSS_OF_FUNCTION", "DP_CONTROL", + "VARIANT_CLASS","GENENAME", + "ONCOGENE","TUMOR_SUPPRESSOR","ENSEMBL_GENE_ID", + "ENSEMBL_TRANSCRIPT_ID","REFSEQ_TRANSCRIPT_ID", + "DBSNP_RSID", + "CLINVAR","CLINVAR_CLASSIFICATION","CLINVAR_CONFLICTED", + "CLINVAR_REVIEW_STATUS_STARS", + "CPSR_PATHOGENICITY_SCORE", + "CPSR_CLASSIFICATION", + "CPSR_CLASSIFICATION_CODE")), + dplyr::everything() + ) + + 
if(NROW(callset[['variant_display']]) > 0){ + callset[['variant_display']] <- callset[['variant_display']] |> + dplyr::filter((.data$CLINICAL_SIGNIFICANCE == "Pathogenic" | + .data$CLINICAL_SIGNIFICANCE == "Likely_Pathogenic" | + .data$CLINICAL_SIGNIFICANCE == "VUS") & + .data$CODING_STATUS == "coding")|> + dplyr::distinct() + if(ignore_vus == TRUE){ + callset[['variant_display']] <- callset[['variant_display']] |> + dplyr::filter(.data$CLINICAL_SIGNIFICANCE != "VUS") |> + dplyr::distinct() + } + } + + return(list('callset' = callset, + 'panel_info' = panel_info, + 'sample_id' = sample_id, + eval = TRUE)) + + } #' Function that reads and validates CNA or SNV/InDel TSV files @@ -855,9 +1051,10 @@ load_dna_variants <- function( ) } - }else{ - log4r_fatal("Input data does not contain 'BIOMARKER_MATCH' column - fatal") } + #else{ + #log4r_fatal("Input data does not contain 'BIOMARKER_MATCH' column - fatal") + #} results[['retained_info_tags']] <- paste( retained_cols_renamed, collapse="," @@ -911,13 +1108,15 @@ load_expression_similarity <- function(settings = NULL){ #' @param ref_data PCGR reference data object #' @param percentile_cutoff_high numeric, percentile cutoff for high expression #' @param percentile_cutoff_low numeric, percentile cutoff for low expression +#' @param z_score_cutoff numeric, z-score cutoff for expression outliers #' #' @export load_expression_outliers <- function( settings = NULL, ref_data = NULL, - percentile_cutoff_high = 90, - percentile_cutoff_low = 10){ + percentile_cutoff_high = 95, + percentile_cutoff_low = 5, + z_score_cutoff = 1.5){ ## Load expression outlier results for input sample ## against reference collections @@ -994,34 +1193,28 @@ load_expression_outliers <- function( "ENSEMBL_TRANSCRIPT_ID")) |> dplyr::select( -dplyr::contains("_RAW")) |> - dplyr::mutate(kIQR = dplyr::if_else( - !is.na(.data$IQR) & - .data$IQR > 0 & - !is.na(.data$TPM_GENE) & - .data$TPM_GENE > 0 & - !is.na(.data$Q2), - as.numeric(.data$TPM_GENE - .data$Q2) 
/ .data$IQR, - 0 - )) |> ## define criteria for expression outliers - dplyr::mutate(OUTLIER = dplyr::case_when( + dplyr::mutate(EXPR_OUTLIER = dplyr::case_when( PERCENTILE >= percentile_cutoff_high & - TPM_GENE > (Q3 + 1.5 * IQR) ~ TRUE, + .data$IQR > 0 & + TPM_LOG2_GENE > (.data$Q3 + (1.5 * .data$IQR)) & + abs(.data$Z_SCORE) > z_score_cutoff ~ "Increased expression", PERCENTILE <= percentile_cutoff_low & - TPM_GENE < (Q1 - 1.5 * IQR) ~ TRUE, - TRUE ~ FALSE + .data$IQR > 0 & + TPM_LOG2_GENE < (.data$Q1 - (1.5 * .data$IQR)) & + abs(.data$Z_SCORE) > z_score_cutoff ~ "Reduced expression", + TRUE ~ as.character(NA) )) |> dplyr::filter( - .data$TPM_GENE > 1 & - .data$IQR > 0 & - .data$GENE_BIOTYPE == "protein_coding") + .data$GENE_BIOTYPE == "protein_coding") |> + dplyr::mutate(Z_SCORE = round(.data$Z_SCORE, 1)) if(NROW(expression_outliers) == 0){ return(expression_outliers) } expression_outliers <- expression_outliers |> - dplyr::filter(.data$OUTLIER == TRUE) + dplyr::filter(!is.na(.data$EXPR_OUTLIER)) if(NROW(expression_outliers) == 0){ return(expression_outliers) @@ -1043,17 +1236,27 @@ load_expression_outliers <- function( expression_outliers <- expression_outliers |> dplyr::arrange( - dplyr::desc(.data$kIQR), + dplyr::desc(abs(.data$Z_SCORE)), dplyr::desc(.data$GLOBAL_ASSOC_RANK), dplyr::desc(.data$TISSUE_ASSOC_RANK)) |> - dplyr::mutate(TPM_GENE = round( - .data$TPM_GENE, digits = 2)) |> + dplyr::mutate(EXPR_LEVEL = round( + .data$TPM_LOG2_GENE, digits = 3)) |> + dplyr::rename(REF_COHORT_IQR = "IQR") |> + dplyr::mutate(REF_COHORT_QUARTILES = paste( + paste0("Q1: ",.data$Q1), + paste0("Q2: ",.data$Q2), + paste0("Q3: ",.data$Q3), + sep = " | ")) |> dplyr::select( c("SYMBOL", "GENENAME", - "TPM_GENE", + "EXPR_LEVEL", + "EXPR_OUTLIER", + "Z_SCORE", "REF_COHORT", "REF_COHORT_SIZE", + "REF_COHORT_IQR", + "REF_COHORT_QUARTILES", "PERCENTILE", "TUMOR_SUPPRESSOR", "ONCOGENE", @@ -1062,8 +1265,7 @@ load_expression_outliers <- function( "GENE_BIOTYPE", "GLOBAL_ASSOC_RANK", 
"TISSUE_ASSOC_RANK", - "CANCERGENE_EVIDENCE", - "IQR")) |> + "CANCERGENE_EVIDENCE")) |> dplyr::mutate(REF_COHORT = toupper( stringr::str_replace_all( .data$REF_COHORT,"_","-") diff --git a/pcgrr/R/maf.R b/pcgrr/R/maf.R index efbd0cb0..75c32283 100644 --- a/pcgrr/R/maf.R +++ b/pcgrr/R/maf.R @@ -56,7 +56,7 @@ filter_maf_file <- function(callset, settings) { TRUE ~ "MNP" )) |> dplyr::mutate( - Chromosome = .data$CHROM, + Chromosome = as.character(.data$CHROM), Start_Position = dplyr::case_when( .data$Variant_Type == "DEL" & substr(.data$REF, 1, 1) == .data$ALT ~ .data$POS + 1, @@ -88,6 +88,24 @@ filter_maf_file <- function(callset, settings) { show_col_types = F, col_names = T, comment = "#", na = "" ) + + if(NROW(maf_data_unfiltered) == 0) { + pcgrr::log4r_warn("MAF file is empty - no filtering will be performed") + return(0) + }else{ + maf_data_unfiltered$Chromosome <- + as.character(maf_data_unfiltered$Chromosome) + + if(is.logical(maf_data_unfiltered$Tumor_Seq_Allele1)) { + maf_data_unfiltered$Tumor_Seq_Allele1 <- + as.character("T") + } + if(is.logical(maf_data_unfiltered$Tumor_Seq_Allele2)) { + maf_data_unfiltered$Tumor_Seq_Allele2 <- + as.character("T") + } + } + } else { pcgrr::log4r_warn("MAF file is empty - no filtering will be performed") return(0) @@ -109,8 +127,10 @@ filter_maf_file <- function(callset, settings) { maf_data_filtered <- maf_data_unfiltered |> dplyr::semi_join( filtered_vars_maf_like, - by = c("Chromosome", "Start_Position", - "Tumor_Seq_Allele1", "Tumor_Seq_Allele2", + by = c("Chromosome", + "Start_Position", + "Tumor_Seq_Allele1", + "Tumor_Seq_Allele2", "Variant_Type") ) diff --git a/pcgrr/R/main.R b/pcgrr/R/main.R index c3a76269..0f54c5e1 100644 --- a/pcgrr/R/main.R +++ b/pcgrr/R/main.R @@ -47,7 +47,19 @@ generate_report <- settings = settings) } - + ## Load pre-classified germline variants (output from CPSR) + if(settings$molecular_data$fname_germline_tsv != "None" & + file.exists(settings$molecular_data$fname_germline_tsv)){ + 
rep[['content']][['germline_classified']] <- + pcgrr::load_cpsr_classified_variants( + fname_cpsr_tsv = settings$molecular_data$fname_germline_tsv, + fname_cpsr_yaml = settings$molecular_data$fname_germline_yaml, + ignore_vus = as.logical(settings$conf$germline$ignore_vus), + cols = pcgrr::data_coltype_defs$snv_indel_germline_cpsr, + ref_data = ref_data + ) + rep[["content"]][['germline_classified']][["eval"]] <- TRUE + } conf_somatic_snv <- settings$conf$somatic_snv @@ -73,8 +85,6 @@ generate_report <- # a_elem = "clinicaltrials") #} - - rep[['content']][['snv_indel']][['callset']] <- callset_snv rep[['content']][['snv_indel']][['eval']] <- @@ -182,10 +192,12 @@ generate_report <- ## Load somatic CNA data if available callset_cna <- NULL - if (settings$molecular_data$fname_cna_tsv != "None") { + if (settings$molecular_data$fname_cna_gene_tsv != "None" & + settings$molecular_data$fname_cna_segment_tsv != "None") { callset_cna <- pcgrr::load_somatic_cna( - fname = settings$molecular_data$fname_cna_tsv, + fname_cna_segment = settings$molecular_data$fname_cna_segment_tsv, + fname_cna_gene = settings$molecular_data$fname_cna_gene_tsv, ref_data = ref_data, settings = settings ) @@ -197,12 +209,6 @@ generate_report <- callset = callset_cna, vartype = "cna", name = "vstats")[['vstats']] - #rep[['content']][['cna']][['cnaqc']] <- - # pcgrr::make_cnaqc_object( - # callset_cna = callset_cna, - # callset_snv = callset_snv, - # settings = settings - # ) rep[['content']][['cna']][['eval']] <- TRUE } @@ -764,7 +770,7 @@ generate_tier_tsv <- function(variant_set, #' #' @param report List object with all report data, settings etc. #' @param output_type character indicating output type for TSV, -#' i.e. 'snv_indel' or 'cna_gene', 'msigs' +#' i.e. 
'snv_indel', 'snv_indel_unfiltered', 'cna_gene', or 'msigs' #' @export #' write_report_tsv <- function(report = NULL, output_type = 'snv_indel'){ @@ -792,6 +798,12 @@ write_report_tsv <- function(report = NULL, output_type = 'snv_indel'){ report$content$snv_indel$eval == TRUE){ eval_output <- TRUE } + if(output_type == "snv_indel_unfiltered" & + report$content$snv_indel$eval == TRUE & + as.logical( + report$settings$conf$assay_properties$vcf_tumor_only) == TRUE){ + eval_output <- TRUE + } ## Mutational signatures @@ -847,6 +859,45 @@ write_report_tsv <- function(report = NULL, output_type = 'snv_indel'){ } } + ## SNVs/InDels + if(output_type == 'snv_indel_unfiltered' & + !is.null(report$content$snv_indel) & + report$content$snv_indel$eval == TRUE){ + + snv_indel_cols <- pcgrr::tsv_cols$snv_indel_unfiltered + if(report$settings$conf$other$retained_vcf_info_tags != "None"){ + snv_indel_cols <- c( + snv_indel_cols, report$settings$conf$other$retained_vcf_info_tags) + } + + if(!is.null(report$content$snv_indel$callset)){ + if(is.data.frame(report$content$snv_indel$callset$variant_unfiltered)){ + output_data <- as.data.frame( + report$content$snv_indel$callset$variant_unfiltered |> + dplyr::select( + dplyr::any_of(snv_indel_cols)) + ) + if(NROW(output_data) > 0 & + "SOMATIC_CLASSIFICATION" %in% colnames(output_data) & + "ACTIONABILITY_TIER" %in% colnames(output_data) & + "ONCOGENICITY_SCORE" %in% colnames(output_data)){ + output_data$somatic_score <- 0 + output_data <- output_data |> + dplyr::mutate(somatic_score = dplyr::case_when( + .data$SOMATIC_CLASSIFICATION == "Somatic" ~ 1, + TRUE ~ as.numeric(.data$somatic_score) + )) + output_data <- output_data |> + dplyr::arrange( + dplyr::desc(.data$somatic_score), + .data$ACTIONABILITY_TIER, + dplyr::desc(.data$ONCOGENICITY_SCORE)) |> + dplyr::select(-c("somatic_score")) + } + } + } + } + if(NROW(output_data) > 0){ pcgrr::log4r_info("------") pcgrr::log4r_info(paste0( @@ -925,6 +976,10 @@ write_report_quarto_html <- 
function(report = NULL){ ## Save sample PCGR report object in temporary quarto rendering directory rds_report_path <- file.path( tmp_quarto_dir, "pcgr_report.rds") + + ## Remove ref_data from report object + #report$settings$chrom_coordinates <- + # report$ref_data$assembly$chrom_coordinates report$ref_data <- NULL saveRDS(report, file = rds_report_path) @@ -1000,15 +1055,17 @@ write_report_excel <- function(report = NULL){ i <- 15 for(elem in c('SAMPLE_ASSAY', - 'SNV_INDEL', - 'SNV_INDEL_BIOMARKER', - 'CNA', - 'CNA_BIOMARKER', + 'SOMATIC_SNV_INDEL', + 'SOMATIC_SNV_INDEL_BIOMARKER', + 'SOMATIC_CNA', + 'SOMATIC_CNA_BIOMARKER', + 'GERMLINE_SNV_INDEL', 'TMB', 'MSI', 'MUTATIONAL_SIGNATURE', - 'KATAEGIS', - 'IMMUNE_CONTEXTURE')){ + 'KATAEGIS_EVENTS', + 'RNA_EXPRESSION_OUTLIERS', + 'RNA_IMMUNE_CONTEXTURE')){ if(elem %in% names(excel_output)){ if(is.data.frame(excel_output[[elem]]) & NROW(excel_output[[elem]]) > 0 & diff --git a/pcgrr/R/mutational_burden.R b/pcgrr/R/mutational_burden.R index ade01a47..abf3e781 100644 --- a/pcgrr/R/mutational_burden.R +++ b/pcgrr/R/mutational_burden.R @@ -103,7 +103,7 @@ plot_tmb_primary_site_tcga <- function( TMB = dplyr::case_when( tmb_display_type == "coding_and_silent" ~ as.numeric(.data$TMB_CODING_AND_SILENT) + 0.001, - tmb_display_type == "coding_non_ssilent" ~ + tmb_display_type == "coding_non_silent" ~ as.numeric(.data$TMB_CODING_NON_SILENT) + 0.001, tmb_display_type == "missense_only" ~ as.numeric(.data$TMB_MISSENSE_ONLY) + 0.001, diff --git a/pcgrr/R/mutational_signatures.R b/pcgrr/R/mutational_signatures.R index 78476526..013d19e9 100644 --- a/pcgrr/R/mutational_signatures.R +++ b/pcgrr/R/mutational_signatures.R @@ -423,7 +423,7 @@ generate_report_data_signatures <- dplyr::anti_join(missing_signatures, by = c("signature_id_group" = "signature_id")) |> dplyr::group_by( - group, prop_group + .data$group, .data$prop_group ) |> dplyr::summarise( signature_id_group = paste( @@ -764,15 +764,17 @@ generate_report_data_rainfall <- 
function(variant_set, labels <- gsub("chr", "", names(chr_length)) # position of chromosome labels - m <- c() + chrom_midpoints <- c() for (i in 2:length(chr_cum)) - m <- c(m, (chr_cum[i - 1] + chr_cum[i]) / 2) + chrom_midpoints <- c( + chrom_midpoints, (chr_cum[i - 1] + chr_cum[i]) / 2) # mutation characteristics type <- c() loc <- c() dist <- c() chrom <- c() + variant_id <- c() # for each chromosome #chromosomes <- @@ -786,28 +788,43 @@ generate_report_data_rainfall <- function(variant_set, loc <- c(loc, (chr_subset$POS + chr_cum[i])[-1]) dist <- c(dist, diff(chr_subset$POS)) chrom <- c(chrom, rep(chromosomes[i], n - 1)) + varid <- paste0(chr_subset$CHROM[-1], ":", + chr_subset$POS[-1],":", + chr_subset$REF[-1], ">", + chr_subset$ALT[-1]) + variant_id <- c(variant_id, varid) } - invisible(assertthat::assert_that(length(type) == length(loc) & - length(loc) == length(dist) & - length(chrom) == length(dist), - msg = "Length of type/loc/dist not identical")) - data <- data.frame(type = type, - location = loc, - distance = dist, - chromosome = chrom, - stringsAsFactors = F) + invisible( + assertthat::assert_that( + length(type) == length(loc) & + length(loc) == length(dist) & + length(chrom) == length(dist), + msg = "Length of type/loc/dist not identical")) + data <- data.frame( + type = type, + location = loc, + variant_id = variant_id, + dist2prev = dist, + chromosome = chrom, + stringsAsFactors = F) # Removes colors based on missing mutation types. This prevents colors from # shifting when comparing samples with low mutation counts. 
typesin <- sbs_types %in% sort(unique(data$type)) colors_selected <- colors[typesin] - ylim <- 1e+09 + #ylim <- 1e+09 + ylim <- 1e+10 pcg_report_rainfall[["eval"]] <- T pcg_report_rainfall[["rfdata"]] <- - list("data" = data, "intercept" = m, "ylim" = ylim, - "chr_cum" = chr_cum, "colors" = colors_selected, - "labels" = labels, "cex" = 0.8, "cex_text" = 3) + list("data" = data, + "chrom_midpoints" = chrom_midpoints, + "ylim" = ylim, + "chr_cum" = chr_cum, + "colors" = colors_selected, + "chrom_labels" = labels, + "cex" = 0.8, + "cex_text" = 3) } return(pcg_report_rainfall) } diff --git a/pcgrr/R/output_data.R b/pcgrr/R/output_data.R index fb97e04e..923a0d44 100644 --- a/pcgrr/R/output_data.R +++ b/pcgrr/R/output_data.R @@ -75,6 +75,7 @@ get_excel_sheets <- function(report = NULL){ excel_sheets[['TMB']] <- report$content$tmb$sample_estimate } } + ## Mutational signatures if(!is.null(report$content$mutational_signatures)){ if(report$content$mutational_signatures$missing_data == FALSE){ @@ -90,6 +91,20 @@ get_excel_sheets <- function(report = NULL){ } } + ## Kataegis events + if(!is.null(report$content$kataegis$events)){ + if(report$content$kataegis$eval == TRUE){ + if(NROW(report$content$kataegis$events) > 0){ + colnames(report$content$kataegis$events) <- + toupper(colnames(report$content$kataegis$events)) + + excel_sheets[['KATAEGIS_EVENTS']] <- + report$content$kataegis$events + } + } + } + + ## MSI if(!is.null(report$content$msi)){ if(report$content$msi$missing_data == FALSE){ @@ -113,9 +128,18 @@ get_excel_sheets <- function(report = NULL){ colnames(report$content$expression$immune_contexture) <- toupper(colnames(report$content$expression$immune_contexture)) - excel_sheets[['IMMUNE_CONTEXTURE']] <- + excel_sheets[['RNA_IMMUNE_CONTEXTURE']] <- report$content$expression$immune_contexture } + + if("outliers" %in% names(report$content$expression)){ + + excel_sheets[['RNA_EXPRESSION_OUTLIERS']] <- + report$content$expression$outliers |> + dplyr::select(-dplyr::any_of( 
+ c("GENENAME","CANCERGENE_EVIDENCE", + "TARGETED_INHIBITORS_ALL","ENSEMBL_GENE_ID"))) + } } ## Copy number alterations @@ -123,7 +147,7 @@ get_excel_sheets <- function(report = NULL){ report$content$cna$eval == TRUE){ if(NROW(report$content$cna$callset$variant) > 0){ - excel_sheets[['CNA']] <- as.data.frame( + excel_sheets[['SOMATIC_CNA']] <- as.data.frame( report$content$cna$callset$variant |> dplyr::select(dplyr::any_of(pcgrr::tsv_cols$cna)) |> dplyr::select(-dplyr::any_of("BIOMARKER_MATCH")) |> @@ -131,7 +155,7 @@ get_excel_sheets <- function(report = NULL){ ) ## Evidence items - biomarkers - excel_sheets[['CNA_BIOMARKER']] <- data.frame() + excel_sheets[['SOMATIC_CNA_BIOMARKER']] <- data.frame() i <- 1 while(i <= 2){ tier_data <- @@ -160,14 +184,14 @@ get_excel_sheets <- function(report = NULL){ c("SAMPLE_ID","SAMPLE_ALTERATION", "ACTIONABILITY_TIER"), dplyr::everything() ) - excel_sheets[['CNA_BIOMARKER']] <- dplyr::bind_rows( - excel_sheets[['CNA_BIOMARKER']], edata) + excel_sheets[['SOMATIC_CNA_BIOMARKER']] <- dplyr::bind_rows( + excel_sheets[['SOMATIC_CNA_BIOMARKER']], edata) } i <- i + 1 } - if(NROW(excel_sheets[['CNA_BIOMARKER']]) > 0){ - excel_sheets[['CNA_BIOMARKER']] <- - excel_sheets[['CNA_BIOMARKER']] |> + if(NROW(excel_sheets[['SOMATIC_CNA_BIOMARKER']]) > 0){ + excel_sheets[['SOMATIC_CNA_BIOMARKER']] <- + excel_sheets[['SOMATIC_CNA_BIOMARKER']] |> dplyr::distinct() |> dplyr::arrange( .data$SAMPLE_ID, @@ -189,7 +213,7 @@ get_excel_sheets <- function(report = NULL){ } if(NROW(report$content$snv_indel$callset$variant) > 0){ - excel_sheets[['SNV_INDEL']] <- + excel_sheets[['SOMATIC_SNV_INDEL']] <- report$content$snv_indel$callset$variant |> dplyr::select( dplyr::any_of(snv_indel_cols)) |> @@ -203,7 +227,7 @@ get_excel_sheets <- function(report = NULL){ -dplyr::any_of(c("BIOMARKER_MATCH","VEP_ALL_CSQ"))) ## Evidence items - biomarkers - excel_sheets[['SNV_INDEL_BIOMARKER']] <- data.frame() + excel_sheets[['SOMATIC_SNV_INDEL_BIOMARKER']] <- data.frame() 
i <- 1 while(i <= 2){ tier_data <- @@ -233,14 +257,14 @@ get_excel_sheets <- function(report = NULL){ dplyr::everything() ) - excel_sheets[['SNV_INDEL_BIOMARKER']] <- dplyr::bind_rows( - excel_sheets[['SNV_INDEL_BIOMARKER']], edata) + excel_sheets[['SOMATIC_SNV_INDEL_BIOMARKER']] <- dplyr::bind_rows( + excel_sheets[['SOMATIC_SNV_INDEL_BIOMARKER']], edata) } i <- i + 1 } - if(NROW(excel_sheets[['SNV_INDEL_BIOMARKER']]) > 0){ - excel_sheets[['SNV_INDEL_BIOMARKER']] <- - excel_sheets[['SNV_INDEL_BIOMARKER']] |> + if(NROW(excel_sheets[['SOMATIC_SNV_INDEL_BIOMARKER']]) > 0){ + excel_sheets[['SOMATIC_SNV_INDEL_BIOMARKER']] <- + excel_sheets[['SOMATIC_SNV_INDEL_BIOMARKER']] |> dplyr::distinct() |> dplyr::arrange( .data$SAMPLE_ID, diff --git a/pcgrr/R/reference_data.R b/pcgrr/R/reference_data.R index 44918f9c..f50197e8 100644 --- a/pcgrr/R/reference_data.R +++ b/pcgrr/R/reference_data.R @@ -11,8 +11,6 @@ load_reference_data <- function( pcgr_ref_data <- list() - - log4r_info(paste0( "Loading reference datasets - genome assembly: ", genome_assembly)) @@ -35,6 +33,28 @@ load_reference_data <- function( genome = genome_grch2hg[genome_assembly]) pcgr_ref_data[['assembly']][['bsg']] <- bsgenome_obj + chromsizes_fname <- file.path( + pcgr_db_assembly_dir, + paste0("chromsize.", genome_assembly, ".tsv")) + check_file_exists(chromsizes_fname) + pcgr_ref_data[['assembly']][['chrom_coordinates']] <- + as.data.frame( + readr::read_tsv( + chromsizes_fname, show_col_types = F, + na = c(".") + ) + ) |> + dplyr::mutate( + genome_end = cumsum(.data$length)) |> + dplyr::mutate( + genome_start = .data$genome_end - .data$length + ) |> + dplyr::select( + c("chrom", "genome_start", "genome_end", + "length"), dplyr::everything()) + + + pcgr_ref_data[['vcf_infotags']] <- data.frame() for(t in c('vep','other')) { infotag_fname <- file.path( @@ -258,7 +278,9 @@ load_reference_data <- function( ####--- 2. Variant annotations----#### pcgr_ref_data[['variant']] <- list() - ## ClinVar + + #####--A. 
Sites of pathogenic variants (ClinVar)--##### + # 1. sites (codons) of pathogenic/likely pathogenic variants clinvar_sites_tsv_fname <- file.path( pcgr_db_assembly_dir, "variant", "tsv", @@ -272,6 +294,32 @@ load_reference_data <- function( colnames(pcgr_ref_data[['variant']][['clinvar_sites']]) <- toupper(colnames(pcgr_ref_data[['variant']][['clinvar_sites']])) + #####--B. Oncogenic variants (ClinVar)--##### + # known oncogenic variants + clinvar_oncogenic_tsv_fname <- + file.path( + pcgr_db_assembly_dir, "variant", "tsv", + "clinvar", "clinvar_oncogenic.tsv.gz" + ) + check_file_exists(clinvar_oncogenic_tsv_fname) + pcgr_ref_data[['variant']][['clinvar_oncogenic']] <- as.data.frame( + readr::read_tsv( + clinvar_oncogenic_tsv_fname, show_col_types = F)) |> + dplyr::mutate(entrezgene = as.character(.data$entrezgene)) |> + dplyr::mutate(alteration = dplyr::if_else( + is.na(.data$hgvsp) & !is.na(.data$hgvs_c), + as.character(.data$hgvs_c), + .data$hgvsp + )) |> + dplyr::select(-c("codon","trait","var_id")) |> + dplyr::select(c("symbol","alteration","molecular_consequence", + "oncogenicity","review_status_oncogenicity"), + dplyr::everything()) + + colnames(pcgr_ref_data[['variant']][['clinvar_oncogenic']]) <- + toupper(colnames(pcgr_ref_data[['variant']][['clinvar_oncogenic']])) + + #####--C. Gene-level variant statistics (ClinVar)--##### clinvar_gene_varstats_tsv_fname <- file.path( pcgr_db_assembly_dir, "variant", "tsv", @@ -285,7 +333,9 @@ load_reference_data <- function( colnames(pcgr_ref_data[['variant']][['clinvar_gene_stats']]) <- toupper(colnames(pcgr_ref_data[['variant']][['clinvar_gene_stats']])) - ## GWAS + + + #####--D. 
GWAS variants--##### gwas_tsv_fname <- file.path( pcgr_db_assembly_dir, "variant", "tsv", "gwas", "gwas.tsv.gz" diff --git a/pcgrr/R/report.R b/pcgrr/R/report.R index e19ddc28..e93a807e 100644 --- a/pcgrr/R/report.R +++ b/pcgrr/R/report.R @@ -86,10 +86,10 @@ init_report <- function(yaml_fname = NULL, "mutational_signatures", "tmb", "msi", + "germline_classified", "rainfall", "kataegis", - "expression", - "predisposition")){ + "expression")){ #"clinicaltrials")) { report[["content"]][[a_elem]] <- list() report[["content"]][[a_elem]][["eval"]] <- FALSE @@ -112,6 +112,12 @@ init_report <- function(yaml_fname = NULL, init_rainfall_content() } + if (a_elem == "germline_classified") { + report[["content"]][[a_elem]][['callset']] <- list() + report[["content"]][[a_elem]][['panel_info']] <- list() + report[["content"]][[a_elem]]$sample_id <- "NA" + } + if (a_elem == "snv_indel" | a_elem == "cna") { report[["content"]][[a_elem]][['callset']] <- list() @@ -122,8 +128,6 @@ init_report <- function(yaml_fname = NULL, if (a_elem == "cna") { report[["content"]][[a_elem]][['vstats']] <- init_cna_vstats() - report[["content"]][[a_elem]][['cnaqc']] <- - list() } } if (a_elem == "clinicaltrials") { @@ -548,12 +552,17 @@ load_yaml <- function(yml_fname, report_mode = "CPSR") { ref_data <- list() if (dir.exists( - report_settings[['reference_data']][['path']] - )) { + report_settings[['reference_data']][['path']])) { ref_data <- load_reference_data( pcgr_db_assembly_dir = report_settings[['reference_data']][['path']], genome_assembly = report_settings[['genome_assembly']] ) + }else{ + log4r_fatal( + paste0("Reference data directory ", + report_settings[['reference_data']][['path']], + " does not exist - exiting")) + } if (identical( @@ -686,12 +695,11 @@ load_yaml <- function(yml_fname, report_mode = "CPSR") { report_settings$conf$report_color <- pcgrr::color_palette[["report_color"]][["values"]][1] - #report_settings$conf$visual_reporting[["color_palette"]] <- - # pcgrr::color_palette 
- #report_settings$conf$visual_reporting[["color_none"]] <- - # pcgrr::color_palette[["none"]][["values"]][1] - #report_settings$conf$visual_reporting[["color_value_box"]] <- - # pcgrr::color_palette[["report_color"]][["values"]][1] + if(!is.null(ref_data$assembly$chrom_coordinates)){ + report_settings$chrom_coordinates <- + ref_data$assembly$chrom_coordinates + } + if (report_mode == "PCGR" & !is.null(report_settings$conf$assay_properties)) { if (report_settings$conf$assay_properties$vcf_tumor_only == 1) { diff --git a/pcgrr/R/variant_annotation.R b/pcgrr/R/variant_annotation.R index 0ea594e2..394da65d 100644 --- a/pcgrr/R/variant_annotation.R +++ b/pcgrr/R/variant_annotation.R @@ -270,7 +270,8 @@ append_tcga_var_link <- function(var_df, dplyr::select(.data$VAR_ID, .data$TCGALINK) |> dplyr::rename(TCGA_FREQUENCY = "TCGALINK") #magrittr::set_colnames(c("VAR_ID", "TCGA_FREQUENCY")) - var_df <- dplyr::rename(var_df, TCGA_FREQUENCY_RAW = .data$TCGA_FREQUENCY) + var_df <- dplyr::rename( + var_df, TCGA_FREQUENCY_RAW = .data$TCGA_FREQUENCY) var_df <- dplyr::left_join(var_df, var_df_links, by = c("VAR_ID" = "VAR_ID")) }else{ diff --git a/pcgrr/R/variant_stats.R b/pcgrr/R/variant_stats.R new file mode 100644 index 00000000..b01a841b --- /dev/null +++ b/pcgrr/R/variant_stats.R @@ -0,0 +1,115 @@ +#' Function that computes various variant statistics from a data frame +#' with variant records +#' +#' @param var_df data frame with variants +#' @param pct_other_limit numeric value specifying the percentage limit +#' for the 'Other' category +#' +#' @export +#' +get_variant_statistics <- function(var_df = NULL, pct_other_limit = 4){ + + assertthat::assert_that( + !is.null(var_df), + is.data.frame(var_df), + msg = "Argument 'var_df' must be a valid data.frame" + ) + + assertable::assert_colnames( + var_df, c("VARIANT_CLASS", "CONSEQUENCE","CODING_STATUS"), + only_colnames = F, quiet = T + ) + + consequence_stats <- + var_df |> + dplyr::mutate(CONSEQUENCE = 
stringr::str_replace_all( + .data$CONSEQUENCE, "(, [0-9A-Za-z_]{1,}){1,}$","" + )) |> + dplyr::group_by(.data$CONSEQUENCE) |> + dplyr::summarise( + N = dplyr::n(), + .groups = "drop" + ) |> + dplyr::arrange(dplyr::desc(.data$N)) + + if(NROW(consequence_stats) > 5) { + consequence_stats_top <- utils::head(consequence_stats, 4) + consequence_stats_other <- consequence_stats |> + dplyr::slice_tail(n = -4) |> + dplyr::summarise( + N = sum(.data$N), + CONSEQUENCE = "other_consequences" + ) + consequence_stats <- dplyr::bind_rows( + consequence_stats_top, consequence_stats_other) |> + dplyr::arrange(dplyr::desc(.data$N)) + } + + consequence_stats <- consequence_stats |> + dplyr::mutate(Pct = .data$N / sum(.data$N) * 100) + + consequence_stats_coding <- + var_df |> + dplyr::filter(.data$CODING_STATUS == "coding") + + if(NROW(consequence_stats_coding) > 0) { + consequence_stats_coding <- + consequence_stats_coding |> + dplyr::mutate(CONSEQUENCE = stringr::str_replace_all( + .data$CONSEQUENCE, "(, [0-9A-Za-z_]{1,}){1,}$","" + )) |> + dplyr::group_by(.data$CONSEQUENCE) |> + dplyr::summarise( + N = dplyr::n(), + .groups = "drop" + ) |> + dplyr::arrange(dplyr::desc(.data$N)) + + if(NROW(consequence_stats_coding) > 5) { + consequence_stats_coding_top <- utils::head(consequence_stats_coding, 4) + consequence_stats_coding_other <- consequence_stats_coding |> + dplyr::slice_tail(n = -4) |> + dplyr::summarise( + N = sum(.data$N), + CONSEQUENCE = "other_consequences" + ) + consequence_stats_coding <- dplyr::bind_rows( + consequence_stats_coding_top, + consequence_stats_coding_other) |> + dplyr::arrange(dplyr::desc(.data$N)) + } + + consequence_stats_coding <- + consequence_stats_coding |> + dplyr::mutate(Pct = .data$N / sum(.data$N) * 100) + } + + + variant_class_stats <- + var_df |> + dplyr::group_by(.data$VARIANT_CLASS) |> + dplyr::summarise( + N = dplyr::n(), + .groups = "drop" + ) |> + dplyr::mutate(Pct = .data$N / sum(.data$N) * 100) |> + dplyr::arrange(dplyr::desc(.data$Pct)) 
+ + coding_stats <- + var_df |> + dplyr::group_by(.data$CODING_STATUS) |> + dplyr::summarise( + N = dplyr::n(), + .groups = "drop" + ) |> + dplyr::mutate(Pct = .data$N / sum(.data$N) * 100) |> + dplyr::arrange(dplyr::desc(.data$Pct)) + + result <- list() + result[['consequence']] <- consequence_stats + result[['consequence_coding']] <- consequence_stats_coding + result[['variant_class']] <- variant_class_stats + result[['coding']] <- coding_stats + + return(result) +} diff --git a/pcgrr/data-raw/data-raw.R b/pcgrr/data-raw/data-raw.R index 69fa2e9f..df0c9152 100755 --- a/pcgrr/data-raw/data-raw.R +++ b/pcgrr/data-raw/data-raw.R @@ -5,8 +5,8 @@ for (c in c("pathogenicity", "clinical_evidence", "cancer_assoc", "gene_expression", - "exp_increase", - "exp_decrease", + "expression_outlier_high", + "expression_outlier_low", "cna_variant_class", "tier", "report_color", @@ -116,16 +116,23 @@ for (c in c("pathogenicity", if (c == "success") { color_palette[[c]] <- "#00a65a" } - if (c == 'exp_increase'){ - color_palette[[c]][["breaks"]] <- c(92, 94, 96, 98) + if (c == 'expression_outlier_high'){ + color_palette[[c]][["breaks"]] <- c(96,97,98,99) color_palette[[c]][["values"]] <- - c("#caead2","#a4d9b3","#7dc995","#52b777","#00a65a") + c("#a1d99b", + "#74c476", + "#41ab5d", + "#238b45", + "#005a32") } - - if (c == 'exp_decrease'){ - color_palette[[c]][["breaks"]] <- c(2, 4, 6, 8) + if (c == 'expression_outlier_low'){ + color_palette[[c]][["breaks"]] <- c(1,2,3,4) color_palette[[c]][["values"]] <- - c("#CD534C","#dd766b","#ea978c","#f6b7ae","#ffd8d2") + c("#99000d", + "#cb181d", + "#ef3b2c", + "#fb6a4a", + "#fc9272") } } @@ -142,7 +149,14 @@ usethis::use_data( #-----input column names/types-----# data_coltype_defs <- list() -data_coltype_defs[['cna_somatic_raw']] <- readr::cols_only( +data_coltype_defs[['cna_somatic_segment_raw']] <- readr::cols_only( + CHROM = readr::col_character(), + SEGMENT_START = readr::col_double(), + SEGMENT_END = readr::col_double(), + SEGMENT_NAME = 
readr::col_character() +) + +data_coltype_defs[['cna_somatic_gene_raw']] <- readr::cols_only( CHROM = readr::col_character(), SEGMENT_START = readr::col_double(), SEGMENT_END = readr::col_double(), @@ -168,7 +182,7 @@ data_coltype_defs[['cna_somatic_raw']] <- readr::cols_only( TSG_RANK = readr::col_integer(), ONCOGENE = readr::col_logical(), ONCOGENE_SUPPORT = readr::col_character(), - ONCOGENE_RANK = readr::col_logical(), + ONCOGENE_RANK = readr::col_integer(), SEGMENT_LENGTH_MB = readr::col_number(), BIOMARKER_MATCH = readr::col_character(), SAMPLE_ID = readr::col_character()) @@ -196,9 +210,11 @@ data_coltype_defs[['snv_indel_somatic_raw']] <- readr::cols_only( NULL_VARIANT = readr::col_logical(), CODING_STATUS = readr::col_character(), EXONIC_STATUS = readr::col_character(), + ALTERATION = readr::col_character(), PROTEIN_CHANGE = readr::col_character(), HGVSp_short = readr::col_character(), HGVSc = readr::col_character(), + HGVSc_RefSeq = readr::col_character(), HGVSp = readr::col_character(), CDS_CHANGE = readr::col_character(), CDS_RELATIVE_POSITION = readr::col_character(), @@ -227,8 +243,8 @@ data_coltype_defs[['snv_indel_somatic_raw']] <- readr::cols_only( CCDS = readr::col_character(), CANONICAL = readr::col_character(), BIOTYPE = readr::col_character(), - TRANSCRIPT_MANE_SELECT = readr::col_character(), - TRANSCRIPT_MANE_PLUS_CLINICAL = readr::col_character(), + MANE_SELECT = readr::col_character(), + MANE_PLUS_CLINICAL = readr::col_character(), TSG = readr::col_logical(), TSG_RANK = readr::col_integer(), TSG_SUPPORT = readr::col_character(), @@ -298,9 +314,11 @@ data_coltype_defs[['snv_indel_germline_raw']] <- readr::cols_only( CODING_STATUS = readr::col_character(), EXONIC_STATUS = readr::col_character(), PROTEIN_CHANGE = readr::col_character(), + ALTERATION = readr::col_character(), HGVSp_short = readr::col_character(), HGVSc = readr::col_character(), HGVSp = readr::col_character(), + HGVSc_RefSeq = readr::col_character(), CDS_CHANGE = 
readr::col_character(), CDS_RELATIVE_POSITION = readr::col_character(), EXON = readr::col_character(), @@ -330,8 +348,8 @@ data_coltype_defs[['snv_indel_germline_raw']] <- readr::cols_only( CCDS = readr::col_character(), CANONICAL = readr::col_character(), BIOTYPE = readr::col_character(), - TRANSCRIPT_MANE_SELECT = readr::col_character(), - TRANSCRIPT_MANE_PLUS_CLINICAL = readr::col_character(), + MANE_SELECT = readr::col_character(), + MANE_PLUS_CLINICAL = readr::col_character(), TSG = readr::col_logical(), TSG_RANK = readr::col_integer(), TSG_SUPPORT = readr::col_character(), @@ -379,6 +397,10 @@ data_coltype_defs[['snv_indel_germline_raw']] <- readr::cols_only( DBNSFP_BAYESDEL_ADDAF = readr::col_character(), DBNSFP_SPLICE_SITE_ADA = readr::col_character(), DBNSFP_SPLICE_SITE_RF = readr::col_character(), + DBNSFP_PHACTBOOST = readr::col_character(), + DBNSFP_ALPHA_MISSENSE = readr::col_character(), + DBNSFP_MUTFORMER = readr::col_character(), + DBNSFP_ESM1B = readr::col_character(), gnomADe_AF = readr::col_number(), gnomADe_AMR_AF = readr::col_number(), gnomADe_AFR_AF = readr::col_number(), @@ -431,6 +453,48 @@ data_coltype_defs[['snv_indel_germline_raw']] <- readr::cols_only( GENOME_VERSION = readr::col_character() ) +data_coltype_defs[['snv_indel_germline_cpsr']] <- readr::cols_only( + GENOMIC_CHANGE = readr::col_character(), + GENOTYPE = readr::col_character(), + ALTERATION = readr::col_character(), + DP_CONTROL = readr::col_integer(), + VAR_ID = readr::col_character(), + VARIANT_CLASS = readr::col_character(), + CONSEQUENCE = readr::col_character(), + LOSS_OF_FUNCTION = readr::col_logical(), + CODING_STATUS = readr::col_character(), + PROTEIN_CHANGE = readr::col_character(), + HGVSc = readr::col_character(), + HGVSp = readr::col_character(), + HGVSc_RefSeq = readr::col_character(), + CDS_CHANGE = readr::col_character(), + PFAM_DOMAIN_NAME = readr::col_character(), + ONCOGENE = readr::col_logical(), + TUMOR_SUPPRESSOR = readr::col_logical(), + SYMBOL = 
readr::col_character(), + ENTREZGENE = readr::col_character(), + GENENAME = readr::col_character(), + ENSEMBL_GENE_ID = readr::col_character(), + ENSEMBL_TRANSCRIPT_ID = readr::col_character(), + REFSEQ_TRANSCRIPT_ID = readr::col_character(), + PFAM_DOMAIN = readr::col_character(), + PFAM_DOMAIN_NAME = readr::col_character(), + CLINVAR_MSID = readr::col_character(), + CLINVAR_PHENOTYPE = readr::col_character(), + CLINVAR_CLASSIFICATION = readr::col_character(), + CLINVAR_CONFLICTED = readr::col_logical(), + CLINVAR_REVIEW_STATUS_STARS = readr::col_integer(), + CLINVAR_VARIANT_ORIGIN = readr::col_character(), + DBSNP_RSID = readr::col_character(), + CPSR_CLASSIFICATION_SOURCE = readr::col_character(), + CPSR_CLASSIFICATION = readr::col_character(), + CPSR_PATHOGENICITY_SCORE = readr::col_double(), + CPSR_CLASSIFICATION_CODE = readr::col_character(), + FINAL_CLASSIFICATION = readr::col_character(), + SAMPLE_ID = readr::col_character(), + GENOME_VERSION = readr::col_character() +) + usethis::use_data(data_coltype_defs, overwrite = T) @@ -471,6 +535,7 @@ tsv_cols[['snv_indel']] <- 'ENTREZGENE', 'ENSEMBL_GENE_ID', 'GENENAME', + 'ALTERATION', 'PROTEIN_CHANGE', 'CONSEQUENCE', 'PFAM_DOMAIN_NAME', @@ -492,6 +557,7 @@ tsv_cols[['snv_indel']] <- 'ONCOGENICITY_CODE', 'ONCOGENICITY_SCORE', 'HGVSc', + 'HGVSc_RefSeq', 'HGVSp', 'CANONICAL', 'CCDS', @@ -500,8 +566,8 @@ tsv_cols[['snv_indel']] <- 'ENSEMBL_PROTEIN_ID', 'REFSEQ_TRANSCRIPT_ID', 'REFSEQ_PROTEIN_ID', - 'TRANSCRIPT_MANE_SELECT', - 'TRANSCRIPT_MANE_PLUS_CLINICAL', + 'MANE_SELECT', + 'MANE_PLUS_CLINICAL', 'CGC_TIER', 'CGC_GERMLINE', 'CGC_SOMATIC', @@ -528,6 +594,80 @@ tsv_cols[['snv_indel']] <- 'CALL_CONFIDENCE' ) +tsv_cols[['snv_indel_unfiltered']] <- + c('SAMPLE_ID', + 'GENOMIC_CHANGE', + 'GENOME_VERSION', + 'VARIANT_CLASS', + 'SOMATIC_CLASSIFICATION', + 'SYMBOL', + 'ENTREZGENE', + 'ENSEMBL_GENE_ID', + 'GENENAME', + 'ALTERATION', + 'PROTEIN_CHANGE', + 'CONSEQUENCE', + 'PFAM_DOMAIN_NAME', + 'LOSS_OF_FUNCTION', + 
'LOF_FILTER', + 'CDS_CHANGE', + 'CODING_STATUS', + 'EXONIC_STATUS', + 'DP_TUMOR', + 'VAF_TUMOR', + 'MUTATION_HOTSPOT', + 'MUTATION_HOTSPOT_CANCERTYPE', + 'ACTIONABILITY_TIER', + 'ACTIONABILITY', + 'ACTIONABILITY_FRAMEWORK', + 'ONCOGENICITY', + 'ONCOGENICITY_CODE', + 'ONCOGENICITY_SCORE', + 'HGVSc', + 'HGVSc_RefSeq', + 'HGVSp', + 'CANONICAL', + 'CCDS', + 'UNIPROT_ACC', + 'ENSEMBL_TRANSCRIPT_ID', + 'ENSEMBL_PROTEIN_ID', + 'REFSEQ_TRANSCRIPT_ID', + 'REFSEQ_PROTEIN_ID', + 'MANE_SELECT', + 'MANE_PLUS_CLINICAL', + 'CGC_TIER', + 'CGC_GERMLINE', + 'CGC_SOMATIC', + 'ONCOGENE', + 'ONCOGENE_SUPPORT', + 'TUMOR_SUPPRESSOR', + 'TUMOR_SUPPRESSOR_SUPPORT', + 'TARGETED_INHIBITORS2', + 'EFFECT_PREDICTIONS', + 'REGULATORY_ANNOTATION', + 'VEP_ALL_CSQ', + 'gnomADe_AF', + 'gnomADe_AFR_AF', + 'gnomADe_AMR_AF', + 'gnomADe_ASJ_AF', + 'gnomADe_EAS_AF', + 'gnomADe_FIN_AF', + 'gnomADe_NFE_AF', + 'gnomADe_OTH_AF', + 'gnomADe_SAS_AF', + 'DBSNP_RSID', + 'COSMIC_ID', + 'TCGA_FREQUENCY', + 'TCGA_PANCANCER_COUNT', + 'CLINVAR_MSID', + 'CLINVAR_CLASSIFICATION', + 'CLINVAR_VARIANT_ORIGIN', + 'CLINVAR_NUM_SUBMITTERS', + 'CLINVAR_REVIEW_STATUS_STARS', + 'CLINVAR_CONFLICTED', + 'BIOMARKER_MATCH', + 'CALL_CONFIDENCE' + ) usethis::use_data(tsv_cols, overwrite = T) dt_display <- list() @@ -601,14 +741,15 @@ dt_display[['snv_indel_germline_filtered']] <- 'EXCLUSION_CRITERIA', 'SYMBOL', 'CONSEQUENCE', - 'gnomADe_AF', + 'HGVSc', + 'HGVSc_RefSeq', 'DBSNP_RSID', 'COSMIC_ID', 'TCGA_FREQUENCY', + 'CLINVAR_CLASSIFICATION', 'VAF_TUMOR', 'DP_TUMOR', - 'VAF_CONTROL', - 'DP_CONTROL', + 'gnomADe_AF', 'gnomADe_AFR_AF', 'gnomADe_AMR_AF', 'gnomADe_ASJ_AF', @@ -633,7 +774,7 @@ dt_display[['snv_indel_gene_actionable']] <- 'VARIANT_CLASS', 'SYMBOL', 'HGVSc', - 'HGVSp', + 'HGVSc_RefSeq', 'PREDICTED_EFFECT', 'LOSS_OF_FUNCTION', 'LOF_FILTER', @@ -647,6 +788,7 @@ dt_display[['snv_indel_gene_actionable']] <- 'ENSEMBL_GENE_ID', 'ENSEMBL_TRANSCRIPT_ID', 'REFSEQ_TRANSCRIPT_ID', + 'MANE_SELECT', 'CANCERGENE_EVIDENCE', 
'TARGETED_INHIBITORS', 'TARGETED_INHIBITORS_ALL', @@ -680,16 +822,17 @@ dt_display[['snv_indel_eitem']] <- dt_display[['snv_indel_tier3']] <- c('SYMBOL', - 'PROTEIN_CHANGE', + 'ALTERATION', 'GENENAME', 'CONSEQUENCE', 'ONCOGENICITY', 'PROTEIN_DOMAIN', 'MUTATION_HOTSPOT', 'COSMIC_ID', + 'PROTEIN_CHANGE', 'CDS_CHANGE', 'HGVSc', - 'HGVSp', + 'HGVSc_RefSeq', 'MUTATION_HOTSPOT_CANCERTYPE', 'LOSS_OF_FUNCTION', 'LOF_FILTER', @@ -700,8 +843,10 @@ dt_display[['snv_indel_tier3']] <- 'VEP_ALL_CSQ', 'DBSNP_RSID', 'CLINVAR', + 'CLINVAR_CLASSIFICATION', 'ENSEMBL_TRANSCRIPT_ID', 'REFSEQ_TRANSCRIPT_ID', + 'MANE_SELECT', 'TARGETED_INHIBITORS', 'TARGETED_INHIBITORS_ALL', 'ONCOGENE', @@ -720,16 +865,17 @@ dt_display[['snv_indel_tier3']] <- dt_display[['tier4']] <- c('SYMBOL', - 'PROTEIN_CHANGE', + 'ALTERATION', 'GENENAME', 'CONSEQUENCE', 'ONCOGENICITY', 'PROTEIN_DOMAIN', 'COSMIC_ID', + 'PROTEIN_CHANGE', 'CDS_CHANGE', 'TCGA_FREQUENCY', 'HGVSc', - 'HGVSp', + 'HGVSc_RefSeq', 'PREDICTED_EFFECT', 'LOSS_OF_FUNCTION', 'LOF_FILTER', @@ -742,6 +888,7 @@ dt_display[['tier4']] <- 'ENSEMBL_TRANSCRIPT_ID', 'ENSEMBL_PROTEIN_ID', 'REFSEQ_TRANSCRIPT_ID', + 'MANE_SELECT', 'CANCERGENE_EVIDENCE', 'CANCER_GENE_CENSUS', 'GLOBAL_ASSOC_RANK', @@ -757,34 +904,6 @@ dt_display[['tier4']] <- 'GENOMIC_CHANGE', 'GENOME_VERSION') -# dt_display[['tier5']] <- -# c('SYMBOL', -# 'GENENAME', -# 'CONSEQUENCE', -# 'COSMIC_ID', -# 'TCGA_FREQUENCY', -# 'DBSNP_RSID', -# 'CLINVAR', -# 'ENSEMBL_TRANSCRIPT_ID', -# 'REFSEQ_TRANSCRIPT_ID', -# 'CANCERGENE_EVIDENCE', -# 'CANCER_GENE_CENSUS', -# 'GLOBAL_ASSOC_RANK', -# 'TISSUE_ASSOC_RANK', -# 'REGULATORY_ANNOTATION', -# 'VEP_ALL_CSQ', -# 'CALL_CONFIDENCE', -# 'DP_TUMOR', -# 'VAF_TUMOR', -# 'DP_CONTROL', -# 'VAF_CONTROL', -# 'GENOMIC_CHANGE', -# 'GENOME_VERSION') -# - - - - usethis::use_data(dt_display, overwrite = T) #---- variant_db_url ----# @@ -835,6 +954,14 @@ variant_db_url <- rename = TRUE, link_display_var = "REFSEQ_TRANSCRIPT_ID_RAW", stringsAsFactors = F), + data.frame( + 
name = "MANE_SELECT", + group_by_var = "VAR_ID", + url_prefix = "https://www.ncbi.nlm.nih.gov/nuccore/", + link_key_var = "MANE_SELECT_RAW", + rename = TRUE, + link_display_var = "MANE_SELECT_RAW", + stringsAsFactors = F), data.frame( name = "ENSEMBL_TRANSCRIPT_ID", group_by_var = "VAR_ID", @@ -1004,10 +1131,9 @@ germline_filter_levels <- usethis::use_data(germline_filter_levels, overwrite = T) rm(cancer_phenotypes_regex, - data_coltype_defs, effect_prediction_algos, tcga_cohorts, - immune_celltypes, + immune_celltypes2, artefact_signatures, context_order, cosmic_signatures, @@ -1016,9 +1142,10 @@ rm(cancer_phenotypes_regex, cosmic_sbs_signatures, germline_filter_levels, sig, - biomarker_evidence, variant_db_url, color_palette, + data_coltype_defs, + biomarker_evidence, dt_display, tsv_cols, c) diff --git a/pcgrr/data-raw/effect_prediction_algorithms.tsv b/pcgrr/data-raw/effect_prediction_algorithms.tsv index 79c65a5a..0c569f58 100644 --- a/pcgrr/data-raw/effect_prediction_algorithms.tsv +++ b/pcgrr/data-raw/effect_prediction_algorithms.tsv @@ -19,4 +19,8 @@ gerp_rs http://mendel.stanford.edu/SidowLab/downloads/gerp/ GERP++ RS score list_s2 https://doi.org/10.1093/nar/gkaa288 LIST-S2 bayesdel_addaf https://doi.org/10.1002/humu.23158 BayesDel aloft http://aloft.gersteinlab.org/ ALoFT +esm1b https://huggingface.co/spaces/ntranoslab/esm_variants/tree/main ESM1b +alphamissense https://console.cloud.google.com/storage/browser/dm_alphamissense AlphaMissense +mutformer https://github.com/WGLab/mutformer MutFormer +phactboost https://github.com/CompGenomeLab/PHACTboost PHACTboost metarnn http://www.liulab.science/metarnn.html MetaRNN diff --git a/pcgrr/data/color_palette.rda b/pcgrr/data/color_palette.rda index 7d4a5377..56985d4a 100644 Binary files a/pcgrr/data/color_palette.rda and b/pcgrr/data/color_palette.rda differ diff --git a/pcgrr/data/data_coltype_defs.rda b/pcgrr/data/data_coltype_defs.rda index abcf42e2..6403d6e5 100644 Binary files 
a/pcgrr/data/data_coltype_defs.rda and b/pcgrr/data/data_coltype_defs.rda differ diff --git a/pcgrr/data/dt_display.rda b/pcgrr/data/dt_display.rda index 345f74b9..b74fb1c9 100644 Binary files a/pcgrr/data/dt_display.rda and b/pcgrr/data/dt_display.rda differ diff --git a/pcgrr/data/effect_prediction_algos.rda b/pcgrr/data/effect_prediction_algos.rda index 02f02103..3da6573f 100644 Binary files a/pcgrr/data/effect_prediction_algos.rda and b/pcgrr/data/effect_prediction_algos.rda differ diff --git a/pcgrr/data/tcga_cohorts.rda b/pcgrr/data/tcga_cohorts.rda index 9cadc66e..5406daaf 100644 Binary files a/pcgrr/data/tcga_cohorts.rda and b/pcgrr/data/tcga_cohorts.rda differ diff --git a/pcgrr/data/tsv_cols.rda b/pcgrr/data/tsv_cols.rda index 89cfba52..b6dd0e21 100644 Binary files a/pcgrr/data/tsv_cols.rda and b/pcgrr/data/tsv_cols.rda differ diff --git a/pcgrr/data/variant_db_url.rda b/pcgrr/data/variant_db_url.rda index fc707a56..19b9beaf 100644 Binary files a/pcgrr/data/variant_db_url.rda and b/pcgrr/data/variant_db_url.rda differ diff --git a/pcgrr/inst/templates/pcgr_quarto_report.qmd b/pcgrr/inst/templates/pcgr_quarto_report.qmd index 123e3759..05be8c5b 100644 --- a/pcgrr/inst/templates/pcgr_quarto_report.qmd +++ b/pcgrr/inst/templates/pcgr_quarto_report.qmd @@ -85,6 +85,14 @@ if(as.logical(pcg_report$settings$conf$assay_properties$vcf_tumor_only) == TRUE) #| eval: !expr as.logical(pcg_report$content$mutational_signatures$eval) == TRUE ``` +
+ +```{r pcgr_kataegis} +#| child: 'pcgr_quarto_report/kataegis.qmd' +#| eval: !expr as.logical(pcg_report$content$kataegis$eval) == TRUE +``` + +
```{r pcgr_msi} @@ -92,12 +100,23 @@ if(as.logical(pcg_report$settings$conf$assay_properties$vcf_tumor_only) == TRUE) #| eval: !expr as.logical(pcg_report$content$msi$eval) == TRUE ``` +
+ ```{r conditional_tmb} #| child: 'pcgr_quarto_report/mutational_burden.qmd' #| eval: !expr as.logical(pcg_report$content$tmb$eval) == TRUE ``` -```{r conditional_expresson} +
+ +```{r pcgr_germline} +#| child: 'pcgr_quarto_report/germline.qmd' +#| eval: !expr as.logical(pcg_report$content$germline_classified$eval) == TRUE +``` + + + +```{r conditional_expression} #| child: 'pcgr_quarto_report/expression.qmd' #| eval: !expr as.logical(pcg_report$content$expression$eval) == TRUE ``` diff --git a/pcgrr/inst/templates/pcgr_quarto_report/cna.qmd b/pcgrr/inst/templates/pcgr_quarto_report/cna.qmd index 668a277c..56f37c27 100644 --- a/pcgrr/inst/templates/pcgr_quarto_report/cna.qmd +++ b/pcgrr/inst/templates/pcgr_quarto_report/cna.qmd @@ -1,5 +1,32 @@ ## Somatic CNAs +
+ +```{r plot_cna} +#| eval: !expr NROW(pcg_report$content$cna$callset$variant) > 0 +#| echo: false +#| output: asis +#| fig.width: 15 +#| fig.height: 10 + +cna_plot <- pcgrr::plot_cna_segments( + chrom_coordinates = pcg_report$settings$chrom_coordinates, + cna_segment = pcg_report$content$cna$callset$segment, + cna_gene = pcg_report$content$cna$callset$variant +) + +p_margin <- + list(l = 50, r = 0, b = 0, t = 0, pad = 0) + +plotly::ggplotly(cna_plot, tooltip = c("y","z")) |> + plotly::layout( + legend = list(orientation = "h", x = 0.34, y = -0.28), + margin = p_margin) + +``` + +
+ ```{r check_expression} #| eval: true @@ -324,7 +351,7 @@ htmltools::br() Here we show other genes subject to copy number alteration that may potentially have oncogenic effects. Specifically, we list - * [Proto-oncogenes](#documentation) subject to copy-number gain (total copy number >= __`r pcg_report$settings$conf$somatic_cna$n_copy_gain`__) + * [Proto-oncogenes](#documentation) subject to copy-number gain (total copy number >= __`r pcg_report$settings$conf$somatic_cna$n_copy_gain`__ (gain threshold, user-configurable)) * [Tumor suppressor](#documentation) genes subject to homozygous deletions (total copy number = __0__) Gene symbols are color-coded according to their strength of association to cancer (here, specifically __`r pcg_report$settings$conf$sample_properties$site`__ cancers), from   weak associations     to   strong associations   [@Ochoa2021-jc;@Nakken2023-ol]. diff --git a/pcgrr/inst/templates/pcgr_quarto_report/documentation.qmd b/pcgrr/inst/templates/pcgr_quarto_report/documentation.qmd index ebd8cea9..5ce9603e 100644 --- a/pcgrr/inst/templates/pcgr_quarto_report/documentation.qmd +++ b/pcgrr/inst/templates/pcgr_quarto_report/documentation.qmd @@ -1,9 +1,8 @@ ## Documentation -This report is intended for interpretation of acquired genomic aberrations detected in human tumor samples, as detected from high-throughput sequencing -technologies. +This report is intended for interpretation of multi-omics data from a single human tumor sample. -

+
::: {.panel-tabset} @@ -82,31 +81,29 @@ for(i in 1:NROW(ref_datasets)){ #### Oncogenicity classification -Somatic aberrations (SNV/InDels) are evaluated for oncogenicity through an implementation of standard operating procedures proposed by VICC/CGC/ClinGen [@Horak2022-uh]. Here, various properties of the variants and genes affected are assigned specific scores according to several criteria, both negative and positive, pending on whether the properties support an oncogenic or benign variant type. These scores are in turn aggregated towards an overall oncogenicity score. +Somatic aberrations (SNV/InDels) are evaluated for oncogenicity through an implementation of standard operating procedures proposed by ClinGen/CGC/VICC [@Horak2022-uh]. Here, various properties of the variants and genes affected are assigned specific scores according to several criteria, both negative and positive, depending on whether the properties support an oncogenic or benign variant type. These scores are in turn aggregated towards an overall oncogenicity score. -Note that all properties/criteria provided in the SOP's are _not_ readily implemented in PCGR, specifically the ones requiring manual curation or expert review (i.e. experimental oncogenic variant evidence, requring support from _in vitro_ or _in vivo_ functional studies (criteria _OM1/OS1_)). +Note that _not_ all properties/criteria provided in the SOPs are readily implemented in PCGR, specifically the ones requiring manual curation or expert review (i.e. experimental oncogenic variant evidence, requiring support from _in vitro_ or _in vivo_ functional studies (criteria _OM1/OS1_)). This implies that some variants interrogated by PCGR may not be classified as oncogenic, even though they could be classified as such by manual review. Considering the nature of our current implementation, we have thus also adopted slightly different score thresholds for variant classifications to those proposed originally by [@Horak2022-uh]. 
We are working to further improve the oncogenicity classification in PCGR, and welcome feedback on this matter. Note also that for somatic copy number aberrations, we showcase potential oncogenic events as **proto-oncogenes subject to amplifications** (where level of amplification is configurable by the user), as well as **tumor suppressor genes subject to homozygous deletions**. #### Actionability classificaton -Clinical actionability assessment of SNVs/InDels and gene copy number aberrations found in the tumor sample follows recommendation guidelines by AMP/ASCO/CAP [@Li2017-ew]. Specifically, different levels of actionability are implemented in the following manner: +Clinical actionability assessment of SNVs/InDels and gene copy number aberrations found in the tumor sample implements recommendation guidelines by AMP/ASCO/CAP [@Li2017-ew]. Specifically, different levels of actionability are implemented in the following manner: -- **TIER I: Variants of strong clinical significance** - constitutes aberrations linked to predictive, prognostic, or diagnostic biomarkers in the [CIViC database](https://civicdb.org) and the [Cancer Biomarkers Database](https://www.cancergenomeinterpreter.org/biomarkers) that are +- **Tier I: Variants of strong clinical significance** - constitutes aberrations linked to predictive, prognostic, or diagnostic biomarkers in the [CIViC database](https://civicdb.org) and the [Cancer Biomarkers Database](https://www.cancergenomeinterpreter.org/biomarkers) that are - Found within the same tumor type/class as specified by the user, **AND** - - Of strong clinical evidence (i.e. part of guidelines, validated or discovered in late clinical trials ([CIViC evidence levels A/B](https://civic.readthedocs.io/en/latest/model/evidence/level.html))) + - Of strong clinical evidence (i.e. 
approved therapies, part of guidelines, validated or discovered in late clinical trials ([CIViC evidence levels A/B](https://civic.readthedocs.io/en/latest/model/evidence/level.html))) - - **TIER II: Variants of potential clinical significance** - constitutes other aberrations linked to predictive, prognostic, or diagnostic biomarkers in the [CIViC database](https://civicdb.org) and the [Cancer Biomarkers Database](https://www.cancergenomeinterpreter.org/biomarkers) that are either + - **Tier II: Variants of potential clinical significance** - constitutes other aberrations linked to predictive, prognostic, or diagnostic biomarkers in the [CIViC database](https://civicdb.org) and the [Cancer Biomarkers Database](https://www.cancergenomeinterpreter.org/biomarkers) that are either - Of strong clinical evidence in other tumor types/classes than the one specified by the user, **OR** - Of weak clinical evidence (early trials, case reports etc. ([CIViC evidence levels C/D/E](https://civic.readthedocs.io/en/latest/model/evidence/level.html)))) in the same tumor type/class as specified by the user - - **TIER III: Variants of uncertain clinical significance (SNVs/InDels only)** - - - Other coding variants found in oncogenes or tumor suppressor genes, yet _not_ linked to any known predictive, prognostic, or diagnostic biomarkers in the [CIViC database](https://civicdb.org) and the [Cancer Biomarkers Database](https://www.cancergenomeinterpreter.org/biomarkers) - -In PCGR, we skip the classification of variants into the AMP/ASCO/CAP-specified *Tier IV* (benign/likely benign variants), but rather take a more cautious approach. Specifically, for SNVs/indels that do not fall into tier I, II, or III, we classify them into *Tier V: Other coding variants*, which includes other protein-coding variants, as well as *Tier VI: Other non-coding variants*, which includes synonymous variants, intronic variants, and other variants in non-coding regions. 
+ - **Tier III: Variants of uncertain clinical significance (SNVs/InDels only)** - + - Other coding variants, not observed at significant allele frequencies (gnomAD MAF < 0.001), found in oncogenes or tumor suppressor genes, yet _not_ linked to any known predictive, prognostic, or diagnostic biomarkers in the [CIViC database](https://civicdb.org) and the [Cancer Biomarkers Database](https://www.cancergenomeinterpreter.org/biomarkers) -
+In PCGR, we skip the classification of variants into the AMP/ASCO/CAP-specified *Tier IV* (benign/likely benign variants), but rather take a more cautious approach. Specifically, for SNVs/indels that do not fall into tier I, II, or III, we classify them into *Tier V: Other coding variants*, which includes protein-coding variants in non-cancer related genes, as well as *Tier VI: Other non-coding variants*, which includes synonymous variants, intronic variants, and other variants in non-coding regions. #### Mutational signatures @@ -122,21 +119,17 @@ Tumor mutational load or mutational burden is a measure of the number of mutatio 2) __TMB_coding_non_silent__: counting all somatic base substitutions and indels in the protein-coding regions of the sequencing assay, including those at canonical splice sites, but excluding synonymous alterations. 3) __TMB_missense_only__: missense (non-synonymous) variants variants only, i.e. as employed by [Fernandez et al., 2019](https://www.ncbi.nlm.nih.gov/pubmed/31475242) -Numbers obtained with 1), 2) or 3) are next divided by the coding target size of the sequencing assay. - -
+Numbers obtained with 1), 2) or 3) are next divided by the coding target size of the sequencing assay. We encourage users to provide accurate estimates of the target size of the sequencing assay. If the users utilize VAF/DP filtering for variants included in the TMB calculation, the same cutoffs/thresholds (DP) should ideally be applied to the target size estimation. #### MSI classification -Microsatellite instability (MSI) is the result of impaired DNA mismatch repair and constitutes a cellular phenotype of clinical significance in many cancer types, most prominently colorectal cancers, stomach cancers, endometrial cancers, and ovarian cancers ([Cortes-Ciriano et al., 2017](https://www.ncbi.nlm.nih.gov/pubmed/28585546)). We have built a statistical MSI classifier from somatic mutation profiles that separates _MSI.H_ (MSI-high) from _MSS_ (MS stable) tumors. The MSI classifier was trained using 999 exome-sequenced TCGA tumor samples with known MSI status (i.e. assayed from mononucleotide markers), and obtained a [positive predictive value](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values#Positive_predictive_value) of 100% and a [negative predictive value](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values#Negative_predictive_value) of 99.4% on an independent test set of 427 samples. Details of the MSI classification approach can be found here. - -Note that the MSI classifier is applied only for WGS/WES tumor-control sequencing assays. +Microsatellite instability (MSI) is the result of impaired DNA mismatch repair and constitutes a cellular phenotype of clinical significance in many cancer types, most prominently colorectal cancers, stomach cancers, endometrial cancers, and ovarian cancers ([Cortes-Ciriano et al., 2017](https://www.ncbi.nlm.nih.gov/pubmed/28585546)). We have built a statistical MSI classifier from somatic mutation profiles that separates _MSI.H_ (MSI-high) from _MSS_ (MS stable) tumors. 
The MSI classifier was trained using __N = 1,065__ exome-sequenced TCGA tumor samples with known MSI status (i.e. assayed from mononucleotide markers), and obtained a [positive predictive value](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values#Positive_predictive_value) of 97.9% and a [negative predictive value](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values#Negative_predictive_value) of 99.4% on an independent test set of __N = 435__ samples. Details of the MSI classification approach can be found here. -
+Note that, given the nature of the training dataset, the MSI classifier can only be applied for samples originating from WGS/WES tumor-control sequencing assays (i.e. _not_ for tumor-only settings). -#### Expression +#### Gene expression -If data is provided, PCGR reports may include a section on gene expression, where we provide an analysis of expression outliers with respect to reference cohorts (TCGA). The expression levels are based on bulk RNA-seq data from the tumor sample, and are presented as normalized expression values (TPM). We also perform a correlation analysis of the gene expression profile of the input sample with the profiles seen in other reference collections (The Cancer Genome Atlas, DepMap cell lines, TreeHouse pediactric cancers). +If data is provided, PCGR reports may include a section on gene expression, where we provide an analysis of expression outliers with respect to reference cohorts (TCGA). The expression levels are based on bulk RNA-seq data from the tumor sample, and are presented as normalized expression values (TPM). We also perform a correlation analysis of the gene expression profile of the input sample with the profiles seen in other reference collections (The Cancer Genome Atlas, DepMap cell lines, TreeHouse pediatric cancers), and estimate the fractions of immune cell types in the tumor sample. ::: diff --git a/pcgrr/inst/templates/pcgr_quarto_report/expression.qmd b/pcgrr/inst/templates/pcgr_quarto_report/expression.qmd index ce83ce60..f7b05571 100644 --- a/pcgrr/inst/templates/pcgr_quarto_report/expression.qmd +++ b/pcgrr/inst/templates/pcgr_quarto_report/expression.qmd @@ -7,7 +7,7 @@ ``` -

+
```{r expression_outliers} #| eval: !expr as.logical(NROW(pcg_report$content$expression$outliers) > 0) == TRUE @@ -16,7 +16,7 @@ ``` -

+
```{r expression_immune_contexture} #| eval: !expr as.logical(NROW(pcg_report$content$expression$immune_contexture) > 0) == TRUE diff --git a/pcgrr/inst/templates/pcgr_quarto_report/expression/expression_outliers.qmd b/pcgrr/inst/templates/pcgr_quarto_report/expression/expression_outliers.qmd index 8fdcb8e1..171132b5 100644 --- a/pcgrr/inst/templates/pcgr_quarto_report/expression/expression_outliers.qmd +++ b/pcgrr/inst/templates/pcgr_quarto_report/expression/expression_outliers.qmd @@ -1,16 +1,26 @@ ### Gene expression outliers -Here, we show the gene expression outliers identified in the RNA sample, as compared to the selected reference cohort. Expression outliers are defined as genes with expression levels located at the extreme ends of the distributions observed in the reference cohort, specifically those with: +Here, we show the protein-coding gene expression outliers identified in the RNA input sample, as compared to the selected reference cohort. We consider expression outliers as genes with expression levels (specifically log-transformed TPM values, see below) located at the extreme ends of the distributions observed in the reference cohort, specifically those with: -* A TPM in the top or bottom percentiles (i.e. above 90 or below 10), and either -* A TPM that is more than 1.5 * IQR above the third quartile (increased expression) of the distribution seen in the reference cohort -* A TPM that is less than 1.5 * IQR below the first quartile (decreased expression) of the distribution seen in the reference cohort +1) An expression level in the top or bottom percentiles (i.e. 
above 95 or below 5), _and_ +2) An expression level that is more than $Q3 + 1.5 \times IQR$ (increased expression) or less than $Q1 - 1.5 \times IQR$ (decreased expression), _and_ +3) An expression level with a Z-score above 2 or below -2 -__TPM__: Transcripts per Million, +__TPM__: Transcripts per Million __IQR__: Interquartile range +__Q1__: First quartile +__Q3__: Third quartile -We only consider protein-coding genes, and limit the ones shown to cancer-associated genes (including -tumor suppressors, oncogenes, drivers etc), as well as targets of approved cancer drugs. +Outlier genes shown here are furthermore limited to the ones that have some evidence of being cancer-associated (including, yet not limited to, tumor suppressors, oncogenes, and predicted cancer driver genes), as well as targets of approved cancer drugs. Maximum __N = 1,000__ genes are listed here for each category (increased/reduced expression) + +::: {.callout-note} + +## Note - gene expression unit + +In the table below, column __EXPR_LEVEL__ indicates the expression level of a gene in the input sample as +_log2(TPM + 0.001)_ + +::: ::: {.callout-warning} @@ -22,12 +32,76 @@ A comparison of gene expression levels identified in the input sample to those f
-```{r expression_outliers} -#| eval: !expr as.logical(NROW(pcg_report$content$expression$outliers) > 0) == TRUE +```{r check_expression_outliers_data} +#| eval: true +#| output: asis + +outliers_high_present <- F +outliers_high <- data.frame() +outliers_low_present <- F +outliers_low <- data.frame() + +if(!is.null(pcg_report$content$expression$outliers)){ + if(NROW(pcg_report$content$expression$outliers) > 0){ + + outliers_high <- pcg_report$content$expression$outliers |> + dplyr::filter(.data$EXPR_OUTLIER == "Increased expression") |> + dplyr::arrange( + dplyr::desc(.data$Z_SCORE) + ) |> + head(1000) + + if(NROW(outliers_high) > 0){ + outliers_high_present <- T + } + + outliers_low <- pcg_report$content$expression$outliers |> + dplyr::filter(.data$EXPR_OUTLIER == "Reduced expression") |> + dplyr::arrange( + dplyr::desc(abs(.data$Z_SCORE)) + ) |> + head(1000) + + if(NROW(outliers_low) > 0){ + outliers_low_present <- T + } + } + +} + + +``` + + +::: {.panel-tabset} + +```{r outliers_high_expression} #| output: asis +#| eval: !expr outliers_high_present == TRUE + +source_name <- 'Outliers - increased expression' +cat('\n\n#### ',source_name,'\n\n
') + +outliers_high_shared <- + crosstalk::SharedData$new( + outliers_high) + + +filter_set1 <- list() +filter_set2 <- list() + +filter_set1[[1]] <- crosstalk::filter_select( + "SYMBOL", "Gene", outliers_high_shared, ~SYMBOL) +filter_set2[[1]] <- crosstalk::filter_slider( + "Z_SCORE", "Z-score", outliers_high_shared, ~Z_SCORE) + +crosstalk::bscols( + filter_set1, + filter_set2 +) DT::datatable( - pcg_report$content$expression$outliers, + outliers_high_shared, escape = F, extensions = c("Buttons","Responsive"), options = list( @@ -44,17 +118,74 @@ DT::datatable( ) ) |> DT::formatStyle( - "TPM_GENE", + "EXPR_OUTLIER", "PERCENTILE", color = "white", backgroundColor = DT::styleInterval( - pcgrr::color_palette$exp_increase$breaks, - pcgrr::color_palette$exp_increase$values + pcgrr::color_palette$expression_outlier_high$breaks, + pcgrr::color_palette$expression_outlier_high$values ) - ) + ) +``` + + +```{r outliers_low_expression} +#| output: asis +#| eval: !expr outliers_low_present == TRUE + +source_name <- 'Outliers - reduced expression' +cat('\n\n#### ',source_name,'\n\n
') + +outliers_low_shared <- + crosstalk::SharedData$new( + outliers_low) + +filter_set1 <- list() +filter_set2 <- list() + +filter_set1[[1]] <- crosstalk::filter_select( + "SYMBOL", "Gene", outliers_low_shared, ~SYMBOL) +filter_set2[[1]] <- crosstalk::filter_slider( + "Z_SCORE", "Z-score", outliers_low_shared, ~Z_SCORE) + +crosstalk::bscols( + filter_set1, + filter_set2 +) + +DT::datatable( + outliers_low_shared, + escape = F, + extensions = c("Buttons","Responsive"), + options = list( + buttons = c('csv','excel'), + dom = 'Bfrtip')) |> + DT::formatStyle( + "SYMBOL", + "GLOBAL_ASSOC_RANK", + color = "white", + backgroundColor = + DT::styleInterval( + pcgrr::color_palette$cancer_assoc$breaks, + pcgrr::color_palette$cancer_assoc$values + ) + ) |> + DT::formatStyle( + "EXPR_OUTLIER", + "PERCENTILE", + color = "white", + backgroundColor = + DT::styleInterval( + pcgrr::color_palette$expression_outlier_low$breaks, + pcgrr::color_palette$expression_outlier_low$values + ) + ) + ``` +::: +

diff --git a/pcgrr/inst/templates/pcgr_quarto_report/expression/expression_similarity.qmd b/pcgrr/inst/templates/pcgr_quarto_report/expression/expression_similarity.qmd index 44447a27..9d7df4ce 100644 --- a/pcgrr/inst/templates/pcgr_quarto_report/expression/expression_similarity.qmd +++ b/pcgrr/inst/templates/pcgr_quarto_report/expression/expression_similarity.qmd @@ -1,6 +1,6 @@ ### Similarity to other tumor samples -Here we show how the gene expression profile of the input RNA sample correlates with samples harvested from other sources. The similarity is calculated using the Spearman's correlation (protein-coding genes only). The tables below show the correlation coefficient for each (external) sample, along with the corresponding diagnosis, tissue/site, sample type, age category etc. The tables are sorted by the correlation coefficient in descending order, and only the top 500 samples are shown per source. +Here we show how the gene expression profile of the input RNA sample correlates with samples harvested from other sources. The similarity is calculated using the Spearman's correlation (protein-coding genes only). The tables below show the correlation coefficient for each (external) sample, along with the corresponding diagnosis, tissue/site, sample type, age category etc. The tables are sorted by the correlation coefficient in descending order, and only the top 200 samples are shown per source. ::: {.panel-tabset} @@ -8,7 +8,7 @@ Here we show how the gene expression profile of the input RNA sample correlates #| output: asis #| eval: !expr as.logical(TRUE %in% stringr::str_detect(names(pcg_report[['content']]$expression[['similarity_analysis']]), 'tcga')) == TRUE -source_name <- 'The Cancer Genome Atlas (TCGA)' +source_name <- 'Primary tumors - TCGA' cat('\n\n#### ',source_name,'\n\n
') dt_similarity_tcga <- data.frame() @@ -20,6 +20,12 @@ for(cohort in names(pcg_report[['content']]$expression[['similarity_analysis']]) pcg_report[['content']]$expression[['similarity_analysis']][[cohort]] |> dplyr::mutate(CORR = round(CORR, digits = 5)) |> dplyr::select(-c("EXT_SAMPLE_ID2","PROTEIN_CODING_ONLY")) |> + dplyr::mutate( + EXT_SAMPLE_NAME = + glue::glue( + "{EXT_SAMPLE_NAME}") + ) |> + dplyr::select(-c("EXT_LINK")) |> dplyr::select( EXT_SAMPLE_ID, EXT_SAMPLE_NAME, @@ -40,14 +46,27 @@ for(cohort in names(pcg_report[['content']]$expression[['similarity_analysis']]) dt_similarity_tcga <- dt_similarity_tcga |> dplyr::arrange(desc(CORR)) |> - head(500) + head(200) + +highlighted_site <- pcg_report$settings$conf$sample_properties$site +all_sites <- unique(dt_similarity_tcga$EXT_PRIMARY_SITE) +other_sites <- all_sites[ !all_sites == highlighted_site] DT::datatable( dt_similarity_tcga, escape = F, options = list(pageLength = 10), - extensions=c("Buttons","Responsive")) - + extensions=c("Buttons","Responsive")) |> + DT::formatStyle( + c('EXT_PRIMARY_SITE'), + fontWeight = 'bold', + color = "white", + backgroundColor = DT::styleEqual( + c(highlighted_site, other_sites), + c(pcg_report$settings$conf$report_color, + rep(pcgrr::color_palette$none, length(other_sites))) + ) + ) ``` @@ -56,7 +75,7 @@ DT::datatable( #| output: asis #| eval: !expr as.logical("depmap" %in% names(pcg_report[['content']]$expression[['similarity_analysis']])) == TRUE -source_name <- 'Cancer Dependency Map - DepMap' +source_name <- 'Cell lines - DepMap' cat('\n\n#### ',source_name,'\n\n
') dt_similarity_depmap <- @@ -66,8 +85,9 @@ dt_similarity_depmap <- dplyr::mutate( EXT_SAMPLE_NAME = glue::glue( - "{EXT_SAMPLE_NAME}") + "{EXT_SAMPLE_NAME}") ) |> + dplyr::select(-c("EXT_LINK")) |> dplyr::select( EXT_SAMPLE_ID, EXT_SAMPLE_NAME, @@ -80,13 +100,28 @@ dt_similarity_depmap <- dplyr::everything() ) |> dplyr::arrange(desc(CORR)) |> - head(500) + head(200) + +highlighted_site <- pcg_report$settings$conf$sample_properties$site +all_sites <- unique(dt_similarity_depmap$EXT_PRIMARY_SITE) +other_sites <- all_sites[ !all_sites == highlighted_site] DT::datatable( dt_similarity_depmap, escape = F, options = list(pageLength = 10), - extensions=c("Buttons","Responsive")) + extensions=c("Buttons","Responsive")) |> + DT::formatStyle( + c('EXT_PRIMARY_SITE'), + fontWeight = 'bold', + color = "white", + backgroundColor = DT::styleEqual( + c(highlighted_site, other_sites), + c(pcg_report$settings$conf$report_color, + rep(pcgrr::color_palette$none, length(other_sites))) + ) + ) + ``` @@ -113,13 +148,27 @@ dt_similarity_treehouse <- dplyr::everything() ) |> dplyr::arrange(desc(CORR)) |> - head(500) + head(200) + +highlighted_site <- pcg_report$settings$conf$sample_properties$site +all_sites <- unique(dt_similarity_treehouse$EXT_PRIMARY_SITE) +other_sites <- all_sites[ !all_sites == highlighted_site] DT::datatable( dt_similarity_treehouse, escape = F, options = list(pageLength = 10), - extensions=c("Buttons","Responsive")) + extensions=c("Buttons","Responsive")) |> + DT::formatStyle( + c('EXT_PRIMARY_SITE'), + fontWeight = 'bold', + color = "white", + backgroundColor = DT::styleEqual( + c(highlighted_site, other_sites), + c(pcg_report$settings$conf$report_color, + rep(pcgrr::color_palette$none, length(other_sites))) + ) + ) ``` diff --git a/pcgrr/inst/templates/pcgr_quarto_report/germline.qmd b/pcgrr/inst/templates/pcgr_quarto_report/germline.qmd index 267ece26..ef599c08 100644 --- a/pcgrr/inst/templates/pcgr_quarto_report/germline.qmd +++ 
b/pcgrr/inst/templates/pcgr_quarto_report/germline.qmd @@ -1,153 +1,63 @@ -## Germline findings (CPSR) - -* Based on a germline report of the query case using the [Cancer Predisposition Sequencing Reporter (CPSR)](https://github.com/sigven/cpsr), we here list the main findings from the report with respect -to variants in cancer predisposition genes -* Pathogenic, likely pathogenic variants, and variants of uncertain signinificance (VUS) are listed, -for variants with existing classifications in ClinVar, and for _novel_ variants (not found in ClinVar) - -```{r prepare_cpsr_data, echo = F, eval = T} - -cpsr_rep <- pcg_report[['content']][['cpsr']][['report']] - -class543_germline_variants <- dplyr::bind_rows( - cpsr_rep$content$snv_indel$disp$class5, - cpsr_rep$content$snv_indel$disp$class4, - cpsr_rep$content$snv_indel$disp$class3) - -cpsr_clinvar_variants <- data.frame() -cpsr_other_variants <- data.frame() - -if(NROW(class543_germline_variants) > 0){ - - class543_germline_variants <- class543_germline_variants |> - dplyr::mutate(GENOTYPE = as.character(GENOTYPE)) - - cpsr_clinvar_variants <- class543_germline_variants |> - dplyr::filter(CPSR_CLASSIFICATION_SOURCE == "ClinVar") - - if(NROW(cpsr_clinvar_variants) > 0){ - cpsr_clinvar_variants <- cpsr_clinvar_variants |> - dplyr::select(-c(CPSR_CLASSIFICATION,CPSR_PATHOGENICITY_SCORE, - CPSR_CLASSIFICATION_DOC,CPSR_CLASSIFICATION_CODE)) |> - dplyr::select(SYMBOL, CLINVAR_CLASSIFICATION, - dplyr::everything()) - } - - - cpsr_other_variants <- class543_germline_variants |> - dplyr::filter(CPSR_CLASSIFICATION_SOURCE == "Other") - - if(NROW(cpsr_other_variants) > 0){ - cpsr_other_variants <- cpsr_other_variants |> - dplyr::select(-c(CLINVAR, - CLINVAR_REVIEW_STATUS_STARS, - CLINVAR_VARIANT_ORIGIN, - CLINVAR_CLASSIFICATION, - CLINVAR_CONFLICTED, - CLINVAR_PHENOTYPE)) |> - dplyr::select(SYMBOL, CPSR_CLASSIFICATION, - dplyr::everything()) - } - -} - -cpsr_biomarker_eitems <- data.frame() - -for(n in 
c('diagnostic','prognostic','predictive','predisposing')){ - if(NROW(cpsr_rep[['content']][['snv_indel']][['clin_eitem']][[n]]) > 0){ - cpsr_biomarker_eitems <- cpsr_biomarker_eitems |> - dplyr::bind_rows( - cpsr_rep[['content']][['snv_indel']][['clin_eitem']][[n]] - ) |> - dplyr::select(SYMBOL, GENE_NAME, BM_CANCER_TYPE, BM_EVIDENCE_TYPE, - BM_EVIDENCE_LEVEL, dplyr::everything()) +## Germline findings + +```{r prepare_panel_url} +#| echo: false +#| results: asis + +panel_link <- pcg_report[['content']][['germline_classified']][['panel_info']][['description']] +if(pcg_report[['content']][['germline_classified']][['panel_info']][['panel_id']] != "-1" & + !stringr::str_detect(pcg_report[['content']][['germline_classified']][['panel_info']][['url']], ",")){ + description <- pcg_report[['content']][['germline_classified']][['panel_info']][['description']] + description_trait <- + pcg_report[['content']][['germline_classified']][['panel_info']][['description_trait']] + url_raw <- pcg_report[['content']][['germline_classified']][['panel_info']][['url']] + description_full <- paste0(description,': ', description_trait) + if(pcg_report[['content']][['germline_classified']][['panel_info']][['panel_id']] == "0"){ + description_full <- description } + panel_link <- paste0("", + description_full, + "") } -n_biomarkers <- cpsr_biomarker_eitems |> NROW() - -tag_gnomad <- cpsr_rep[['metadata']][['config']][['popgen']][['vcftag_gnomad']] -desc_gnomad <- cpsr_rep[['metadata']][['config']][['popgen']][['popdesc_gnomad']] -formula_gnomad <- as.formula(paste0("~",rlang::sym(tag_gnomad))) - - -``` - -

CPSR - sample metadata

- -* Sample name  :   `r cpsr_rep[['metadata']][['sample_name']]`  -
- -

CPSR - virtual gene panel

- -Cancer predisposition geneset subject to analysis/screening in the CPSR report: - - * __[`r cpsr_rep[['metadata']][['gene_panel']][['name']]`](`r cpsr_rep[['metadata']][['gene_panel']][['url']]`)__ - * Diagnostic-grade genes (applicable to Genomics England panels only): __`r cpsr_rep[['metadata']][['config']][['diagnostic_grade_only']]`__ - * `r cpsr_rep[['metadata']][['gene_panel']][['confidence']]` - -```{r cpsr_gene_selection, echo = F, eval = T} -tiles_html <- pcgrr::virtual_panel_display_html(gene_df = cpsr_rep[['metadata']][['gene_panel']][['genes']]) ``` -`r tiles_html` -

+* Based on a germline variant analysis of the query case using the [Cancer Predisposition Sequencing Reporter (CPSR)](https://github.com/sigven/cpsr), we here list the variants of clinical significance in cancer predisposition genes, both novel (not recorded in ClinVar), and those with existing classifications in ClinVar. + * Virtual panel of cancer predisposition genes screened: `r panel_link` + * Variants of uncertain significance shown: __`r !pcg_report$settings$conf$germline$ignore_vus`__ -

CPSR - ClinVar variants

-* For sample __`r cpsr_rep[['metadata']][['sample_name']]`__, a total of n = __`r NROW(cpsr_clinvar_variants)`__ **germline variants** are registered with a Pathogenic/Likely pathogenic/VUS clinical significance in ClinVar. - - -```{r cpsr_clinvar_findings, echo=F, results = 'asis', eval = NROW(cpsr_clinvar_variants) > 0} +```{r cpsr_clinvar_findings} +#| echo: false +#| eval: true +#| output: asis cat('\n') htmltools::br() -if(NROW(cpsr_clinvar_variants) > 2000){ - cat('NOTE - only considering top 2000 variants (due to limitations with client-side tables)
',sep="\n") +germline_calls <- pcg_report[['content']][['germline_classified']][['callset']] + +if(NROW(germline_calls$variant_display) > 100){ + cat('NOTE - only considering top 100 variants (due to limitations with client-side tables)
',sep="\n") cat('
') - cpsr_clinvar_variants <- - head(cpsr_clinvar_variants, 2000) + germline_calls$variant_display <- + head(germline_calls$variant_display, 100) } -cpsr_clinvar_variants_ctalk <- crosstalk::SharedData$new(cpsr_clinvar_variants) -crosstalk::bscols( - list( - crosstalk::filter_select("CLINVAR_CLASSIFICATION", "Clinical significance", cpsr_clinvar_variants_ctalk, ~CLINVAR_CLASSIFICATION), - crosstalk::filter_select("CONSEQUENCE", "Consequence", cpsr_clinvar_variants_ctalk, ~CONSEQUENCE), - crosstalk::filter_checkbox("GENOTYPE", "Genotype", cpsr_clinvar_variants_ctalk, ~GENOTYPE), - crosstalk::filter_select("SYMBOL", "Gene", cpsr_clinvar_variants_ctalk, ~SYMBOL), - crosstalk::filter_slider("GERP_SCORE","Genomic conservation score (GERP)",cpsr_clinvar_variants_ctalk, ~GERP_SCORE, - min = -12.3, max = 6.17, ticks = T), - crosstalk::filter_slider(tag_gnomad, paste0("MAF gnomAD (",desc_gnomad,")"), cpsr_clinvar_variants_ctalk, formula_gnomad, sep="",ticks = F) - ), - list( - crosstalk::filter_select("miRNA_TARGET_HIT", "miRNA target gain/loss", cpsr_clinvar_variants_ctalk, ~miRNA_TARGET_HIT), - crosstalk::filter_select("TF_BINDING_SITE_VARIANT", "TF binding site alteration", cpsr_clinvar_variants_ctalk, ~TF_BINDING_SITE_VARIANT), - crosstalk::filter_select("CLINVAR_PHENOTYPE", "ClinVar phenotype(s)", cpsr_clinvar_variants_ctalk, ~CLINVAR_PHENOTYPE), - crosstalk::filter_slider("CLINVAR_REVIEW_STATUS_STARS", "ClinVar review status stars", cpsr_clinvar_variants_ctalk, ~CLINVAR_REVIEW_STATUS_STARS, min = 0, max = 4, step = 1, ticks = T), - crosstalk::filter_select("CLINVAR_CONFLICTED", "ClinVar conflicting interpretations", cpsr_clinvar_variants_ctalk, ~CLINVAR_CONFLICTED) - ) -) - -htmltools::br() -htmltools::br() - -DT::datatable( - cpsr_clinvar_variants_ctalk, +germline_dt <- DT::datatable( + germline_calls$variant_display, escape = F, extensions = c("Buttons","Responsive"), options = list( - pageLength = 8, + pageLength = 10, scrollCollapse = T, buttons = 
c('csv','excel'), dom = 'Bfrtip' - ) -) |> + )) |> DT::formatStyle( - columns = c("SYMBOL"), - valueColumns = c("CLINVAR_CLASSIFICATION"), + columns = c("SYMBOL","ALTERATION"), + valueColumns = c("CLINICAL_SIGNIFICANCE"), color = "white", backgroundColor = DT::styleEqual( @@ -156,130 +66,26 @@ DT::datatable( ) ) -htmltools::br() -htmltools::br() - -``` - -

CPSR - Non-ClinVar variants

- -* For sample __`r cpsr_rep[['metadata']][['sample_name']]`__, a total of n = __`r NROW(cpsr_other_variants)`__ non-ClinVar **germline variants** -are classified with a Pathogenic/Likely pathogenic/VUS clinical significance according to CPSR. - - -```{r cpsr_other_findings, echo=F, results = 'asis', eval = NROW(cpsr_other_variants) > 0} - -cat('\n') -htmltools::br() - -if(NROW(cpsr_other_variants) > 2000){ - cat('NOTE - only considering top 2000 variants (due to limitations with client-side tables)
',sep="\n") - cat('
') - cpsr_other_variants <- - head(cpsr_other_variants, 2000) -} - -cpsr_other_variants_ctalk <- crosstalk::SharedData$new(cpsr_other_variants) -crosstalk::bscols( - list( - crosstalk::filter_select("CPSR_CLASSIFICATION", "Clinical significance", cpsr_other_variants_ctalk, ~CPSR_CLASSIFICATION), - crosstalk::filter_select("CONSEQUENCE", "Consequence", cpsr_other_variants_ctalk, ~CONSEQUENCE), - crosstalk::filter_checkbox("GENOTYPE", "Genotype", cpsr_other_variants_ctalk, ~GENOTYPE), - crosstalk::filter_select("SYMBOL", "Gene", cpsr_other_variants_ctalk, ~SYMBOL), - crosstalk::filter_slider("GERP_SCORE","Genomic conservation score (GERP)",cpsr_other_variants_ctalk, ~GERP_SCORE, - min = -12.3, max = 6.17, ticks = T) - ), - list( - crosstalk::filter_select("miRNA_TARGET_HIT", "miRNA target gain/loss", cpsr_other_variants_ctalk, ~miRNA_TARGET_HIT), - crosstalk::filter_select("TF_BINDING_SITE_VARIANT", "TF binding site alteration", cpsr_other_variants_ctalk, ~TF_BINDING_SITE_VARIANT), - crosstalk::filter_select("CPSR_CLASSIFICATION_CODE", "CPSR classification (ACMG criteria codes)", cpsr_other_variants_ctalk, ~CPSR_CLASSIFICATION_CODE), - crosstalk::filter_slider("CPSR_PATHOGENICITY_SCORE", "CPSR pathogenicity score", cpsr_other_variants_ctalk, ~CPSR_PATHOGENICITY_SCORE, step = 0.5, ticks = T), - crosstalk::filter_slider(tag_gnomad, paste0("MAF gnomAD (",desc_gnomad,")"), cpsr_other_variants_ctalk, formula_gnomad, sep="",ticks = F) +bslib::page_fillable( + bslib::card( + bslib::card_header( + class = "bg-dark", + paste0("Germline variants - ", + pcg_report[['content']][['germline_classified']]$sample_id) + ), + bslib::card_body( + height = min(500, 150 + NROW(germline_calls$variant_display) * 80), + if(NROW(germline_calls$variant_display) > 0){ + germline_dt + }else{ + "NO cancer-predisposing variants of clinical significance were found in the query case (CPSR report)." 
+ } + ) ) ) -htmltools::br() -htmltools::br() -DT::datatable( - cpsr_other_variants_ctalk, - escape = F, - extensions = c("Buttons","Responsive"), - options = list( - pageLength = 8, - scrollCollapse = T, - buttons = c('csv','excel'), - dom = 'Bfrtip' - ) -) |> - DT::formatStyle( - columns = c("SYMBOL"), - valueColumns = c("CPSR_CLASSIFICATION"), - color = "white", - backgroundColor = DT::styleEqual( - c("Pathogenic", "Likely_Pathogenic","VUS"), - c("#9E0142","#D53E4F","#000000")) - ) - -htmltools::br() -htmltools::br() ``` -

CPSR - Biomarkers

- -* CPSR detected __n = `r NROW(cpsr_biomarker_eitems)`__ biomarker evidence items that overlap **germline variants** in the input sample. - -```{r cpsr_biomarker_note, echo=F, results = "asis", include = NROW(cpsr_biomarker_eitems) > 0} - -cat('NOTE: Reported biomarkers in CIViC are mapped at different resolutions (i.e. filter Biomarker resolution). The accuracy of a match between variants in the tumor sample and the reported biomarkers will vary accordingly (highlighted by gene symbols with different color backgrounds):\n\n') - -cat('
  • Biomarker match at the exact variant/codon level
  • ') -cat(paste0('
  • Biomarker match at the exon/gene level
\n')) - -htmltools::br() -``` - - -```{r cpsr_list_biomarker_eitems, echo=F, results = 'asis', eval = NROW(cpsr_biomarker_eitems) > 0} - - -cat('The table below lists all variant-evidence item associations:',sep='\n') -htmltools::br() - -DT::datatable( - cpsr_biomarker_eitems, - escape=F, - extensions = - c("Buttons","Responsive"), - options = - list(buttons = c('csv','excel'), - pageLength = 5, - dom = 'Bfrtip') - ) |> - DT::formatStyle( - 'BM_EVIDENCE_LEVEL', - backgroundColor = DT::styleEqual( - c('A: Validated', - 'A: FDA/NCCN/ELN guidelines', - 'B: Clinical evidence', - 'B1: Clinical evidence: late trials', - 'B2: Clinical evidence: early trials', - 'C: Case study', - 'D: Preclinical evidence', - 'E: Indirect evidence'), - c("#009E73","#009E73","#56B4E9", - "#56B4E9","#56B4E9","#0072B2", - "#E69F00", "#F0E442"))) |> - DT::formatStyle( - color="white", "SYMBOL", - "BM_RESOLUTION", - fontWeight = 'bold', `text-align` = 'center', - backgroundColor = DT::styleEqual( - c('exact','codon','exon','gene'), c('#000','#000', cpsr_rep[['metadata']][['color_palette']][['warning']][['values']][1], cpsr_rep[['metadata']][['color_palette']][['warning']][['values']][1]))) - - - -``` - -

+
diff --git a/pcgrr/inst/templates/pcgr_quarto_report/kataegis.qmd b/pcgrr/inst/templates/pcgr_quarto_report/kataegis.qmd index 9efe6c61..77e5b2d0 100644 --- a/pcgrr/inst/templates/pcgr_quarto_report/kataegis.qmd +++ b/pcgrr/inst/templates/pcgr_quarto_report/kataegis.qmd @@ -1,6 +1,6 @@ ## Kataegis events -Kataegis describes a pattern of localized hypermutations identified in some cancer genomes, in which a large number of highly-patterned basepair mutations occur in a small region of DNA (ref Wikipedia). Kataegis is prevalently seen among breast cancer patients, and it is also exists in lung cancers, cervical, head and neck, and bladder cancers, as shown in the results from tracing APOBEC mutation signatures (ref Wikipedia). PCGR implements the kataegis detection algorithm outlined in the [KataegisPortal R package](https://github.com/MeichunCai/KataegisPortal). +Kataegis describes a pattern of localized hypermutations identified in some cancer genomes, in which [a large number of highly-patterned basepair mutations occur in a small region of DNA](https://en.wikipedia.org/wiki/Kataegis). Kataegis is prevalently seen among breast cancer patients, and it also exists in lung cancers, cervical, head and neck, and bladder cancers, as shown in the results from tracing APOBEC mutation signatures (ref Wikipedia). PCGR implements the kataegis detection algorithm outlined in the [KataegisPortal R package](https://github.com/MeichunCai/KataegisPortal). 
Explanation of key columns in the resulting table of potential kataegis events: @@ -12,11 +12,21 @@ Explanation of key columns in the resulting table of potential kataegis events: - 3 - high confidence with three or more hypermutations with weight.C>X >= 0.8 in a chromosome) -```{r mutsigs_kataegis, echo = F, eval = T} - -df <- data.frame('sample_id' = character(), 'chrom' = character(), 'start' = integer(), 'end' = integer(), - 'chrom.arm' = character(), 'length' = integer(), 'number.mut' = integer(), 'weight.C>X' = numeric(), - 'confidence' = integer(), stringsAsFactors = F) +```{r mutsigs_kataegis} +#| echo: false +#| eval: true + +df <- data.frame( + 'sample_id' = character(), + 'chrom' = character(), + 'start' = integer(), + 'end' = integer(), + 'chrom.arm' = character(), + 'length' = integer(), + 'number.mut' = integer(), + 'weight.C>X' = numeric(), + 'confidence' = integer(), + stringsAsFactors = F) if(is.data.frame(pcg_report$content$kataegis$events)){ df <- pcg_report$content$kataegis$events diff --git a/pcgrr/inst/templates/pcgr_quarto_report/msi.qmd b/pcgrr/inst/templates/pcgr_quarto_report/msi.qmd index ee67bf18..435f0bbe 100644 --- a/pcgrr/inst/templates/pcgr_quarto_report/msi.qmd +++ b/pcgrr/inst/templates/pcgr_quarto_report/msi.qmd @@ -2,7 +2,7 @@ Microsatellite instability (MSI) is the result of impaired DNA mismatch repair and constitutes a cellular phenotype of clinical significance in many cancer types, most prominently colorectal cancers, stomach cancers, endometrial cancers, and ovarian cancers ([Cortes-Ciriano et al., 2017](https://www.ncbi.nlm.nih.gov/pubmed/28585546)). We have built a statistical MSI classifier that only considers features of the somatic mutation profile (e.g. _fraction of indels_, _load of indels_, _mutations in MMR genes_ etc.) in order to separate _MSI.H_ (MSI-high) from _MSS_ (MS stable) tumors. -The MSI classifier was trained using 999 exome-sequenced TCGA tumor samples with known MSI status (i.e. 
assayed from mononucleotide markers), and obtained a [positive predictive value](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values#Positive_predictive_value) of 98.9% and a [negative predictive value](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values#Negative_predictive_value) of 98.8% on an independent test set of 427 samples. Details of the MSI classification approach can be found here. +The MSI classifier was trained using __N = 1,065__ exome-sequenced TCGA tumor samples with known MSI status (i.e. assayed from mononucleotide markers), and obtained a [positive predictive value](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values#Positive_predictive_value) of 97.9% and a [negative predictive value](https://en.wikipedia.org/wiki/Positive_and_negative_predictive_values#Negative_predictive_value) of 99.4% on an independent test set of __N = 435 samples__. Details of the MSI classification approach can be found here.
diff --git a/pcgrr/inst/templates/pcgr_quarto_report/mutational_signature.qmd b/pcgrr/inst/templates/pcgr_quarto_report/mutational_signature.qmd index bf288c7d..a6131725 100644 --- a/pcgrr/inst/templates/pcgr_quarto_report/mutational_signature.qmd +++ b/pcgrr/inst/templates/pcgr_quarto_report/mutational_signature.qmd @@ -152,7 +152,8 @@ cat("

") cat("\n\n::: {.callout-important}\n\n## Suboptimal fit of mutational profile\n\n ", "\nReconstruction of the input mutational profile with the selected COSMIC reference", - " signatures resulted in a sub-optimal fit (cosine similarity < 0.9)", + " signatures resulted in a sub-optimal fit (cosine similarity < 0.9). ", + "A re-run with all reference signatures included is warranted (option `--all_reference_signatures`)", "\n\n:::\n\n", sep="") @@ -257,6 +258,7 @@ pie_aetiologies <- plotly::plot_ly( plotly::layout( legend = list(orientation = "h", font = t)) +pie_aetiologies$x$layout$margin$b <- 5 bslib::page_fillable( bslib::layout_columns( col_widths = c(4,8), @@ -288,7 +290,7 @@ bslib::page_fillable( ) ), bslib::card( - height = "340px", + height = "380px", bslib::card_header( class = "bg-dark", paste0("Key underlying aetiologies - ", diff --git a/pcgrr/inst/templates/pcgr_quarto_report/settings.qmd b/pcgrr/inst/templates/pcgr_quarto_report/settings.qmd index eea562f0..556ce394 100644 --- a/pcgrr/inst/templates/pcgr_quarto_report/settings.qmd +++ b/pcgrr/inst/templates/pcgr_quarto_report/settings.qmd @@ -33,6 +33,10 @@ if(as.logical(conf$assay_properties$vcf_tumor_only) == TRUE){ success_color <- pcgrr::color_palette$success +include_germline_findings <- 'OFF' +if(pcg_report$content$germline_classified$eval == T){ + include_germline_findings <- 'ON' +} if(pcg_report$content$msi$eval == F){ msi_prediction <- 'OFF' } @@ -64,9 +68,10 @@ The report is generated with _[pcgr](https://github.com/sigven/pcgr)_ version `r * Minimum sequencing depth (DP) control (SNV + InDels): __`r conf[['somatic_snv']][['allelic_support']][['control_dp_min']]`__ * Maximum allelic fraction (AF) control (SNV + InDels): __`r conf[['somatic_snv']][['allelic_support']][['control_af_max']]`__ * Variant actionability guidelines: __AMP/ASCO/CAP__ -* Variant oncogenicity guidelines: __VICC/ClinGen__ +* Variant oncogenicity guidelines: __ClinGen/CGC/VICC__ * Show noncoding variants: __`r 
conf$other$show_noncoding `__ * MSI prediction: __`r msi_prediction`__ +* Include germline findings: __`r include_germline_findings`__ * Mutational burden estimation: __`r mutational_burden`__ * TMB algorithm: __`r paste0('TMB_',conf$somatic_snv[['tmb']][['tmb_display']])`__ * Minimum total copy number for copy number gains: __`r conf$somatic_cna$n_copy_gain`__ diff --git a/pcgrr/inst/templates/pcgr_quarto_report/snv_indel/actionability.qmd b/pcgrr/inst/templates/pcgr_quarto_report/snv_indel/actionability.qmd index b160940b..0856d361 100644 --- a/pcgrr/inst/templates/pcgr_quarto_report/snv_indel/actionability.qmd +++ b/pcgrr/inst/templates/pcgr_quarto_report/snv_indel/actionability.qmd @@ -162,7 +162,7 @@ DT::datatable( `text-align` = 'center', backgroundColor = DT::styleEqual( c('genomic','hgvsp','codon', - 'exon','gene_region_mut', + 'exon','gene_region_mut','other', 'gene_lof','gene_mut'), c(pcgrr::color_palette$bg_dark, pcgrr::color_palette$bg_dark, @@ -170,6 +170,7 @@ DT::datatable( pcgrr::color_palette[['warning']], pcgrr::color_palette[['warning']], pcgrr::color_palette[['warning']], + pcgrr::color_palette[['warning']], pcgrr::color_palette[['warning']]) ) ) @@ -327,13 +328,14 @@ DT::datatable( backgroundColor = DT::styleEqual( c('hgvsp','genomic','codon', 'exon','gene_region_mut', - 'gene_lof','gene_mut'), + 'gene_lof','gene_mut','other'), c(pcgrr::color_palette$bg_dark, pcgrr::color_palette$bg_dark, pcgrr::color_palette$bg_dark, pcgrr::color_palette[['warning']], pcgrr::color_palette[['warning']], pcgrr::color_palette[['warning']], + pcgrr::color_palette[['warning']], pcgrr::color_palette[['warning']]) ) ) diff --git a/pcgrr/inst/templates/pcgr_quarto_report/snv_indel/oncogenicity.qmd b/pcgrr/inst/templates/pcgr_quarto_report/snv_indel/oncogenicity.qmd index 72d7d9d4..1535f27f 100644 --- a/pcgrr/inst/templates/pcgr_quarto_report/snv_indel/oncogenicity.qmd +++ b/pcgrr/inst/templates/pcgr_quarto_report/snv_indel/oncogenicity.qmd @@ -1,6 +1,8 @@ ### Variant 
classification - oncogenicity -Annotations for all short variants detected in the tumor sample can here be interrogated, e.g. with respect to gene, variant consequence, variant allelic fraction/sequencing depth (if provided as input), and also with respect to _predicted oncogenicity_. For the latter, PCGR implements standard operating procedures for oncogenicity evaluation developed by ClinGen/VICC [@Horak2022-uh]. The tabset below provides an overview of coding (protein-altering) versus non-coding variants found in the tumor sample. +Annotations for short variants (SNVs/InDels) detected in the tumor sample can here be interrogated, e.g. with respect to gene, variant consequence, variant allelic fraction/sequencing depth (if provided as input), and also with respect to _predicted oncogenicity_. For the latter, PCGR implements standard operating procedures for oncogenicity evaluation developed by ClinGen/CGC/VICC [@Horak2022-uh]. + +The tabset below provides an overview of coding (protein-altering) versus non-coding variants found in the tumor sample. In addition to predicted oncogenicity, we also explicitly highlight tumor variants that are registered with a  pathogenic/likely pathogenic classification in ClinVar   , considering variants of any origin (somatic/germline), and associated with any condition/phenotype. ::: {.callout-note} @@ -14,7 +16,7 @@ In order to maintain responsiveness of client-side data interaction, only the to ## Note - oncogenicity versus actionability -The oncogenicity classification of variants performed in PCGR is based on an implementation of the ClinGen/VICC SOPs for variant oncogenicity evaluation. This classification is based on multiple properties of variants (hotspot occurrence, variant consequence (i.e. loss-of-function), population frequency etc.). Notably, the current implementation may provide classifications that do not always correspond with clinical actionability, e.g. 
due to the fact the functional variant evidence (frequently requiring manual curation) is limited in the reference data coming with PCGR. This implies that the oncogenic classification is somewhat conservative in its current form, and that variants classified as VUS when it comes to oncogenicity by PCGR (yet still with a relatively high _oncogenicity score_), may still be listed as actionable below. When more functional variant evidence gets integrated in PCGR (e.g. from [multiplexed assays of variant effect (MAVE) efforts](https://mavedb.org/#/)), the concordance between oncogenicity and actionability is likely to improve further. +The oncogenicity classification of variants performed in PCGR is based on an implementation of the ClinGen/CGC/VICC SOPs for variant oncogenicity evaluation. This classification is based on multiple properties of variants (hotspot occurrence, variant consequence (i.e. loss-of-function), population frequency etc.). Notably, the current implementation may provide classifications that do not always correspond with clinical actionability, e.g. due to the fact that the functional variant evidence (frequently requiring manual curation) is limited in the reference data coming with PCGR. This implies that the oncogenic classification is somewhat conservative in its current form, and that variants classified as VUS when it comes to oncogenicity by PCGR (yet still with a relatively high _oncogenicity score_), may still be listed as actionable below. When more functional variant evidence gets integrated in PCGR (e.g. from [multiplexed assays of variant effect (MAVE) efforts](https://mavedb.org/#/)), the concordance between oncogenicity and actionability is likely to improve further. 
::: @@ -43,6 +45,33 @@ coding_variant_set <- ) |> head(1000) + +if(NROW(coding_variant_set[!is.na(coding_variant_set$ONCOGENICITY) & + coding_variant_set$ONCOGENICITY == "VUS",]) > 0){ + coding_variant_set <- coding_variant_set |> + dplyr::mutate(onc_vus_path_clinvar = dplyr::if_else( + !is.na(ONCOGENICITY) & + ONCOGENICITY == "VUS" & + !is.na(CLINVAR_CLASSIFICATION) & + stringr::str_detect( + tolower(CLINVAR_CLASSIFICATION), + "pathogenic" + ), + as.integer(1), + as.integer(0) + )) + + coding_variant_set[!is.na(coding_variant_set$ONCOGENICITY) & + coding_variant_set$ONCOGENICITY == "VUS",] <- + coding_variant_set[!is.na(coding_variant_set$ONCOGENICITY) & + coding_variant_set$ONCOGENICITY == "VUS",] |> + dplyr::arrange(dplyr::desc(onc_vus_path_clinvar), + dplyr::desc(ONCOGENICITY_SCORE)) + coding_variant_set$onc_vus_path_clinvar <- NULL + +} + + # if(expression_present_snv == FALSE){ # coding_variant_set$consTPM <- NULL # } @@ -131,6 +160,16 @@ dt_all_coding <- DT::datatable( pcgrr::color_palette$cancer_assoc$breaks, pcgrr::color_palette$cancer_assoc$values ) + ) |> + DT::formatStyle( + color = "black", + "ALTERATION", + "CLINVAR_CLASSIFICATION", + backgroundColor = + DT::styleEqual( + c("Pathogenic","Likely_Pathogenic"), + c("#FF8790", "#FF8790") + ) ) # if(expression_present_snv == TRUE){ @@ -172,7 +211,8 @@ noncoding_variant_set <- dplyr::select(-c("PROTEIN_CHANGE","PROTEIN_DOMAIN", "MUTATION_HOTSPOT", "MUTATION_HOTSPOT_CANCERTYPE", - "HGVSp","PREDICTED_EFFECT", + "HGVSc_RefSeq", + "PREDICTED_EFFECT", "TARGETED_INHIBITORS", "TARGETED_INHIBITORS_ALL")) |> dplyr::arrange( @@ -182,6 +222,72 @@ noncoding_variant_set <- ) |> head(1000) + +if(NROW(noncoding_variant_set[!is.na(noncoding_variant_set$ONCOGENICITY) & + noncoding_variant_set$ONCOGENICITY == "VUS",]) > 0){ + noncoding_variant_set <- noncoding_variant_set |> + dplyr::mutate(onc_vus_path_clinvar = dplyr::if_else( + !is.na(.data$ONCOGENICITY) & + .data$ONCOGENICITY == "VUS" & + 
!is.na(.data$CLINVAR_CLASSIFICATION) & + stringr::str_detect( + tolower(.data$CLINVAR_CLASSIFICATION), + "pathogenic" + ), + as.integer(1), + as.integer(0) + )) + + noncoding_variant_set[!is.na(noncoding_variant_set$ONCOGENICITY) & + noncoding_variant_set$ONCOGENICITY == "VUS",] <- + noncoding_variant_set[!is.na(noncoding_variant_set$ONCOGENICITY) & + noncoding_variant_set$ONCOGENICITY == "VUS",] |> + dplyr::arrange(dplyr::desc(onc_vus_path_clinvar), + dplyr::desc(ONCOGENICITY_SCORE)) + noncoding_variant_set$onc_vus_path_clinvar <- NULL + +} + + +# if(expression_present_snv == FALSE){ +# coding_variant_set$consTPM <- NULL +# } +# +# coding_variants_shared <- +# crosstalk::SharedData$new(coding_variant_set) +# +# +# +# +# +# +# +# +# +# noncoding_variant_set <- +# pcg_report$content$snv_indel$callset$variant_display |> +# dplyr::filter( +# .data$CODING_STATUS == "noncoding" & +# .data$CONSEQUENCE != "intergenic_variant") |> +# dplyr::select( +# dplyr::any_of( +# pcgrr::dt_display$snv_indel_tier3 +# ) +# ) |> +# dplyr::select(-c("PROTEIN_CHANGE","PROTEIN_DOMAIN", +# "MUTATION_HOTSPOT", +# "MUTATION_HOTSPOT_CANCERTYPE", +# "HGVSc_RefSeq", +# "PREDICTED_EFFECT", +# "TARGETED_INHIBITORS", +# "TARGETED_INHIBITORS_ALL")) |> +# dplyr::arrange( +# dplyr::desc(.data$ONCOGENICITY_SCORE), +# dplyr::desc(.data$TISSUE_ASSOC_RANK), +# dplyr::desc(.data$GLOBAL_ASSOC_RANK), +# ) |> +# head(1000) + # if(expression_present_snv == FALSE){ # noncoding_variant_set$consTPM <- NULL # } @@ -286,15 +392,40 @@ dt_all_noncoding #| eval: !expr as.logical(pcg_report$settings$conf$assay_properties$vcf_tumor_only) == TRUE cat("\n#### Filtered variants\n\n") -cat("The table below lists all the variants that have been subject to exclusion/filtering", - " from the raw input callset, as an effort to minimize the presence of known\n\n", - " germline events and sequencing artifacts. 
The filtering has been performed based on the", "following criteria:\n\n") +cat("The table below lists all the variants that have been subject to", + " exclusion/filtering (according to filtering settings indicated above)", + " from the raw input callset, as an effort to minimize the presence of known", + " germline events and sequencing artifacts.\n\n") ```
+```{r num_filtered_variants_shown} +#| output: asis +#| eval: !expr as.logical(pcg_report$settings$conf$assay_properties$vcf_tumor_only) == TRUE + +if(NROW(pcg_report$content$snv_indel$callset$variant_unfiltered) > 0 & + "SOMATIC_CLASSIFICATION" %in% colnames(pcg_report$content$snv_indel$callset$variant_unfiltered)){ + + filtered_calls <- + pcg_report$content$snv_indel$callset$variant_unfiltered |> + dplyr::filter(.data$SOMATIC_CLASSIFICATION != "SOMATIC") |> + dplyr::rename(EXCLUSION_CRITERIA = "SOMATIC_CLASSIFICATION") + + if(NROW(filtered_calls) > 2000){ + cat("\n::: {.callout-warning}\n## Filtered variant set too large for display\n\n", + "The number of variants subject to filtering (N = ", NROW(filtered_calls), ") ", + "is too large (N > 2,000) to display in an interactive table. See the output TSV file ", + "with all filtered (and unfiltered variants) for the full list.\n\n", + ":::\n\n", sep="") + } +} + +``` + + ```{r filtered_variants_datatable} #| output: asis #| eval: !expr as.logical(pcg_report$settings$conf$assay_properties$vcf_tumor_only) == TRUE @@ -307,7 +438,7 @@ if(NROW(pcg_report$content$snv_indel$callset$variant_unfiltered) > 0 & dplyr::filter(.data$SOMATIC_CLASSIFICATION != "SOMATIC") |> dplyr::rename(EXCLUSION_CRITERIA = "SOMATIC_CLASSIFICATION") - if(NROW(filtered_calls) > 0){ + if(NROW(filtered_calls) > 0 & NROW(filtered_calls) <= 2000 ){ filtered_calls <- filtered_calls |> dplyr::select( dplyr::any_of( @@ -327,7 +458,7 @@ if(NROW(pcg_report$content$snv_indel$callset$variant_unfiltered) > 0 & color = "white", "EXCLUSION_CRITERIA", "EXCLUSION_CRITERIA", - backgroundColor = pcgrr::color_palette$report_color$values[1] + backgroundColor = pcgrr::color_palette$report_color$values[2] ) } } diff --git a/pcgrr/inst/templates/pcgr_quarto_report/snv_indel/variant_statistics.qmd b/pcgrr/inst/templates/pcgr_quarto_report/snv_indel/variant_statistics.qmd index abebdf69..bac50713 100644 --- 
a/pcgrr/inst/templates/pcgr_quarto_report/snv_indel/variant_statistics.qmd +++ b/pcgrr/inst/templates/pcgr_quarto_report/snv_indel/variant_statistics.qmd @@ -1,54 +1,14 @@ ```{r prep_data} #| eval: !expr as.logical(pcg_report$content$snv_indel$vstats$n > 0) == TRUE -hole_size <- 0.4 -pie_line_width <- 3 -consequence_stats <- - pcg_report$content$snv_indel$callset$variant_display |> - dplyr::mutate(CONSEQUENCE = stringr::str_replace_all( - CONSEQUENCE, "(, [0-9A-Za-z_]{1,}){1,}$","" - )) |> - dplyr::group_by(CONSEQUENCE) |> - dplyr::summarise( - N = dplyr::n(), - .groups = "drop" - ) |> - dplyr::mutate(Pct = N / sum(N) * 100) |> - dplyr::mutate(CONSEQUENCE = dplyr::if_else( - Pct < 3, "other_consequence", - as.character(CONSEQUENCE) - )) |> - dplyr::group_by(CONSEQUENCE) |> - dplyr::summarise( - N = sum(N), - Pct = sum(Pct), - .groups = "drop" - ) |> - dplyr::arrange(dplyr::desc(N)) - -vartype_stats <- - pcg_report$content$snv_indel$callset$variant_display |> - dplyr::group_by(VARIANT_CLASS) |> - dplyr::summarise( - N = dplyr::n(), - .groups = "drop" - ) |> - dplyr::mutate(Pct = N / sum(N) * 100) |> - dplyr::arrange(dplyr::desc(Pct)) - -coding_stats <- - pcg_report$content$snv_indel$callset$variant_display |> - dplyr::group_by(CODING_STATUS) |> - dplyr::summarise( - N = dplyr::n(), - .groups = "drop" - ) |> - dplyr::mutate(Pct = N / sum(N) * 100) |> - dplyr::arrange(dplyr::desc(Pct)) - -#if(min) +variant_stats <- pcgrr::get_variant_statistics( + var_df = pcg_report$content$snv_indel$callset$variant_display, + pct_other_limit = 3.5 +) +hole_size <- 0.4 +pie_line_width <- 3 t <- list( family = "Helvetica", size = 15) @@ -56,14 +16,14 @@ t <- list( varstats_plots_plotly <- list() varstats_plots_plotly[['coding']] <- plotly::plot_ly( - coding_stats, + variant_stats[['coding']], marker = list( colors = pcgrr::color_palette$tier$values, line = list( color = '#FFFFFF', width = pie_line_width))) |> plotly::add_pie( - coding_stats, + variant_stats[['coding']], labels =~ 
factor(CODING_STATUS), values = ~N, textinfo = "Pct", @@ -74,14 +34,14 @@ varstats_plots_plotly[['coding']] <- varstats_plots_plotly[['consequence']] <- plotly::plot_ly( - consequence_stats, + variant_stats[['consequence']], marker = list( colors = pcgrr::color_palette$tier$values, line = list( color = '#FFFFFF', width = pie_line_width))) |> plotly::add_pie( - consequence_stats, + variant_stats[['consequence']], labels =~ factor(CONSEQUENCE), values = ~N, textinfo = "Pct", @@ -90,16 +50,40 @@ varstats_plots_plotly[['consequence']] <- plotly::layout( legend = list(orientation = "h", font = t)) +varstats_plots_plotly[['consequence_coding']] <- NULL +if(NROW(variant_stats[['consequence_coding']]) > 0) { + varstats_plots_plotly[['consequence_coding']] <- + plotly::plot_ly( + variant_stats[['consequence_coding']], + marker = list( + colors = pcgrr::color_palette$tier$values, + line = list( + color = '#FFFFFF', + width = pie_line_width))) |> + plotly::add_pie( + variant_stats[['consequence_coding']], + labels =~ factor(CONSEQUENCE), + values = ~N, + textinfo = "Pct", + type = 'pie', + hole = hole_size) |> + plotly::layout( + legend = list(orientation = "h", font = t)) + + varstats_plots_plotly[['consequence_coding']]$x$layout$margin$b <- 5 + +} + varstats_plots_plotly[['type']] <- plotly::plot_ly( - vartype_stats, + variant_stats[['variant_class']], marker = list( colors = pcgrr::color_palette$tier$values, line = list( color = '#FFFFFF', width = pie_line_width))) |> plotly::add_pie( - vartype_stats, + variant_stats[['variant_class']], labels =~ factor(VARIANT_CLASS), values = ~N, textinfo = "Pct", @@ -108,36 +92,24 @@ varstats_plots_plotly[['type']] <- plotly::layout( legend = list(orientation = "h", font = t)) +varstats_plots_plotly[['type']]$x$layout$margin$b <- 5 +varstats_plots_plotly[['consequence']]$x$layout$margin$b <- 5 + ``` -```{r prep_vboxes} +```{r display_varstats} #| output: asis - -statbox <- function( - value = "A value", - title = "A title", - theme = 
"purple", - height = "90px"){ - return( - bslib::value_box( - height = height, - value = value, - title = title, - showcase = NULL, - theme = theme - ) - ) - -} +#| echo: false +#| eval: true bslib::page_fillable( bslib::layout_columns( - height = "350px", + height = "365px", bslib::card( full_screen = TRUE, - height = "350px", + height = "365px", bslib::card_header( class = "bg-dark", paste0("Variant statistics | coding effect - ", @@ -146,7 +118,7 @@ bslib::page_fillable( ), bslib::card( full_screen = TRUE, - height = "350px", + height = "365px", bslib::card_header( class = "bg-dark", paste0("Variant statistics | type - ", @@ -158,8 +130,8 @@ bslib::page_fillable( full_screen = TRUE, height = "400px", bslib::card_header( - class = "bg-dark", - paste0("Variant statistics | consequence type - ", + class = "bg-dark", + paste0("Variant statistics | any consequence type - ", pcg_report$settings$sample_id)), varstats_plots_plotly[['consequence']] ) @@ -168,7 +140,28 @@ bslib::page_fillable( ``` -```{r prep_plots} +```{r display_coding_cons_varstats} +#| output: asis +#| echo: false +#| eval: !expr as.logical(is.null(varstats_plots_plotly[['consequence_coding']])) == FALSE + +bslib::page_fillable( + bslib::card( + full_screen = TRUE, + height = "400px", + bslib::card_header( + class = "bg-dark", + paste0("Variant statistics | coding consequence type - ", + pcg_report$settings$sample_id)), + varstats_plots_plotly[['consequence_coding']] + ) +) + + +``` + + +```{r prep_vaf_plot} #| echo: false #| eval: !expr as.logical(pcg_report$settings$conf$somatic_snv$allelic_support$tumor_af_tag == "_NA_") == FALSE @@ -252,14 +245,15 @@ rfdat$data$type <- factor( rfdat$data$type, levels = type_stats$x) genomic_dist_plot <- ggplot2::ggplot( - rfdat$data, ggplot2::aes(x = location, y = distance)) + + rfdat$data, ggplot2::aes(x = location, y = dist2prev, z = variant_id)) + ggplot2::geom_point( ggplot2::aes(colour = type), cex = rfdat$cex) + ggplot2::geom_vline( xintercept = 
as.vector(rfdat$chr_cum), linetype="dotted") + - ggplot2::annotate("text", x = rfdat$intercept, - y = rfdat$ylim, - label = rfdat$labels, + ggplot2::annotate("text", x = rfdat$chrom_midpoints, + #y = rfdat$ylim, + y = 1e+09, + label = rfdat$chrom_labels, cex = rfdat$cex_text) + ggplot2::xlab("Genomic Location") + ggplot2::ylab("Genomic Distance") + @@ -278,8 +272,15 @@ genomic_dist_plot <- ggplot2::ggplot( axis.text.x = ggplot2::element_blank()) + ggplot2::guides(colour = ggplot2::guide_legend(nrow = 1)) -genomic_dist_plot_plotly <- plotly::ggplotly(genomic_dist_plot) +genomic_dist_plot_plotly <- plotly::ggplotly(genomic_dist_plot, tooltip = c("y","z")) genomic_dist_plot_plotly$x$layout$legend$title$text <- "" +genomic_dist_plot_plotly <- genomic_dist_plot_plotly |> + plotly::layout( + legend = list( + orientation = "h", + x = 0.27, + y = -0.30)) + ``` diff --git a/pcgrr/man/data_coltype_defs.Rd b/pcgrr/man/data_coltype_defs.Rd index 335abf6d..d2a87409 100644 --- a/pcgrr/man/data_coltype_defs.Rd +++ b/pcgrr/man/data_coltype_defs.Rd @@ -5,7 +5,7 @@ \alias{data_coltype_defs} \title{List of coltype definitions for input files to pcgrr (e.g. VCF-converted TSV, CNA TVS etc.)} \format{ -An object of class \code{list} of length 3. +An object of class \code{list} of length 5. 
} \usage{ data_coltype_defs diff --git a/pcgrr/man/get_variant_statistics.Rd b/pcgrr/man/get_variant_statistics.Rd new file mode 100644 index 00000000..5bcec21b --- /dev/null +++ b/pcgrr/man/get_variant_statistics.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/variant_stats.R +\name{get_variant_statistics} +\alias{get_variant_statistics} +\title{Function that computes various variant statistics from a data frame +with variant records} +\usage{ +get_variant_statistics(var_df = NULL, pct_other_limit = 4) +} +\arguments{ +\item{var_df}{data frame with variants} + +\item{pct_other_limit}{numeric value specifying the percentage limit +for the 'Other' category} +} +\description{ +Function that computes various variant statistics from a data frame +with variant records +} diff --git a/pcgrr/man/load_cpsr_classified_variants.Rd b/pcgrr/man/load_cpsr_classified_variants.Rd new file mode 100644 index 00000000..ceb62c47 --- /dev/null +++ b/pcgrr/man/load_cpsr_classified_variants.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/input_data.R +\name{load_cpsr_classified_variants} +\alias{load_cpsr_classified_variants} +\title{Function that reads CPSR-classified variants from a TSV file} +\usage{ +load_cpsr_classified_variants( + fname_cpsr_tsv = NA, + fname_cpsr_yaml = NA, + cols = NULL, + ignore_vus = FALSE, + ref_data = NULL +) +} +\arguments{ +\item{fname_cpsr_tsv}{Path to raw input file with CPSR-classified SNVs/InDels} + +\item{fname_cpsr_yaml}{Path to YAML configuration file for CPSR analysis} + +\item{cols}{column type definitions of raw input file} + +\item{ignore_vus}{logical indicating if VUS should be ignored in report} + +\item{ref_data}{PCGR reference data object} +} +\description{ +Function that reads CPSR-classified variants from a TSV file +} diff --git a/pcgrr/man/load_expression_outliers.Rd b/pcgrr/man/load_expression_outliers.Rd index eecb9598..2fcd05c2 
100644 --- a/pcgrr/man/load_expression_outliers.Rd +++ b/pcgrr/man/load_expression_outliers.Rd @@ -7,8 +7,9 @@ load_expression_outliers( settings = NULL, ref_data = NULL, - percentile_cutoff_high = 90, - percentile_cutoff_low = 10 + percentile_cutoff_high = 95, + percentile_cutoff_low = 5, + z_score_cutoff = 1.5 ) } \arguments{ @@ -19,6 +20,8 @@ load_expression_outliers( \item{percentile_cutoff_high}{numeric, percentile cutoff for high expression} \item{percentile_cutoff_low}{numeric, percentile cutoff for low expression} + +\item{z_score_cutoff}{numeric, z-score cutoff for expression outliers} } \description{ Load expression outlier results diff --git a/pcgrr/man/load_somatic_cna.Rd b/pcgrr/man/load_somatic_cna.Rd index ce3bec29..ba657a63 100644 --- a/pcgrr/man/load_somatic_cna.Rd +++ b/pcgrr/man/load_somatic_cna.Rd @@ -2,19 +2,26 @@ % Please edit documentation in R/input_data.R \name{load_somatic_cna} \alias{load_somatic_cna} -\title{Function that reads and validates a fully annotated CNA file from PCGR -pre-report pipeline} +\title{Function that reads and validates fully annotated CNA data (segments and genes) +from PCGR pre-reporting pipeline} \usage{ -load_somatic_cna(fname, ref_data = NULL, settings = NULL) +load_somatic_cna( + fname_cna_segment = NULL, + fname_cna_gene = NULL, + ref_data = NULL, + settings = NULL +) } \arguments{ -\item{fname}{Path to file with pre-processed CNA segments} +\item{fname_cna_segment}{Path to file with pre-processed CNA segments} + +\item{fname_cna_gene}{Path to file with pre-processed CNA gene-level data} \item{ref_data}{PCGR reference data object} \item{settings}{PCGR run/configuration settings} } \description{ -Function that reads and validates a fully annotated CNA file from PCGR -pre-report pipeline +Function that reads and validates fully annotated CNA data (segments and genes) +from PCGR pre-reporting pipeline } diff --git a/pcgrr/man/plot_cna_segments.Rd b/pcgrr/man/plot_cna_segments.Rd new file mode 100644 index 
00000000..dc4090b8 --- /dev/null +++ b/pcgrr/man/plot_cna_segments.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cna.R +\name{plot_cna_segments} +\alias{plot_cna_segments} +\title{Plot allele-specific copy number segments} +\usage{ +plot_cna_segments( + chrom_coordinates = NULL, + cna_segment = NULL, + cna_gene = NULL +) +} +\arguments{ +\item{chrom_coordinates}{data frame with assembly-specific chromosome coordinate data (length etc)} + +\item{cna_segment}{data frame with annotated copy number segments} + +\item{cna_gene}{data frame with gene-level copy number data} +} +\description{ +Function that plots allele-specific copy number segments +(minor + total allele copies) +} diff --git a/pcgrr/man/tsv_cols.Rd b/pcgrr/man/tsv_cols.Rd index 2df63f46..796b5a52 100644 --- a/pcgrr/man/tsv_cols.Rd +++ b/pcgrr/man/tsv_cols.Rd @@ -5,7 +5,7 @@ \alias{tsv_cols} \title{TSV columns} \format{ -An object of class \code{list} of length 2. +An object of class \code{list} of length 3. } \usage{ tsv_cols diff --git a/pcgrr/man/write_report_tsv.Rd b/pcgrr/man/write_report_tsv.Rd index 7fdaa5fb..b5cd7d59 100644 --- a/pcgrr/man/write_report_tsv.Rd +++ b/pcgrr/man/write_report_tsv.Rd @@ -11,7 +11,7 @@ write_report_tsv(report = NULL, output_type = "snv_indel") \item{report}{List object with all report data, settings etc.} \item{output_type}{character indicating output type for TSV, -i.e. 'snv_indel' or 'cna_gene', 'msigs'} +i.e. 'snv_indel', 'snv_indel_unfiltered', 'cna_gene', or 'msigs'} } \description{ param report List object with all report data (PCGR/CPSR), settings etc. 
diff --git a/pcgrr/pkgdown/index.md b/pcgrr/pkgdown/index.md index 9c9226c8..8b157f5b 100644 --- a/pcgrr/pkgdown/index.md +++ b/pcgrr/pkgdown/index.md @@ -4,13 +4,13 @@ editor_options: wrap: 72 --- -# Personal Cancer Genome Reporter (PCGR) +# Personal Cancer Genome Reporter (PCGR) The Personal Cancer Genome Reporter (PCGR) is a stand-alone software package for functional annotation and translation of individual tumor genomes for precision cancer medicine. It interprets primarily somatic SNVs/InDels and copy number aberrations, and has additional support for interpretation of bulk RNA-seq expression data. The software classifies variants both with respect to _oncogenicity_, and _actionability_. Interactive HTML output reports allow the user to interrogate the clinical impact of the molecular findings in an individual tumor. - Variant classification - - according to *oncogenicity*: evaluating the oncogenic potential of somatic DNA aberrations ([VICC/CGC/ClinGen guidelines](https://pubmed.ncbi.nlm.nih.gov/35101336/)) + - according to *oncogenicity*: evaluating the oncogenic potential of somatic DNA aberrations ([ClinGen/CGC/VICC guidelines](https://pubmed.ncbi.nlm.nih.gov/35101336/)) - according to *actionability*: mapping the therapeutic, diagnostic, and prognostic implications of somatic DNA aberrations ([AMP/ASCO/CAP guidelines](https://pubmed.ncbi.nlm.nih.gov/27993330/)) - Tumor mutational burden (TMB) estimation - Mutational signature analysis @@ -25,23 +25,29 @@ Example screenshots from the [quarto](https://quarto.org)-based cancer genome re ![PCGR screenshot 2](img/sc1.png) ![PCGR screenshot 3](img/sc3.png) -PCGR originates from the [Norwegian Cancer Genomics Consortium (NCGC)](http://cancergenomics.no), at the [Institute for Cancer Research, Oslo University Hospital, Norway](http://radium.no). 
+PCGR originates from the [Norwegian Cancer Genomics Consortium (NCGC)](https://cancergenomics.no), at the [Institute for Cancer Research, Oslo University Hospital, Norway](https://radium.no). ### Top News + +- *September 29th 2024*: **2.1.0 release** + - updated bundle, more oncogenic variants, CNA visualization, + improved RNA-seq support, bug fixes, and more + - [CHANGELOG](https://sigven.github.io/pcgr/articles/CHANGELOG.html) + - *August 1st 2024*: **2.0.3 release** - patch to fix purity/ploidy propagation, MAF output for tumor-only runs, and other minor issues - - [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html) + - [CHANGELOG](https://sigven.github.io/pcgr/articles/CHANGELOG.html) - *July 16th 2024*: **2.0.2 release** - patch to ensure correct reference to actionability guidelines - - [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html) + - [CHANGELOG](https://sigven.github.io/pcgr/articles/CHANGELOG.html) - *July 7th 2024*: **2.0.1 release** - patch with bug fix for mitochondrial input variants ([pr245](https://github.com/sigven/pcgr/pull/245)) - - [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html) + - [CHANGELOG](https://sigven.github.io/pcgr/articles/CHANGELOG.html) - *June 2024*: **2.0.0 release** - - Details in [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html) + - Details in [CHANGELOG](https://sigven.github.io/pcgr/articles/CHANGELOG.html) - Massive reference data bundle upgrade, new report layout, oncogenicity classification++ - Support for Singularity/Apptainer - Major data/software updates: @@ -52,19 +58,9 @@ PCGR originates from the [Norwegian Cancer Genomics Consortium (NCGC)](http://ca - CancerMine `v50` (2023-03) - UniProt KB `v2024_03` -- *February 2023*: **1.3.0 release** - - Details in [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html) - - prioritize protein-coding BIOTYPE csq ([pr201](https://github.com/sigven/pcgr/pull/201)) - - expose `--pcgrr_conda` option to 
flexibly activate pcgrr env via a non-default pcgrr name - - `cpsr_validate_input.py`: refactor for efficient custom gene egrep - -- *November 2022*: **1.2.0 release** - - Keep only autosomal, X, Y, M/MT chromosomes - - Import bcftools as dependency - ## Example reports -[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.12752833.svg)](https://doi.org/10.5281/zenodo.12752833) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.13855988.svg)](https://doi.org/10.5281/zenodo.13855988) ## Why use PCGR? @@ -94,10 +90,13 @@ PCGR integrates a [comprehensive set of knowledge resources](articles/annotation ## Citation -If you use PCGR, please cite our publication: +If you use PCGR or CPSR, please cite our publications: Sigve Nakken, Ghislain Fournous, Daniel Vodák, Lars Birger Aaasheim, Ola Myklebost, and Eivind Hovig. **Personal Cancer Genome Reporter: variant interpretation report for precision oncology** (2017). *Bioinformatics*. 34(10):1778--1780. [doi.org/10.1093/bioinformatics/btx817](https://doi.org/10.1093/bioinformatics/btx817) +Sigve Nakken, Vladislav Saveliev, Oliver Hofmann, Pål Møller, Ola Myklebost, and Eivind Hovig. **Cancer Predisposition Sequencing Reporter (CPSR): a flexible variant report engine for high-throughput germline screening in cancer** (2021). *Int J Cancer*. [doi.org/10.1002/ijc.33749](https://doi.org/10.1002/ijc.33749) + + ## Contact sigven AT ifi.uio.no diff --git a/pcgrr/vignettes/CHANGELOG.Rmd b/pcgrr/vignettes/CHANGELOG.Rmd index 2ce8dd76..6332f29c 100644 --- a/pcgrr/vignettes/CHANGELOG.Rmd +++ b/pcgrr/vignettes/CHANGELOG.Rmd @@ -46,6 +46,34 @@ sigven <- user("sigven") pdiakumis <- user("pdiakumis") ``` +## v2.1.0 + +- Date: **2024-09-27** +- Major data updates + - ClinVar (2024-09) + - dbNSFP (v4.8) + - NCI Thesaurus `v24.07e` + - CIViC (2024-09-18) +- Reduction (~15%) in overall data bundle size - removed unused data files (e.g.
expression counts) +- Fixed bug in MAF output for tumor-only runs `r issue(250)` (also ensures that non-exonic variants are +excluded if setting `--exclude_nonexonic` is used) +- Fixed bug in annotation of splice site mutation hotspots (e.g. MET exon 14 skipping) +- Highlighted variants with known pathogenic/likely pathogenic clinical significance in ClinVar (regardless of phenotype and variant origin) in the variant oncogenicity section of the HTML report +- Created interactive visualization support for allele-specific copy number data (HTML report) +- Slight change to the default transcript consequence pick order in VEP based on observations of prioritized transcripts (*mane_select > mane_plus_clinical > canonical > biotype > ccds > rank > tsl > appris > length*) +- Pulled in known oncogenic variants from ClinVar (assessed through ClinGen/CGC/VICC SOP, oncogenic/likely oncogenic) into the variant oncogenicity assessment algorithm +- Added option `--no_html` to disable HTML report generation +- Added option `--input_cpsr` - re-offering the possibility to integrate CPSR-classified germline variants in the PCGR HTML report +- Added `HGVSc_RefSeq` as output column in TSV/HTML - using MANE Select RefSeq transcript identifiers (works primarily for grch38) +- Pulled in coding sequence start annotation for protein-coding transcripts from GENCODE, enabling a more useful annotation of promoter variants (e.g. 
TERT) +- Created new column `ALTERATION` in variant tables of HTML report, a combination of `HGVSp`, `HGVSc` (if `HGVSp` not available) +- New output file for tumor-only runs, the complete set of calls, filtered and unfiltered, in a TSV file +- Re-processed all RNA-seq reference cohorts (TCGA, DepMap, TreeHouse), ensuring that all cohorts are using the same unit (`log2(TPM+0.001)`) +- Separated outlier gene expression results into separate tabs in the HTML report, +added them to Excel workbook output +- Added section on kataegis events in the HTML report +- Fixed bug in plotting of reference TMB distributions for different TMB algorithms (`--tmb_display` option) + ## v2.0.3 - Date: **2024-08-01** diff --git a/pcgrr/vignettes/annotation_resources.Rmd b/pcgrr/vignettes/annotation_resources.Rmd index 2fd79ad1..2d97b618 100644 --- a/pcgrr/vignettes/annotation_resources.Rmd +++ b/pcgrr/vignettes/annotation_resources.Rmd @@ -7,22 +7,22 @@ output: rmarkdown::html_document * [VEP](http://www.ensembl.org/info/docs/tools/vep/index.html) - Variant Effect Predictor release 112 ([GENCODE v46](https://www.gencodegenes.org/human/) as gene reference database (v19 for grch37)) ### *Insilico* predictions of effect of coding variants - * [dBNSFP](https://sites.google.com/site/jpopgen/dbNSFP) - database of non-synonymous functional predictions (v4.5, November 2023) + * [dbNSFP](https://sites.google.com/site/jpopgen/dbNSFP) - database of non-synonymous functional predictions (v4.8, June 2024) ### Variant frequency databases * [gnomAD](http://exac.broadinstitute.org/) - germline variant frequencies exome-wide (r2.1, October 2018) * [dbSNP](http://www.ncbi.nlm.nih.gov/SNP/) - database of short genetic variants (build 154) * [Cancer Hotspots](http://cancerhotspots.org) - a resource for statistically significant mutations in cancer (v2, 2017) - * [TCGA](https://portal.gdc.cancer.gov/) - somatic mutations discovered across 33 tumor type cohorts (release 39.0, December 2023) + * 
[TCGA](https://portal.gdc.cancer.gov/) - somatic mutations discovered across 33 tumor type cohorts (release 41.0, August 2024) ### Variant databases of clinical utility - * [ClinVar](http://www.ncbi.nlm.nih.gov/clinvar/) - database of clinically related variants (June 2024) - * [CIViC](https://civicdb.org) - clinical interpretations of variants in cancer (June 21st 2024) + * [ClinVar](http://www.ncbi.nlm.nih.gov/clinvar/) - database of clinically related variants (September 2024) + * [CIViC](https://civicdb.org) - clinical interpretations of variants in cancer (September 18th 2024) * [CGI](http://www.cancergenomeinterpreter.org/biomarkers) - Cancer Genome Interpreter Cancer Biomarkers Database (CGI) (October 18th 2022) ### Protein domains/functional features - * [UniProt/SwissProt KnowledgeBase](http://www.uniprot.org) - resource on protein sequence and functional information (2024_03) - * [Pfam](https://www.ebi.ac.uk/interpro/entry/pfam/#table) - database of protein families and domains (v35.0, November 2021) + * [UniProt/SwissProt KnowledgeBase](http://www.uniprot.org) - resource on protein sequence and functional information (2024_04) + * [Pfam](https://www.ebi.ac.uk/interpro/entry/pfam/#table) - database of protein families and domains (v37.0) ### Knowledge resources on gene and protein targets * [CancerMine](https://zenodo.org/records/7689627) - Literature-mined database of tumor suppressor genes/proto-oncogenes (v50, March 2023) @@ -34,15 +34,13 @@ output: rmarkdown::html_document __Genomic biomarkers__ -Genomic biomarkers included in PCGR are limited to the following: +Genomic biomarkers utilized in PCGR are currently limited to the following: * Evidence items for specific markers in CIViC must be *accepted* (*submitted* evidence items are not considered or shown) -* Markers reported at the exact variant level (e.g. __BRAF p.V600E__, __MET c.3028+1G>T, __g.7:140753336A>T__) +* Markers reported at the exact variant level (e.g. 
__BRAF p.V600E__, __MET c.3028+1G>T__, __g.7:140753336A>T__) * Markers reported at the codon level (e.g. __KRAS p.G12__) * Markers reported at the exon level (e.g. __KIT exon 11 mutation__, __EGFR exon 19 deletion__) * Markers reported at the gene level (e.g. __BRAF mutation__, __TP53 loss-of-function mutation__, __BRCA1 oncogenic mutation__) -* Within the [Cancer bioMarkers database (CGI)](https://www.cancergenomeinterpreter.org/biomarkers), only markers collected from FDA/NCCN guidelines, scientific literature, and clinical trials are included (markers collected from conference abstracts etc. are not included) +* Within the [Cancer bioMarkers database (CGI)](https://www.cancergenomeinterpreter.org/biomarkers), only biomarkers curated from FDA/NCCN guidelines, scientific literature, and clinical trials are included (biomarkers collected from conference abstracts etc. are not included) * Copy number gains/losses - -See also comment on a [closed GitHib issue](https://github.com/sigven/pcgr/issues/37#issuecomment-391966286) - +* RNA fusion and gene expression biomarkers are included in the PCGR reference databundle, but are not currently utilized in the PCGR biomarker matching procedure diff --git a/pcgrr/vignettes/input.Rmd b/pcgrr/vignettes/input.Rmd index e75a1418..1a9c3621 100644 --- a/pcgrr/vignettes/input.Rmd +++ b/pcgrr/vignettes/input.Rmd @@ -8,8 +8,8 @@ output: rmarkdown::html_document The PCGR workflow accepts three main input files: * An unannotated, single-sample [VCF file](https://github.com/samtools/hts-specs#variant-calling-data-files) (>= v4.2) with called somatic variants (SNVs/InDels) - * An allele-specific copy number segment file - * A file with transcript/gene expression levels + * A file with allele-specific copy number segments (tab-separated values - TSV) + * A file with transcript/gene expression levels (tab-separated values - TSV) The input VCF is a required input file, while the somatic copy number file and gene expression file are 
optional. The following arguments to the `pcgr` command are used for input files: @@ -17,7 +17,7 @@ The input VCF is a required input file, while the somatic copy number file and g * `--input_cna` * `--input_rna_expression` -In addition to these three main input files, the user can also opt to provide a [panel-of-normals VCF](#panel-of-normals-pon-vcf) file, as well as a [CPSR report (JSON)](#cpsr-report), as input. +In addition to these three main input files, the user can also opt to provide a [panel-of-normals VCF](#panel-of-normals-pon-vcf) file (`--pon_vcf`) for tumor-only variant filtering, as well as an input file with [CPSR-classified germline variants (TSV)](#germline-variants) (`--input_cpsr`). ### VCF @@ -25,9 +25,9 @@ In addition to these three main input files, the user can also opt to provide a * If the input VCF contains [multi-allelic sites](https://glow.readthedocs.io/en/latest/etl/variant-splitter.html), these will be subject to [decomposition](http://genome.sph.umich.edu/wiki/Vt#Decompose). Either way, we encourage that users prepare the input VCF _without the presence of multi-allelic sites_. * Variants used for reporting should be designated as `PASS` in the VCF FILTER column. Variants denoted with e.g. `Reject` as a FILTER value will not be subject to analysis in PCGR. For records with undefined values in the FILTER column (`'.'`), these will be considered as `PASS` variants. -#### Formatting of allelic depth/support (DP/AD) +#### Formatting of variant sequencing depth/allelic support (DP/AF) -The representation of variant genotype data (allelic depth and support in tumor vs. control sample) is usually formatted in the genotype fields of a VCF file, on a per-sample basis. However, considering the VCF output for the [numerous somatic SNV/InDel callers](https://www.biostars.org/p/19104/) that are in use, we have experienced a general lack of uniformity for how this information is encoded in the genotype fields. 
In order for PCGR to recognize this type of information robustly, we currently require that you encode this unambiguously in the INFO field of your VCF file. +The representation of variant genotype data (allelic depth and support in tumor vs. control sample) is usually formatted in the genotype fields of a VCF file, on a per-sample basis. However, considering the VCF output for the [numerous somatic SNV/InDel callers](https://www.biostars.org/p/19104/) that are in use, we have experienced a general lack of uniformity for how this information is encoded in the genotype fields. In order for PCGR to recognize this type of information robustly, we currently require that you as a user encode this unambiguously in the INFO field of your VCF file. Shown below is how the VCF header for these entries should look like in your input VCF, and how the corresponding variant data is encoded per record: @@ -55,8 +55,8 @@ As an effort to support the users with this procedure, we are hoping to establis #### Other notes regarding input VCF -* PCGR generates a number of VCF INFO annotation tags that are appended to the query VCF. We therefore encourage the users to submit query VCF files that have not been subject to annotations by other means, but rather a VCF file that comes directly from variant calling. If not, there are likely to be INFO tags in the query VCF file that coincide with those produced by PCGR. -* Note that you can preserve particular tags in the INFO field towards the TSV output of PCGR. Sometimes, it can be convenient to investigate particular properties of the variants (encoded in the VCF) against functional annotations (as provided by PCGR). To achieve this, use the option `--preserved_info_tags , etc`. +* PCGR generates a number of VCF INFO annotation tags that are appended to the query VCF. 
We therefore encourage the users to submit query VCF files that _have not_ been subject to annotations by other means, but rather a VCF file that comes directly from variant calling. If not, there are likely to be INFO tags in the query VCF file that coincide with those produced by PCGR. +* Note that you can preserve particular tags in the INFO field towards the TSV output of PCGR. Sometimes, it can be convenient to investigate particular properties of the variants (encoded in the VCF) against functional annotations (as provided by PCGR). To achieve this, use the option `--retained_info_tags ,` etc. ### Panel-of-normals VCF @@ -78,7 +78,7 @@ The PoN VCF file needs to contain the following. ### Copy number segments - allele-specific -The tab-separated values file with allele-specific copy number aberrations __MUST__ contain the following four columns: +A tab-separated values file with allele-specific copy number aberrations __MUST__ contain the following four columns: * `Chromosome` * `Start` @@ -98,9 +98,13 @@ Importantly, you can configure predefined thresholds for segments that are consi * `--n_copy_gain`: minimum (total) copy number for a segment to be considered an amplification, _default 6_ +### Germline variants + +The user can submit a file with germline variants processed and classified with [CPSR](https://sigven.github.io/cpsr), which caters for an integration of germline and somatic findings in the output report. This file corresponds to output file `.cpsr..classification.tsv.gz` from the CPSR pipeline. Make sure the genome assembly is the same as the one used for the somatic variant input files. + ### Gene expression -The user can submit a file with bulk gene/transcript expression data to PCGR, indicating the relative expression levels of genes in the query sample. These data will be integrated with the variant calls in the output report, and also subject to an RNA expression similarity analysis with other tumor samples. 
+The user can submit a file with bulk gene/transcript expression data to PCGR, indicating the relative expression levels of genes in the query sample. PCGR may conduct an expression outlier analysis (compared against other cohorts of samples), and also perform an RNA expression similarity analysis, i.e. correlation of expression profile with other tumor samples. The tab-separated values file with gene expression estimates __MUST__ contain the following two columns: diff --git a/pcgrr/vignettes/installation.Rmd b/pcgrr/vignettes/installation.Rmd index 0a2c6fe0..5cca9cba 100644 --- a/pcgrr/vignettes/installation.Rmd +++ b/pcgrr/vignettes/installation.Rmd @@ -16,7 +16,7 @@ require(glue, include.only = "glue") ```{r vars, echo=FALSE} Sys.setenv(VEP_VERSION = "112") Sys.setenv(PCGR_VERSION = "2.0.3") -Sys.setenv(BUNDLE_VERSION = "20240621") +Sys.setenv(BUNDLE_VERSION = "20240927") VEP_VERSION <- Sys.getenv("VEP_VERSION") PCGR_VERSION <- Sys.getenv("PCGR_VERSION") BUNDLE_VERSION <- Sys.getenv("BUNDLE_VERSION") @@ -46,7 +46,7 @@ need to match the chosen assembly. Reference bundles are generated semi-automatically (by the PCGR author) and are versioned based on their release date. Keep in mind that the bundles support only certain Ensembl VEP versions. The latest (**v`r BUNDLE_VERSION`**) genome-specific -bundles can be downloaded directly from below (size: ~5G): +bundles can be downloaded directly from below (size: ~4G): | Assembly | Download Link | |:---------|:--------------------------| @@ -212,6 +212,11 @@ docker container run -it --rm \ --vcf2maf ``` +**NOTE**: If you need to run the Docker-based version of PCGR as a non-root user, +you may need to explicitly add options for quarto to work properly, i.e. +`--env "XDG_CACHE_HOME=/tmp/quarto_cache_home"` (same as for Singularity/Apptainer below, see also [issue #246](https://github.com/sigven/pcgr/issues/246)). +

+ ### C. Singularity/Apptainer The PCGR Singularity/Apptainer image is available on [GitHub Container Registry](https://ghcr.io/sigven/pcgr). diff --git a/pcgrr/vignettes/output.Rmd b/pcgrr/vignettes/output.Rmd index 2bb40d6a..7aa984c0 100644 --- a/pcgrr/vignettes/output.Rmd +++ b/pcgrr/vignettes/output.Rmd @@ -26,8 +26,8 @@ by the user. The following sections may be included in the report: * Provides an overview of the somatic SNVs and InDels detected in the tumor sample * Includes a global distribution of allelic support, statistics with respect to variant types and consequences - * Variants are classified with respect to _oncogenicity_ (VICC/ClinGen/CGC operating procedures) - - permits exploration of somatic mutations through interactive filtering according to several dimensions (variant sequencing depth/support, variant consequence etc.) + * Variants are classified with respect to _oncogenicity_ (ClinGen/CGC/VICC standard operating procedures) + - also permits exploration of somatic mutations through interactive filtering according to several dimensions (variant sequencing depth/support, variant consequence etc.) * Variants are classified with respect to _actionability_ (AMP/ASCO/CAP guidelines) - individual evidence items linked to actionable variants can be explored, indicating strength of evidence, tumor type and therapeutic context, and clinical significance @@ -60,13 +60,7 @@ by the user. 
The following sections may be included in the report: #### Example reports -[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5045309.svg)](https://doi.org/10.5281/zenodo.5045309) - -The HTML reports have been tested using the following browsers: - -* Safari (Version 14.1.1 (16611.2.7.1.4)) -* Mozilla Firefox (83.0) -* Google Chrome (Version 90.0.4430.212 (Official Build) (x86\_64)) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.12752833.svg)](https://doi.org/10.5281/zenodo.12752833) ### SNVs/InDels @@ -91,6 +85,7 @@ A VCF file containing annotated, somatic calls (single nucleotide variants and i | `CDS_position` | Relative position of base pair in coding sequence (picked by VEP's `--flag_pick_allele` option) | | `CDS_RELATIVE_POSITION` | Ratio of variant coding position to length of coding sequence | | `CDS_CHANGE` | Coding, transcript-specific sequence annotation (picked by VEP's `--flag_pick_allele` option) | +| `ALTERATION` | HGVSp/HGVSc identifier | | `AMINO_ACID_START` | Protein position indicating absolute start of amino acid altered (fetched from `Protein_position`) | | `AMINO_ACID_END` | Protein position indicating absolute end of amino acid altered (fetched from `Protein_position`) | | `Protein_position` | Relative position of amino acid in protein (picked by VEP's `--flag_pick_allele` option) | @@ -107,6 +102,7 @@ A VCF file containing annotated, somatic calls (single nucleotide variants and i | `TREMBL` | Best match UniProtKB/TrEMBL accession of protein product (picked by VEP's `--flag_pick_allele` option) | | `UNIPARC` | Best match UniParc accession of protein product (picked by VEP's `--flag_pick_allele` option) | | `HGVSc` | The HGVS coding sequence name (picked by VEP's `--flag_pick_allele` option) | +| `HGVSc_RefSeq`| The HGVS coding sequence name using RefSeq transcript identifiers (MANE select - picked by VEP's `--flag_pick_allele` option) | | `HGVSp` | The HGVS protein sequence name (picked by VEP's `--flag_pick_allele` option) | | 
`HGVSp_short` | The HGVS protein sequence name, short version (picked by VEP's `--flag_pick_allele` option) | | `HGVS_OFFSET` | Indicates by how many bases the HGVS notations for this variant have been shifted (picked by VEP's `--flag_pick_allele` option) | @@ -149,15 +145,16 @@ A VCF file containing annotated, somatic calls (single nucleotide variants and i |-----|-------------| | `ENTREZGENE` | [Entrez](http://www.ncbi.nlm.nih.gov/gene) gene identifier | | `APPRIS` | Principal isoform flags according to the [APPRIS principal isoform database](http://appris.bioinfo.cnio.es/#/downloads) | -| `MANE_SELECT` | Indicating if the transcript is the MANE Select or MANE Plus Clinical transcript for the gene (picked by VEP's `--flag_pick_allele_gene` option) | +| `MANE_SELECT` | Indicating if the transcript is the MANE Select for the gene (picked by VEP's `--flag_pick_allele_gene` option) | +| `MANE_PLUS_CLINICAL` | Indicating if the transcript is MANE Plus Clinical, as required for clinical variant reporting (picked by VEP's `--flag_pick_allele_gene` option) | | `UNIPROT_ID` | [UniProt](http://www.uniprot.org) identifier | | `UNIPROT_ACC` | [UniProt](http://www.uniprot.org) accession(s) | | `ENSEMBL_GENE_ID` | Ensembl gene identifier for VEP's picked transcript (*ENSGXXXXXXX*) | | `ENSEMBL_TRANSCRIPT_ID` | Ensembl transcript identifier for VEP's picked transcript (*ENSTXXXXXX*) | | `ENSEMBL_PROTEIN_ID` | Ensembl corresponding protein identifier for VEP's picked transcript (*ENSPXXXXXX*) | | `REFSEQ_TRANSCRIPT_ID` | Corresponding RefSeq transcript(s) identifier for VEP's picked transcript (*NM_XXXXX*) | -| `TRANSCRIPT_MANE_SELECT` | MANE select transcript identifer: one high-quality representative transcript per protein-coding gene that is well-supported by experimental data and represents the biology of the gene | -| `TRANSCRIPT_MANE_PLUS_CLINICAL` | transcripts chosen to supplement MANE Select when needed for clinical variant reporting | +| `MANE_SELECT2` | MANE select 
transcript identifier: one high-quality representative transcript per protein-coding gene that is well-supported by experimental data and represents the biology of the gene - provided through BioMart | +| `MANE_PLUS_CLINICAL2` | transcripts chosen to supplement MANE Select when needed for clinical variant reporting - provided through BioMart | | `GENCODE_TAG` | tag for gencode transcript (basic etc) | | `GENCODE_TRANSCRIPT_TYPE` | type of transcript (protein-coding etc.) | | `TSG` | Flag indicating whether gene is predicted as a tumor suppressor gene, from Cancer Gene Census, Network of Cancer Genes (NCG) & the CancerMine text-mining resource | @@ -188,19 +185,27 @@ A VCF file containing annotated, somatic calls (single nucleotide variants and i | `MUTATION_HOTSPOT_CANCERTYPE` | hotspot-associated cancer types (from cancerhotspots.org) | | `PFAM_DOMAIN` | Pfam domain identifier (from VEP) | | `INTOGEN_DRIVER_MUT` | Indicates if existing variant is predicted as driver mutation from IntoGen Catalog of Driver Mutations | -| `EFFECT_PREDICTIONS` | All predictions of effect of variant on protein function and pre-mRNA splicing from [database of non-synonymous functional predictions - dbNSFP v4.4](https://sites.google.com/site/jpopgen/dbNSFP). 
Predicted effects are provided by different sources/algorithms (separated by `&`), `T` = Tolerated, `N` = Neutral, `D` = Damaging: 1.[SIFT](https://sift.bii.a-star.edu.sg/), 2.[MutationTaster](http://www.mutationtaster.org/) (data release Nov 2015), 3.[MutationAssessor](http://mutationassessor.org/) (release 3), 4.[FATHMM](http://fathmm.biocompute.org.uk) (v2.3), 5.[PROVEAN](http://provean.jcvi.org/index.php) (v1.1 Jan 2015), 6.[FATHMM\_MKL](http://fathmm.biocompute.org.uk/fathmmMKL.htm), 7.[PRIMATEAI](https://www.nature.com/articles/s41588-018-0167-z), 8.[DEOGEN2](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5570203/), 9.[DBNSFP\_CONSENSUS\_RNN](https://www.biorxiv.org/content/10.1101/2021.04.09.438706v1) (Ensembl/consensus prediction, based on deep learning), 10.[SPLICE\_SITE\_EFFECT\_ADA](http://nar.oxfordjournals.org/content/42/22/13534) (Ensembl/consensus prediction of splice-altering SNVs, based on adaptive boosting), 11.[SPLICE\_SITE\_EFFECT\_RF](http://nar.oxfordjournals.org/content/42/22/13534) (Ensembl/consensus prediction of splice-altering SNVs, based on random forest), 12.[M-CAP](http://bejerano.stanford.edu/MCAP), 13.[MutPred](http://mutpred.mutdb.org), 14.[GERP](http://mendel.stanford.edu/SidowLab/downloads/gerp/), 15.[BayesDel](https://doi.org/10.1002/humu.23158), 16.[LIST-S2](https://doi.org/10.1093/nar/gkaa288), 17.[ALoFT](https://www.nature.com/articles/s41467-017-00443-5) | +| `EFFECT_PREDICTIONS` | All predictions of effect of variant on protein function and pre-mRNA splicing from [database of non-synonymous functional predictions - dbNSFP v4.8/dbscSNV](https://sites.google.com/site/jpopgen/dbNSFP). 
Predicted effects are provided by different sources/algorithms (separated by `&`), `T` = Tolerated, `N` = Neutral, `D` = Damaging: 1.[SIFT](https://sift.bii.a-star.edu.sg/), 2.[MutationTaster](http://www.mutationtaster.org/) (data release Nov 2015), 3.[MutationAssessor](http://mutationassessor.org/) (release 3), 4.[FATHMM](http://fathmm.biocompute.org.uk) (v2.3), 5.[PROVEAN](http://provean.jcvi.org/index.php) (v1.1 Jan 2015), 6.[FATHMM\_MKL](http://fathmm.biocompute.org.uk/fathmmMKL.htm), 7.[PRIMATEAI](https://www.nature.com/articles/s41588-018-0167-z), 8.[DEOGEN2](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5570203/), 9.[DBNSFP\_CONSENSUS\_RNN](https://www.biorxiv.org/content/10.1101/2021.04.09.438706v1) (Ensembl/consensus prediction, based on deep learning), 10.[SPLICE\_SITE\_EFFECT\_ADA](http://nar.oxfordjournals.org/content/42/22/13534) (Ensembl/consensus prediction of splice-altering SNVs, based on adaptive boosting), 11.[SPLICE\_SITE\_EFFECT\_RF](http://nar.oxfordjournals.org/content/42/22/13534) (Ensembl/consensus prediction of splice-altering SNVs, based on random forest), 12.[M-CAP](http://bejerano.stanford.edu/MCAP), 13.[MutPred](http://mutpred.mutdb.org), 14.[GERP](http://mendel.stanford.edu/SidowLab/downloads/gerp/), 15.[BayesDel](https://doi.org/10.1002/humu.23158), 16.[LIST-S2](https://doi.org/10.1093/nar/gkaa288), 17.[ALoFT](https://www.nature.com/articles/s41467-017-00443-5), +18.[AlphaMissense](https://console.cloud.google.com/storage/browser/dm_alphamissense), +19.[ESM1b](https://huggingface.co/spaces/ntranoslab/esm_variants/tree/main), +20.[PHACTboost](https://github.com/CompGenomeLab/PHACTboost), +21.[MutFormer](https://github.com/WGLab/mutformer)| | `DBNSFP_BAYESDEL_ADDAF` | predicted effect from BayesDel (dbNSFP) | | `DBNSFP_LIST_S2` | predicted effect from LIST-S2 (dbNSFP) | | `DBNSFP_SIFT` | predicted effect from SIFT (dbNSFP) | | `DBNSFP_PROVEAN` | predicted effect from PROVEAN (dbNSFP) | -| `DBNSFP_MUTATIONTASTER` | predicted effect from 
MUTATIONTASTER (dbNSFP) | -| `DBNSFP_MUTATIONASSESSOR` | predicted effect from MUTATIONASSESSOR (dbNSFP) | +| `DBNSFP_MUTATIONTASTER` | predicted effect from MutationTaster (dbNSFP) | +| `DBNSFP_MUTATIONASSESSOR` | predicted effect from MutationAssessor (dbNSFP) | | `DBNSFP_M_CAP` | predicted effect from M-CAP (dbNSFP) | | `DBNSFP_ALOFTPRED` | predicted effect from ALoFT (dbNSFP) | -| `DBNSFP_MUTPRED` | score from MUTPRED (dbNSFP) | +| `DBNSFP_MUTPRED` | score from MutPred (dbNSFP) | | `DBNSFP_FATHMM` | predicted effect from FATHMM (dbNSFP) | -| `DBNSFP_PRIMATEAI` | predicted effect from PRIMATEAI (dbNSFP) | -| `DBNSFP_DEOGEN2` | predicted effect from DEOGEN2 (dbNSFP) | +| `DBNSFP_PRIMATEAI` | predicted effect from PrimateAI (dbNSFP) | +| `DBNSFP_DEOGEN2` | predicted effect from deogen2 (dbNSFP) | +| `DBNSFP_PHACTBOOST` | predicted effect from PHACTboost (dbNSFP) | +| `DBNSFP_ALPHA_MISSENSE` | predicted effect from AlphaMissense (dbNSFP) | +| `DBNSFP_MUTFORMER` | predicted effect from MutFormer (dbNSFP) | +| `DBNSFP_ESM1B` | predicted effect from ESM1b (dbNSFP) | | `DBNSFP_GERP` | evolutionary constraint measure from GERP (dbNSFP) | | `DBNSFP_FATHMM_MKL` | predicted effect from FATHMM-mkl (dbNSFP) | | `DBNSFP_META_RNN` | predicted effect from ensemble prediction (deep learning - dbNSFP) | @@ -240,15 +245,16 @@ A VCF file containing annotated, somatic calls (single nucleotide variants and i | `CLINVAR_MEDGEN_CUI_SOMATIC` | Associated [MedGen](https://www.ncbi.nlm.nih.gov/medgen/) concept identifiers (_CUIs_) - somatic state-of-origin | | `CLINVAR_VARIANT_ORIGIN` | Origin of variant (somatic, germline, de novo etc.) 
for variant in [ClinVar](http://www.ncbi.nlm.nih.gov/clinvar) | | `CLINVAR_REVIEW_STATUS_STARS` | Rating of the [ClinVar](http://www.ncbi.nlm.nih.gov/clinvar) variant (0-4 stars) with respect to level of review | +| `CLINVAR_KNOWN_ONCOGENIC` | Variant matches with known oncogenic variants in [ClinVar](http://www.ncbi.nlm.nih.gov/clinvar), through ClinGen/CGC/VICC SOP. Format: ||||. Example: NRAS|p.Q61R|c.182A>G|Oncogenic|by_genomic_coord&by_hgvsc_principal&by_hgvsp_principal | ##### _Other_ | Tag | Description | |-----|-------------| | `BIOMARKER_MATCH` | Variant matches with biomarker evidence in CIViC/CGI. Format: \|\|::::\|. Multiple evidence items are separated by '&'. Example: civic|174|EID445:Colon/Rectum:Sensitivity/Response:D:Predictive:Somatic&EID446:Colon/Rectum:Sensitivity/Response:D:Predictive:Somatic|by_gene_mut. Matching type can be any of `by_genomic_coord`, `by_hgvsp_principal`, `by_hgvsc_principal`, `by_hgvsp_nonprincipal`, `by_hgvsc_nonprincipal`, `by_codon_principal`, `by_exon_mut_principal`, `by_gene_mut_lof`, `by_gene_mut` | -| `ONCOGENICITY` | Oncogenicity annotation - VICC/ClinGen SOP implementation | -| `ONCOGENICITY_CODE` | Oncogenicity code - VICC/ClinGen SOP implementation | -| `ONCOGENICITY_SCORE` | Oncogenicity score - VICC/ClinGen SOP implementation | +| `ONCOGENICITY` | Oncogenicity annotation - ClinGen/CGC/VICC SOP implementation | +| `ONCOGENICITY_CODE` | Oncogenicity code - ClinGen/CGC/VICC SOP implementation | +| `ONCOGENICITY_SCORE` | Oncogenicity score - ClinGen/CGC/VICC SOP implementation | #### 2. Tab-separated values (TSV) @@ -268,61 +274,71 @@ The following variables are included in the TSV file (VCF tags issued by the use | 6. `ENTREZGENE` | Entrez gene identifier | | 7. `ENSEMBL_GENE_ID` | Ensembl gene identifier | | 8. `GENENAME` | Gene name | -| 9. `PROTEIN_CHANGE` | Protein change | -| 10. `CONSEQUENCE` | Variant consequence - from VEP | -| 11. `PFAM_DOMAIN_NAME` | Pfam domain name | -| 12. 
`LOSS_OF_FUNCTION` | Loss of function flag | -| 13. `CDS_CHANGE` | Coding sequence change | -| 14. `CODING_STATUS` | Coding status - flag indicating if consequence is protein-altering/affecting splice sites| -| 15. `EXONIC_STATUS` | Exonic status - flag indicating if consequence is silent/protein-altering/affecting splice sites| -| 16. `DP_TUMOR` | Depth of coverage at variant position in tumor sample | -| 17. `VAF_TUMOR` | Variant allele fraction at variant position in tumor sample | -| 18. `DP_CONTROL` | Depth of coverage at variant position in control sample | -| 19. `VAF_CONTROL` | Variant allele fraction at variant position in control sample | -| 20. `MUTATION_HOTSPOT` | Mutation hotspot annotation | -| 21. `MUTATION_HOTSPOT_CANCERTYPE` | Mutation hotspot-associated cancer types (from cancerhotspots.org) | -| 22. `ACTIONABILITY_TIER` | Actionability tier - AMP/ASCO/CAP implementation | -| 23. `ACTIONABILITY` | Actionability annotation - AMP/ASCO/CAP implementation | -| 24. `ACTIONABILITY_FRAMEWORK` | Actionability framework - AMP/ASCO/CAP implementation | -| 25. `ONCOGENICITY` | Oncogenicity annotation - VICC/ClinGen/CGC SOP implementation | -| 26. `ONCOGENICITY_CODE` | Oncogenicity code - VICC/ClinGen/CGC SOP implementation | -| 27. `ONCOGENICITY_SCORE` | Oncogenicity score - VICC/ClinGen/CGC SOP implementation | -| 28. `HGVSc` | HGVS coding sequence name | -| 29. `HGVSp` | HGVS protein sequence name | -| 30. `CANONICAL` | Flag indicating if transcript is canonical | -| 31. `CCDS` | CCDS identifier | -| 32. `UNIPROT_ACC` | UniProt accession | -| 33. `ENSEMBL_TRANSCRIPT_ID` | Ensembl transcript identifier | -| 34. `ENSEMBL_PROTEIN_ID` | Ensembl protein identifier | -| 35. `REFSEQ_TRANSCRIPT_ID` | RefSeq transcript identifier | -| 36. `REFSEQ_PROTEIN_ID` | RefSeq protein identifier | -| 37. `TRANSCRIPT_MANE_SELECT` | MANE transcript select | -| 38. `TRANSCRIPT_MANE_PLUS_CLINICAL` | MANE transcript plus clinical | -| 39. 
`CGC_TIER` | Cancer Gene Census tier | -| 40. `CGC_GERMLINE` | Cancer Gene Census germline annotation | -| 41. `CGC_SOMATIC` | Cancer Gene Census somatic annotation | -| 42. `ONCOGENE` | Flag indicating if gene is oncogene (CGC/CancerMine/NCG) | -| 43. `ONCOGENE_SUPPORT` | Oncogene annotation support (CGC/CancerMine/NCG) | -| 44. `TUMOR_SUPPRESSOR` | Flag indicating if gene is tumor suppressor (CGC/CancerMine/NCG) | -| 45. `TUMOR_SUPPRESSOR_SUPPORT` | Tumor suppressor annotation support (CGC/CancerMine/NCG) | -| 46. `TARGETED_INHIBITORS2` | Targeted inhibitors | -| 47. `EFFECT_PREDICTIONS` | Variant effect predictions - from dbNSFP | -| 48. `REGULATORY_ANNOTATION` | Regulatory annotation | -| 49. `VEP_ALL_CSQ` | VEP consequence - all transcripts | -| 50. `gnomADe_AF` | gnomAD exomes allele frequency - globally| -| 51. `DBSNP_RSID` | dbSNP identifier | -| 52. `COSMIC_ID` | COSMIC identifier | -| 53. `TCGA_FREQUENCY` | Frequency of variant across TCGA tumor types. Format: `tumortype | percent affected | affected cases | total cases` | -| 54. `TCGA_PANCANCER_COUNT` | Raw variant count across all TCGA tumor types | -| 55. `CLINVAR_MSID` | ClinVar MedGen identifier | -| 56. `CLINVAR_CLASSIFICATION` | ClinVar variant classification | -| 57. `CLINVAR_VARIANT_ORIGIN` | ClinVar variant origin | -| 58. `CLINVAR_NUM_SUBMITTERS` | ClinVar number of submitters | -| 59. `CLINVAR_REVIEW_STATUS_STARS` | ClinVar number of review status stars | -| 60. `CLINVAR_CONFLICTED` | ClinVar variant classification is conflicted | -| 61. `BIOMARKER_MATCH` | Biomarker match | -| 62. `CALL_CONFIDENCE` | Call confidence | - +| 9. `ALTERATION` | Combined HGVSp/HGVSc annotation | +| 10. `PROTEIN_CHANGE` | Protein change | +| 11. `CONSEQUENCE` | Variant consequence - from VEP | +| 12. `PFAM_DOMAIN_NAME` | Pfam domain name | +| 13. `LOSS_OF_FUNCTION` | Loss of function flag | +| 14. `LOF_FILTER` | Loss of function filter | +| 15. `CDS_CHANGE` | Coding sequence change | +| 16. 
`CODING_STATUS` | Coding status - flag indicating if consequence is protein-altering/affecting splice sites| +| 17. `EXONIC_STATUS` | Exonic status - flag indicating if consequence is silent/protein-altering/affecting splice sites| +| 18. `DP_TUMOR` | Depth of coverage at variant position in tumor sample | +| 19. `VAF_TUMOR` | Variant allele fraction at variant position in tumor sample | +| 20. `DP_CONTROL` | Depth of coverage at variant position in control sample | +| 21. `VAF_CONTROL` | Variant allele fraction at variant position in control sample | +| 22. `MUTATION_HOTSPOT` | Mutation hotspot annotation | +| 23. `MUTATION_HOTSPOT_CANCERTYPE` | Mutation hotspot-associated cancer types (from cancerhotspots.org) | +| 24. `ACTIONABILITY_TIER` | Actionability tier - AMP/ASCO/CAP implementation | +| 25. `ACTIONABILITY` | Actionability annotation - AMP/ASCO/CAP implementation | +| 26. `ACTIONABILITY_FRAMEWORK` | Actionability framework - AMP/ASCO/CAP implementation | +| 27. `ONCOGENICITY` | Oncogenicity annotation - ClinGen/CGC/VICC SOP implementation | +| 28. `ONCOGENICITY_CODE` | Oncogenicity code - ClinGen/CGC/VICC SOP implementation | +| 29. `ONCOGENICITY_SCORE` | Oncogenicity score - ClinGen/CGC/VICC SOP implementation | +| 30. `HGVSc` | HGVS coding sequence name | +| 31. `HGVSc_RefSeq` | HGVS coding sequence name (RefSeq) | +| 32. `HGVSp` | HGVS protein sequence name | +| 33. `CANONICAL` | Flag indicating if transcript is canonical | +| 34. `CCDS` | CCDS identifier | +| 35. `UNIPROT_ACC` | UniProt accession | +| 36. `ENSEMBL_TRANSCRIPT_ID` | Ensembl transcript identifier | +| 37. `ENSEMBL_PROTEIN_ID` | Ensembl protein identifier | +| 38. `REFSEQ_TRANSCRIPT_ID` | RefSeq transcript identifier | +| 39. `REFSEQ_PROTEIN_ID` | RefSeq protein identifier | +| 40. `MANE_SELECT` | MANE transcript select | +| 41. `MANE_PLUS_CLINICAL` | MANE transcript plus clinical | +| 42. `CGC_TIER` | Cancer Gene Census tier | +| 43. 
`CGC_GERMLINE` | Cancer Gene Census germline annotation | +| 44. `CGC_SOMATIC` | Cancer Gene Census somatic annotation | +| 45. `ONCOGENE` | Flag indicating if gene is oncogene (CGC/CancerMine/NCG) | +| 46. `ONCOGENE_SUPPORT` | Oncogene annotation support (CGC/CancerMine/NCG) | +| 47. `TUMOR_SUPPRESSOR` | Flag indicating if gene is tumor suppressor (CGC/CancerMine/NCG) | +| 48. `TUMOR_SUPPRESSOR_SUPPORT` | Tumor suppressor annotation support (CGC/CancerMine/NCG) | +| 49. `TARGETED_INHIBITORS2` | Targeted inhibitors | +| 50. `EFFECT_PREDICTIONS` | Variant effect predictions - from dbNSFP | +| 51. `REGULATORY_ANNOTATION` | Regulatory annotation | +| 52. `VEP_ALL_CSQ` | VEP consequence - all transcripts | +| 53. `gnomADe_AF` | gnomAD exomes allele frequency - globally| +| 54. `DBSNP_RSID` | dbSNP identifier | +| 55. `COSMIC_ID` | COSMIC identifier | +| 56. `TCGA_FREQUENCY` | Frequency of variant across TCGA tumor types. Format: `tumortype | percent affected | affected cases | total cases` | +| 57. `TCGA_PANCANCER_COUNT` | Raw variant count across all TCGA tumor types | +| 58. `CLINVAR_MSID` | ClinVar MedGen identifier | +| 59. `CLINVAR_CLASSIFICATION` | ClinVar variant classification | +| 60. `CLINVAR_VARIANT_ORIGIN` | ClinVar variant origin | +| 61. `CLINVAR_NUM_SUBMITTERS` | ClinVar number of submitters | +| 62. `CLINVAR_REVIEW_STATUS_STARS` | ClinVar number of review status stars | +| 63. `CLINVAR_CONFLICTED` | ClinVar variant classification is conflicted | +| 64. `BIOMARKER_MATCH` | Biomarker match | +| 65. `CALL_CONFIDENCE` | Call confidence | + +For tumor-only runs, we provide a similarly formatted tab-separated values file that includes +both filtered (i.e. likely germline events) and unfiltered (deemed somatic) variants. The file has the following naming convention: + +- `.pcgr..snv_indel_filtered.ann.tsv.gz` + +In this TSV file, an additional column `SOMATIC_CLASSIFICATION` indicates for each variant if it is +classified as somatic or germline. 
### Tumor mutational burden (TSV) @@ -375,13 +391,14 @@ The format of the TSV file is the following: #### 1. Tab-separated values (TSV) -Copy number segments are intersected with the genomic coordinates of all transcripts from [GENCODE's basic gene annotation](https://www.gencodegenes.org/releases/current.html). In addition, PCGR attaches cancer-relevant annotations for the affected transcripts. The naming convention of the compressed TSV file is as follows: +Copy number segments are intersected with the genomic coordinates of all transcripts from [GENCODE's basic gene annotation](https://www.gencodegenes.org/releases/current.html). In addition, PCGR attaches cancer-relevant annotations for the affected transcripts. The naming conventions of the compressed TSV files are as follows: +- `.pcgr..cna_segment.tsv.gz` + - segment level information only - `.pcgr..cna_gene_ann.tsv.gz` - - __NOTE__: This file is organized according to the _affected transcripts_ (i.e. one line/record per affected transcript). - -The format of the compressed TSV file is the following: + - This file is organized according to the _affected transcripts_ (i.e. one line/record per affected transcript; segments not overlapping with any transcripts will thus not be included in this file). +The format of the compressed `cna_gene_ann.tsv.gz` is the following: | Variable | Description | |----------|-------------| @@ -417,7 +434,7 @@ If users provide bulk RNA-seq expression data as input, PCGR will attach basic g * `.pcgr..expression_similarity.tsv.gz` - __NOTE__: This file is organized according to the _samples_ of other gene expression cohorts (i.e. similarity level, one line/record per sample). * `.pcgr..expression_outliers.tsv.gz` - - __NOTE__: This file is organized according to the _genes_ which are considered outliers with respect to the distribution of expression levels found in reference cohorts (one line/record per affected outlier/gene). 
+ - __NOTE__: This file is organized according to how the expression levels of _genes/transcripts_ compare to the distribution of expression levels found in reference cohorts. This file contains various statistics in this respect (e.g. z-scores, IQR, Q1, Q2, Q3, percentile etc), enabling the detection of expression outliers. ### Excel workbook (XLSX) diff --git a/pcgrr/vignettes/running.Rmd b/pcgrr/vignettes/running.Rmd index bed9cb36..3b484ac2 100644 --- a/pcgrr/vignettes/running.Rmd +++ b/pcgrr/vignettes/running.Rmd @@ -25,7 +25,7 @@ Ideally, this should reflect the _callable_ target size of the assay, i.e. the s By default, PCGR expects that the input VCF contains somatic variants identified from a tumor-control sequencing setup. This implies that the VCF contains information with respect to variant allelic depth/support both for the tumor sample and the corresponding control sample. -If the input VCF comes from a __tumor-only__ assay, turn on the `--tumor_only` option. In this mode, PCGR conducts a set of successive filtering steps on the raw input set of variants, aiming to exclude the majority of germline variants from the tumor-only input set. In addition to default filtering applied against variants found in the gnomAD database (population-specific allele frequency thresholds can be configured, see below), additional filtering procedures can be explicitly set, i.e.: +If the input VCF comes from a __tumor-only__ assay, turn on the `--tumor_only` option. In this mode, PCGR conducts a set of successive filtering steps on the raw input set of variants, aiming to exclude the majority of germline variants from the tumor-only input set.
In addition to default filtering applied against variants found in the gnomAD database (population-specific minor allele frequency thresholds can be configured, see below), additional filtering procedures can be explicitly set, i.e.: * `--exclude_dbsnp_nonsomatic` * `--exclude_likely_het_germline` @@ -76,7 +76,7 @@ If these tags are set correctly, one may set thresholds (sequencing depth and/or - `--control_dp_min ` - `--control_af_max ` -#### Tumor mutational burden (TMB) +### Tumor mutational burden (TMB) If tags for allelic support is provided in the VCF and configured by the user, users can configure the TMB calculation by setting minimum requirements for sequencing coverage and allelic fraction, i.e.: @@ -89,11 +89,16 @@ For input VCF files with > 500,000 variants, note that these will be subject to If you have a large input VCF, and have sufficient memory capacity on your compute platform, we also recommend to increase the [VEP buffer size](https://www.ensembl.org/info/docs/tools/vep/script/vep_options.html#cacheopt) (option `--vep_buffer_size`), as this will speed up the VEP processing significantly. +### Configuration of output files + +If you only want the Excel/TSV output of PCGR (with all variant classifications and auxiliary analyses, e.g. MSI classifications, TMB estimates, mutational signatures predictions), you may turn off the HTML output by using the `--no_html` option. This will speed up the analysis, and could be a useful option if you foresee that the sample input datasets are too large for the HTML report generation to work properly. + +If you only want the annotated VCF (also converted to TSV), use the option `--no_reporting`. This will skip the final steps of the PCGR workflow, and only generate the annotated VCF and TSV files.
+ ## All options A tumor sample report is generated by running the __pcgr__ command, which takes the following arguments and options: - ```text usage: pcgr -h [options] @@ -226,7 +231,8 @@ VEP options: - set lower to reduce memory usage, default: 500 --vep_pick_order VEP_PICK_ORDER Comma-separated string of ordered transcript/variant properties for selection of primary variant consequence - (option '--pick_order' in VEP), default: mane_select,mane_plus_clinical,canonical,appris,tsl,biotype,ccds,rank,length + (option '--pick_order' in VEP), default: + mane_select,mane_plus_clinical,canonical,biotype,ccds,rank,tsl,appris,length --vep_no_intergenic Skip intergenic variants during variant annotation (VEP option '--no_intergenic' in VEP), default: False --vep_regulatory Add VEP regulatory annotations (VEP option '--regulatory') or non-coding interpretation, default: False --vep_gencode_basic Consider basic GENCODE transcript set only with Variant Effect Predictor (VEP) (VEP option '--gencode_basic'). @@ -268,6 +274,13 @@ Bulk RNA-seq and RNA fusion data options: --expression_sim_db EXPRESSION_SIM_DB Comma-separated string of databases for used in RNA expression similarity analysis, default: tcga,depmap,treehouse +Germline variant options: + --input_cpsr INPUT_CPSR + CPSR-classified germline calls (file '.cpsr..classification.tsv.gz') + --input_cpsr_yaml INPUT_CPSR_YAML + CPSR YAML configuration file (file '.cpsr..conf.yaml') + --cpsr_ignore_vus Do not show variants of uncertain significance (VUS) in the germline section of the HTML report (default: False) + Other options: --vcf2maf Generate a MAF file for input VCF using https://github.com/mskcc/vcf2maf (default: False) --vcfanno_n_proc VCFANNO_N_PROC @@ -278,6 +291,7 @@ Other options: --force_overwrite By default, the script will fail with an error if any output file already exists. 
You can force the overwrite of existing result files by using this flag, default: False --version show program's version number and exit --no_reporting Run functional variant annotation on VCF through VEP/vcfanno, omit other analyses (i.e. Tier assignment/MSI/TMB/Signatures etc. and report generation (STEP 4), default: False + --no_html Do not generate HTML report (default: False) --debug Print full commands to log --pcgrr_conda PCGRR_CONDA pcgrr conda env name (default: pcgrr) diff --git a/pcgrr/vignettes/variant_classification.Rmd b/pcgrr/vignettes/variant_classification.Rmd index 80f4fa85..d180e04a 100644 --- a/pcgrr/vignettes/variant_classification.Rmd +++ b/pcgrr/vignettes/variant_classification.Rmd @@ -7,37 +7,39 @@ output: rmarkdown::html_document PCGR evaluates somatic aberrations (SNV/InDels) for oncogenic potential through an implementation of standard operating procedures proposed by [VICC/CGC/ClinGen](https://pubmed.ncbi.nlm.nih.gov/35101336/). Here, various properties of the variants and genes affected are assigned criteria-specific scores, both negative and positive, pending on whether the properties support an oncogenic or benign variant nature. Criteria-specific scores are in turn aggregated towards an overall oncogenicity score per variant. -Note that all properties/criteria provided in the SOP's are _not_ readily implemented in PCGR, specifically the ones requiring manual curation or expert review (i.e. experimental oncogenic variant evidence, requring support from _in vitro_ or _in vivo_ functional studies (criteria _OM1/OS1_)). This implies that some variants interrogated by PCGR may not be classified as oncogenic, even though they could be classified as such with more functional evidence available. Considering the nature of our current implementation, we have thus also adopted slightly different score thresholds for variant classifications to those proposed originally by [Horak et al., 2022](https://pubmed.ncbi.nlm.nih.gov/35101336/). 
We are working to further improve the oncogenicity classification in PCGR, and welcome feedback on this matter. +Note that all properties/criteria provided in the SOP's are _not_ readily implemented in PCGR, specifically the ones requiring manual curation or expert review (i.e. experimental oncogenic variant evidence, requiring support from _in vitro_ or _in vivo_ functional studies (criteria _OM1/OS1_)). This implies that some variants interrogated by PCGR may not be classified as oncogenic, even though they could be classified as such with more functional evidence available. Considering the nature of our current implementation, we have thus also adopted slightly different score thresholds for variant classifications to those proposed originally by [Horak et al., 2022](https://pubmed.ncbi.nlm.nih.gov/35101336/). We are working to further improve the oncogenicity classification in PCGR, and welcome feedback on this matter. Note also that for somatic copy number aberrations, we showcase potential oncogenic events as **proto-oncogenes subject to amplifications** (where level of amplification is configurable by the user), as well as**tumor suppressor genes subject to homozygous deletions**. 
The following criteria are currently used for oncogenicity classification in PCGR: - * _CLINGEN_VICC_SBVS1_ - Very high MAF (> 0.05 in gnomAD - any five major continental pops) - * _CLINGEN_VICC_SBS1_ - High MAF (> 0.01 in gnomAD - any five major continental pops) - * _CLINGEN_VICC_SBP1_ - Insilico support a benign effect on the gene or gene product (multiple lines of evidence (>= 7 algorithms) from dbNSFP) + * _CLINGEN_VICC_SBVS1_ - Very high MAF (> 0.05 in gnomAD - any five major continental populations) + * _CLINGEN_VICC_SBS1_ - High MAF (> 0.01 in gnomAD - any five major continental populations) + * _CLINGEN_VICC_SBP1_ - _In silico_ support for a benign effect on the gene or gene product (multiple lines of evidence (>= 8 algorithms) from dbNSFP) * _CLINGEN_VICC_SBP2_ - Silent and intronic changes outside of the consensus splice site (VEP consequence) - * _CLINGEN_VICC_OVS1_ - Null variant - in bona fide tumor suppressor gene (predicted as LoF in tumor suppressors from CGC/NCG/CancerMine) + * _CLINGEN_VICC_OVS1_ - Null variant - in a _bona fide_ tumor suppressor gene (predicted as LoF in tumor suppressors from CGC/NCG/CancerMine) + * _CLINGEN_VICC_OS1_ - Same amino acid change as previously established oncogenic variant (ClinVar) * _CLINGEN_VICC_OS3_ - Located in a mutation hotspot, >= 50 samples with a variant at amino acid position, >= 10 samples with same amino acid change (cancerhotspots.org) - * _CLINGEN_VICC_OM1_ - Located in a presumably critical site of functional domain - here, this is implemented through indirect evidence from overlap with known oncogenic and predictive (actionable) biomarkers + * _CLINGEN_VICC_OM1_ - Located in a presumably critical site of functional domain - here, this is implemented through indirect evidence from overlap with known predictive (drug sensitivity/resistance) loci * _CLINGEN_VICC_OM2_ - Protein length changes from in-frame dels/ins in known oncogene/tumor suppressor genes or stop-loss variants in a tumor suppressor gene
(tumor suppressors/oncogenes from CGC/NCG/CancerMine) * _CLINGEN_VICC_OM3_ - Located in a mutation hotspot, < 50 samples with a variant at amino acid position, >= 10 samples with same amino acid change (cancerhotspots.org) - * _CLINGEN_VICC_OP1_ - Insilico support a damaging effect on the gene or gene product (multiple lines of evidence (>= 7 algorithms) from dbNSFP) + * _CLINGEN_VICC_OP1_ - _In silico_ support for a damaging effect on the gene or gene product (multiple lines of evidence (>= 8 algorithms) from dbNSFP) * _CLINGEN_VICC_OP3_ - Located in a mutation hotspot, < 10 samples with the same amino acid change (cancerhotspots.org) * _CLINGEN_VICC_OP4_ - Absent from controls (gnomAD) / very low MAF ( < 0.0001 in all five major subpopulations) ## Actionability -PCGR prioritizes and evaluates variants according to clinical actionability. Currently, PCGR implements its tier classification framework along the proposed AMP/ASCO/CAP guidelines, as detailed below. +PCGR prioritizes and evaluates variants according to clinical actionability. Currently, PCGR implements its tier classification framework along the proposed AMP/ASCO/CAP guidelines, as outlined in [Li et al., 2017](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5707196/): -This tier model attempts to adopt concensus recommendations by AMP/ASCO/CAP, as outlined in [Li et al., 2017](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5707196/): +- **Tier I: Variants of strong clinical significance** - constitutes aberrations linked to predictive, prognostic, or diagnostic biomarkers in the [CIViC database](https://civicdb.org) and the [Cancer Biomarkers Database](https://www.cancergenomeinterpreter.org/biomarkers) that are + - Found within the same tumor type/class as specified by the user, **AND** + - Of strong clinical evidence (i.e.
approved therapies, part of guidelines, validated or discovered in late clinical trials ([CIViC evidence levels A/B](https://civic.readthedocs.io/en/latest/model/evidence/level.html))) + + - **Tier II: Variants of potential clinical significance** - constitutes other aberrations linked to predictive, prognostic, or diagnostic biomarkers in the [CIViC database](https://civicdb.org) and the [Cancer Biomarkers Database](https://www.cancergenomeinterpreter.org/biomarkers) that are either + - Of strong clinical evidence in other tumor types/classes than the one specified by the user, **OR** + - Of weak clinical evidence (early trials, case reports etc. ([CIViC evidence levels C/D/E](https://civic.readthedocs.io/en/latest/model/evidence/level.html))) in the same tumor type/class as specified by the user + + - **Tier III: Variants of uncertain clinical significance (SNVs/InDels only)** - + - Other coding variants, not observed at significant allele frequencies (gnomAD MAF < 0.001), found in oncogenes or tumor suppressor genes, yet _not_ linked to any known predictive, prognostic, or diagnostic biomarkers in the [CIViC database](https://civicdb.org) and the [Cancer Biomarkers Database](https://www.cancergenomeinterpreter.org/biomarkers) - - *Tier I: Variants of strong clinical significance* - constitutes variants linked to predictive, prognostic, or diagnostic evidence items in the [CIViC database](https://civicdb.org) or the [Cancer Biomarkers Database](https://www.cancergenomeinterpreter.org/biomarkers) that are - - A: found within the same tumor type/class as specified by the user, AND - - B: of strong clinical evidence (i.e.
part of guidelines, validated or discovered in late clinical trials) - - *Tier II: Variants of potential clinical significance* - constitutes other variants linked to predictive, prognostic, or diagnostic evidence items in the [CIViC database](https://civicdb.org) or the [Cancer Biomarkers Database](https://www.cancergenomeinterpreter.org/biomarkers) that are either - - A: of strong clinical evidence in other tumor types/classes than the one specified by the user, OR - - B: of weak clinical evidence (early trials, case reports etc.) in the same tumor type/class as specified by the user - - *Tier III: Variants of uncertain clinical significance* - includes other coding variants found in proto-oncogenes or tumor suppressor genes (SNVs and indels only) - -In PCGR, we skip the classification of variants into the AMP/ASCO/CAP-specified *Tier IV* (benign/likely benign variants), but rather take a more cautious approach. Specifically, for SNVs/indels that do not fall into tier I, II, or III, we classify them into *Tier V: Other coding variants*, which includes other protein-coding variants, as well as *Tier VI: Other non-coding variants*, which includes synonymous variants, intronic variants, and other variants in non-coding regions. +In PCGR, we skip the classification of variants into the AMP/ASCO/CAP-specified *Tier IV* (benign/likely benign variants), but rather take a more cautious approach. Specifically, for SNVs/indels that do not fall into tier I, II, or III, we classify them into *Tier V: Other coding variants*, which includes protein-coding variants in non-cancer related genes, as well as *Tier VI: Other non-coding variants*, which includes synonymous variants, intronic variants, and other variants in non-coding regions. 
diff --git a/scripts/cpsr.R b/scripts/cpsr.R index aac37195..b9a30a00 100755 --- a/scripts/cpsr.R +++ b/scripts/cpsr.R @@ -39,7 +39,14 @@ if(!is.null(cps_report)){ cpsr::write_cpsr_output( cps_report, output_format = 'xlsx') - cpsr::write_cpsr_output( - cps_report, - output_format = 'html') + + if(cps_report$settings$conf$other$no_html == FALSE){ + cpsr::write_cpsr_output( + cps_report, + output_format = 'html') + } + else{ + pcgrr::log4r_info("Skipping HTML report generation (option '--no_html' set to TRUE)") + } + } diff --git a/scripts/pcgr_summarise.py b/scripts/pcgr_summarise.py index c18ee854..b719ebfc 100755 --- a/scripts/pcgr_summarise.py +++ b/scripts/pcgr_summarise.py @@ -11,7 +11,7 @@ from pcgr.annoutils import read_infotag_file, make_transcript_xref_map, read_genexref_namemap, map_regulatory_variant_annotations, write_pass_vcf from pcgr.vep import parse_vep_csq from pcgr.dbnsfp import vep_dbnsfp_meta_vcf, map_variant_effect_predictors -from pcgr.oncogenicity import assign_oncogenicity_evidence +from pcgr.oncogenicity import assign_oncogenicity_evidence, load_oncogenic_variants, match_oncogenic_variants from pcgr.mutation_hotspot import load_mutation_hotspots, match_csq_mutation_hotspot from pcgr.biomarker import load_biomarkers, match_csq_biomarker from pcgr.utils import error_message, check_subprocess, getlogger @@ -27,6 +27,7 @@ def __main__(): parser.add_argument('regulatory_annotation',default=0,type=int,help='Inclusion of VEP regulatory annotations (0/1)') parser.add_argument('oncogenicity_annotation',default=0,type=int,help='Include oncogenicity annotation (0/1)') parser.add_argument('tumortype', default='Any', help='Primary tumor type of query VCF') + parser.add_argument('build', default='GRCh38', help='Genome build of query VCF') parser.add_argument('vep_pick_order', default="mane,canonical,appris,biotype,ccds,rank,tsl,length", help=f"Comma-separated string of ordered transcript/variant properties for selection of primary variant consequence") 
parser.add_argument('refdata_assembly_dir',help='Assembly-specific reference data directory, e.g. "pcgrdb/data/grch38') @@ -95,8 +96,11 @@ def extend_vcf_annotations(arg_dict, logger): gene_transcript_xref_map = read_genexref_namemap( os.path.join(arg_dict['refdata_assembly_dir'], 'gene','tsv','gene_transcript_xref', 'gene_transcript_xref_bedmap.tsv.gz'), logger) + #print(gene_transcript_xref_map) cancer_hotspots = load_mutation_hotspots( os.path.join(arg_dict['refdata_assembly_dir'], 'misc','tsv','hotspot', 'hotspot.tsv.gz'), logger) + oncogenic_variants = {} + oncogenic_variants = load_oncogenic_variants(os.path.join(arg_dict['refdata_assembly_dir'], 'variant','tsv', 'clinvar','clinvar_oncogenic.tsv.gz'), logger) biomarkers = {} for db in ['cgi','civic']: @@ -183,7 +187,8 @@ def extend_vcf_annotations(arg_dict, logger): if arg_dict['cpsr'] is True: vep_csq_record_results = \ parse_vep_csq(rec, transcript_xref_map, vep_csq_fields_map, arg_dict['vep_pick_order'], - logger, pick_only = False, csq_identifier = 'CSQ', + logger, pick_only = False, csq_identifier = 'CSQ', + debug = arg_dict['debug'], targets_entrez_gene = cpsr_target_genes) if 'picked_gene_csq' in vep_csq_record_results: rec.INFO['REGULATORY_ANNOTATION'] = map_regulatory_variant_annotations( @@ -191,7 +196,7 @@ def extend_vcf_annotations(arg_dict, logger): else: vep_csq_record_results = \ parse_vep_csq(rec, transcript_xref_map, vep_csq_fields_map, arg_dict['vep_pick_order'], - logger, pick_only = True, csq_identifier = 'CSQ') + logger, pick_only = True, csq_identifier = 'CSQ', debug = arg_dict['debug']) principal_csq_properties = {} principal_csq_properties['hgvsp'] = '.' 
@@ -242,7 +247,8 @@ def extend_vcf_annotations(arg_dict, logger): map_variant_effect_predictors(rec, dbnsfp_prediction_algorithms) if arg_dict['oncogenicity_annotation'] == 1: - assign_oncogenicity_evidence(rec, tumortype = arg_dict['tumortype']) + match_oncogenic_variants(vep_csq_record_results['all_csq'], oncogenic_variants, rec, principal_csq_properties) + assign_oncogenicity_evidence(rec, oncogenic_variants, tumortype = arg_dict['tumortype']) if "GENE_TRANSCRIPT_XREF" in vcf_info_element_types: gene_xref_tag = rec.INFO.get('GENE_TRANSCRIPT_XREF') diff --git a/scripts/pcgr_validate_input.py b/scripts/pcgr_validate_input.py index 1d2e56e6..0e680b21 100755 --- a/scripts/pcgr_validate_input.py +++ b/scripts/pcgr_validate_input.py @@ -8,12 +8,14 @@ import sys import pandas as np from cyvcf2 import VCF +import gzip from pcgr import vcf, cna from pcgr.annoutils import read_infotag_file, read_vcfanno_tag_file from pcgr.utils import error_message, check_subprocess, remove_file, random_id_generator, getlogger from pcgr.cna import is_valid_cna from pcgr.vcf import check_existing_vcf_info_tags, check_retained_vcf_info_tags +from pcgr import pcgr_vars def __main__(): @@ -25,9 +27,11 @@ def __main__(): parser.add_argument('input_cna', help='Somatic (tumor) copy number query segments (tab-separated values)') parser.add_argument('input_rna_fusion', help='Tumor RNA fusion variants (tab-separated values)') parser.add_argument('input_rna_exp', help='Tumor gene expression estimates (tab-separated values)') + parser.add_argument('input_cpsr', help='Classified germline calls from CPSR (tab-separated values)') parser.add_argument('panel_normal_vcf',help="VCF file with germline calls from panel of normals") parser.add_argument('tumor_only',type=int, default=0,choices=[0,1],help="Tumor only sequencing") parser.add_argument('sample_id',help='PCGR sample_name') + parser.add_argument('build',help='Genome build (grch37/grch38)') parser.add_argument('retained_info_tags', 
help="Comma-separated string of custom VCF INFO tags to be kept in PCGR output") parser.add_argument('tumor_dp_tag', help='VCF INFO tag that denotes tumor sequencing depth') parser.add_argument('tumor_af_tag', help='VCF INFO tag that denotes tumor variant allelic fraction') @@ -47,6 +51,7 @@ def __main__(): args.input_cna, args.input_rna_fusion, args.input_rna_exp, + args.input_cpsr, args.tumor_dp_tag, args.tumor_af_tag, args.control_dp_tag, @@ -58,6 +63,7 @@ def __main__(): args.retained_info_tags, args.tumor_only, args.sample_id, + args.build, args.keep_uncompressed, args.output_dir, args.debug) @@ -133,6 +139,30 @@ def is_valid_rna_expression(rna_exp_file, logger): logger.info("RNA expression file ('" + str(os.path.basename(rna_exp_file)) + "') adheres to the correct format") return 0 +def is_valid_germline(germline_file, build, logger): + """ + Function that checks whether the germline variants file (pre-classified by CPSR) adheres to the correct format + """ + + if not os.path.isfile(germline_file): + err_msg = "Germline variants file (" + str(germline_file) + ") does not exist" + return error_message(err_msg, logger) + + if not str(germline_file).endswith(f'cpsr.{build}.classification.tsv.gz'): + err_msg = "Germline variants file (" + str(germline_file) + ") does not adhere to the correct naming format - wrong build or file type" + return error_message(err_msg, logger) + + with gzip.open(germline_file, 'rt') as f: + germline_reader = csv.DictReader(f, delimiter='\t') + ## check that required columns are present + for col in pcgr_vars.germline_input_required_cols: + if col not in germline_reader.fieldnames: + err_msg = "Germline variants file (" + str(germline_file) + ") is missing required column: " + str(col) + return error_message(err_msg, logger) + + logger.info("Germline variants file ('" + str(os.path.basename(germline_file)) + "') adheres to the correct format") + return 0 + def validate_panel_normal_vcf(vcf, logger): """ @@ -260,6 +290,7 @@ def 
validate_pcgr_input(refdata_assembly_dir, input_cna, input_rna_fusion, input_rna_expression, + input_cpsr, tumor_dp_tag, tumor_af_tag, control_dp_tag, @@ -271,11 +302,18 @@ def validate_pcgr_input(refdata_assembly_dir, retained_info_tags, tumor_only, sample_id, + build, keep_uncompressed, output_dir, debug): """ - Function that reads the input files to PCGR (VCF file and Tab-separated values file with copy number segments) and performs the following checks: + Function that checks the format of input files to PCGR + - VCF file with somatic SNVs/InDels - mandatory + - Tab-separated values file with somatic copy number segments - optional + - Tab-separated values file with RNA fusion variants - optional + - Tab-separated values file with RNA expression values - optional + - Tab-separated values file with CPSR-classified germline mutations - optional + Function performs the following checks: 1. No INFO annotation tags in the input VCF coincides with those generated by PCGR 2. Provided columns for tumor/normal coverage and allelic depths are found in VCF 3. Provided retained VCF INFO tags are present in VCF file @@ -285,6 +323,7 @@ def validate_pcgr_input(refdata_assembly_dir, 7. Check that copy number segment file has required columns and correct data types (and range) 8. Check that RNA fusion variant file has required columns and correct data types 9. Check that RNA expression file has required columns and correct data types + 10. 
Check that germline mutation file has required columns and correct data types """ logger = getlogger('pcgr-validate-input-arguments') @@ -356,6 +395,12 @@ def validate_pcgr_input(refdata_assembly_dir, valid_cna = is_valid_cna(input_cna, logger) if valid_cna == -1: return -1 + + ## Check whether file with classified germline calls is properly formatted + if not input_cpsr == 'None': + valid_germline = is_valid_germline(input_cpsr, build, logger) + if valid_germline == -1: + return -1 ## Check whether file with RNA fusion variants is properly formatted if not input_rna_fusion == 'None': diff --git a/scripts/pcgrr.R b/scripts/pcgrr.R index 25e28f61..8b41fd57 100755 --- a/scripts/pcgrr.R +++ b/scripts/pcgrr.R @@ -32,13 +32,18 @@ pcg_report <- pcgrr::generate_report( yaml_fname = yaml_fname ) -#pcg_report$settings$conf$debug <- TRUE - ## Write report contents to output files (HTML, XLSX, TSV) if (!is.null(pcg_report)) { - pcgrr::write_report_quarto_html(report = pcg_report) + if(pcg_report$settings$conf$other$no_html == FALSE){ + pcgrr::write_report_quarto_html(report = pcg_report) + } + else{ + pcgrr::log4r_info("Skipping HTML report generation (option '--no_html' set to TRUE)") + } + pcgrr::write_report_excel(report = pcg_report) pcgrr::write_report_tsv(report = pcg_report, output_type = 'snv_indel') + pcgrr::write_report_tsv(report = pcg_report, output_type = 'snv_indel_unfiltered') pcgrr::write_report_tsv(report = pcg_report, output_type = 'cna_gene') pcgrr::write_report_tsv(report = pcg_report, output_type = 'msigs') }