Merge pull request #251 from sigven/cnaplot_patch

Sept 2024 release
sigven · Sep 29, 2024 · 6b39e4b · 6b39e4b
2 parents c3eca60 + 0cb0d76
commit 6b39e4b
Show file tree

Hide file tree

Showing 71 changed files with 2,684 additions and 981 deletions.
diff --git a/README.md b/README.md
@@ -24,24 +24,29 @@ Example screenshots from the [quarto](https://quarto.org)-based cancer genome re
 ![PCGR screenshot 2](pcgrr/pkgdown/assets/img/sc1.png)
 ![PCGR screenshot 3](pcgrr/pkgdown/assets/img/sc3.png)
 
-PCGR originates from the [Norwegian Cancer Genomics Consortium (NCGC)](http://cancergenomics.no), at the [Institute for Cancer Research, Oslo University Hospital, Norway](http://radium.no).
+PCGR originates from the [Norwegian Cancer Genomics Consortium (NCGC)](https://cancergenomics.no), at the [Institute for Cancer Research, Oslo University Hospital, Norway](https://radium.no).
 
 ### Top News
 
+- *September 29th 2024*: **2.1.0 release**
+  - updated bundle, more oncogenic variants, CNA visualization, 
+    improved RNA-seq support, bug fixes, and more
+  - [CHANGELOG](https://sigven.github.io/pcgr/articles/CHANGELOG.html)
+
 - *August 1st 2024*: **2.0.3 release** 
   - patch to fix purity/ploidy propagation, MAF output for tumor-only runs, and other minor issues
-  - [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html)
+  - [CHANGELOG](https://sigven.github.io/pcgr/articles/CHANGELOG.html)
 
 - *July 16th 2024*: **2.0.2 release** 
   - patch to ensure correct reference to actionability guidelines
-  - [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html)
+  - [CHANGELOG](https://sigven.github.io/pcgr/articles/CHANGELOG.html)
 
 - *July 7th 2024*: **2.0.1 release** 
   - patch with bug fix for mitochondrial input variants ([pr245](https://github.com/sigven/pcgr/pull/245))
-  - [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html)
+  - [CHANGELOG](https://sigven.github.io/pcgr/articles/CHANGELOG.html)
 
 - *June 2024*: **2.0.0 release**
-  - Details in [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html)
+  - Details in [CHANGELOG](https://sigven.github.io/pcgr/articles/CHANGELOG.html)
   - Massive reference data bundle upgrade, new report layout, oncogenicity classification++
   - Support for Singularity/Apptainer
   - Major data/software updates:
@@ -52,19 +57,9 @@ PCGR originates from the [Norwegian Cancer Genomics Consortium (NCGC)](http://ca
     - CancerMine `v50` (2023-03)
     - UniProt KB `v2024_03`
 
-- *February 2023*: **1.3.0 release**
-  - Details in [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html)
-  - prioritize protein-coding BIOTYPE csq ([pr201](https://github.com/sigven/pcgr/pull/201))
-  - expose `--pcgrr_conda` option to flexibly activate pcgrr env via a non-default pcgrr name
-  - `cpsr_validate_input.py`: refactor for efficient custom gene egrep
-
-- *November 2022*: **1.2.0 release**
-  -    Keep only autosomal, X, Y, M/MT chromosomes
-  -    Import bcftools as dependency
-
 ### Example reports
 
-[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.12752833.svg)](https://doi.org/10.5281/zenodo.12752833)
+[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.13855988.svg)](https://doi.org/10.5281/zenodo.13855988)
 
 ### Why use PCGR?
 
@@ -94,10 +89,12 @@ PCGR integrates a [comprehensive set of knowledge resources](https://sigven.gith
 
 ### Citation
 
-If you use PCGR, please cite our publication:
+If you use PCGR or CPSR, please cite our publications:
 
 Sigve Nakken, Ghislain Fournous, Daniel Vodák, Lars Birger Aasheim, Ola Myklebost, and Eivind Hovig. **Personal Cancer Genome Reporter: variant interpretation report for precision oncology** (2017). *Bioinformatics*. 34(10):1778--1780. [doi.org/10.1093/bioinformatics/btx817](https://doi.org/10.1093/bioinformatics/btx817)
 
+Sigve Nakken, Vladislav Saveliev, Oliver Hofmann, Pål Møller, Ola Myklebost, and Eivind Hovig. **Cancer Predisposition Sequencing Reporter (CPSR): a flexible variant report engine for high-throughput germline screening in cancer** (2021). *Int J Cancer*. [doi.org/10.1002/ijc.33749](https://doi.org/10.1002/ijc.33749)
+
 ## Contact
 
 sigven AT ifi.uio.no
diff --git a/pcgr/annoutils.py b/pcgr/annoutils.py
diff --git a/pcgr/arg_checker.py b/pcgr/arg_checker.py
@@ -181,8 +181,8 @@ def verify_args(arg_dict):
         warn_message(warn_msg, logger)
 
     # Check that the threshold for gains/amplifications is properly set, and that segment overlap with transcripts is set appropriately
-    if arg_dict['n_copy_gain'] <= 0:
-        err_msg = f"Totaly copy number threshold for gains/amplifications ('--n_copy_gain' = {arg_dict['n_copy_gain']}) should be > 0"
+    if arg_dict['n_copy_gain'] <= 2:
+        err_msg = f"Total copy number threshold for gains/amplifications ('--n_copy_gain' = {arg_dict['n_copy_gain']}) should be > 2"
         error_message(err_msg, logger)
     if arg_dict['cna_overlap_pct'] > 100 or arg_dict['cna_overlap_pct'] <= 0:
         err_msg = f"Minimum percent overlap between copy number segment and gene transcript ('--cna_overlap_pct' = {arg_dict['cna_overlap_pct']}) must be within (0, 100]"
@@ -220,15 +220,11 @@ def define_output_files(arg_dict, cpsr = False):
     output_data['yaml']= f"{output_prefix}.conf.yaml"
 
     if not cpsr:
-        output_data['cna'] = f"{output_prefix}.cna_segments.tsv.gz"
-        output_data['expression'] = f"{output_prefix}.expression.tsv.gz"
-        output_data['csq_expression'] = f"{output_prefix}.csq_expression.tsv.gz"
-        output_data['expression_outliers'] = f"{output_prefix}.expression_outliers.tsv.gz"
-        output_data['expression_similarity'] = f"{output_prefix}.expression_similarity.tsv.gz"
-        output_data['snv_indel_ann'] = f"{output_prefix}.snv_indel_ann.tsv.gz"
+        for otype in ['cna_gene','cna_segment','expression','expression_outliers',
+                      'expression_similarity','snv_indel_ann','msigs']:
+            output_data[otype] = f"{output_prefix}.{otype}.tsv.gz"        
         output_data['maf'] = f"{output_prefix}.maf"
         output_data['tmb'] = f"{output_prefix}.tmb.tsv"
-        output_data['msigs'] = f"{output_prefix}.msigs.tsv.gz"
     else:
         output_data['classification'] = f"{output_prefix}.classification.tsv.gz"
 
@@ -240,7 +236,7 @@ def define_output_files(arg_dict, cpsr = False):
             error_message(err_msg, logger)
 
     if not cpsr:
-        for otype in ['cna', 'expression', 'expression_outliers', 'snv_indel_ann',
+        for otype in ['cna_gene', 'cna_segment','expression', 'expression_outliers', 'snv_indel_ann',
                       'expression_similarity','maf','tmb','msigs']:
             # if annotated output cna segments exist and overwrite not set
             if os.path.exists(output_data[otype]) and arg_dict["force_overwrite"] is False:
@@ -267,6 +263,7 @@ def verify_input_files(arg_dict):
     input_cna_dir = 'NA'
     input_rna_fusion_dir = 'NA'
     input_germline_dir = 'NA'
+    input_germline_yaml_dir = 'NA'
     input_rna_expression_dir = 'NA'
     pon_vcf_dir = 'NA'
     db_dir = 'NA'
@@ -277,6 +274,7 @@ def verify_input_files(arg_dict):
     input_rna_fusion_basename = 'NA'
     input_rna_expression_basename = 'NA'
     input_germline_basename = 'NA'
+    input_germline_yaml_basename = 'NA'
     arg_dict['rna_fusion_tumor'] = None
 
     # create output folder (if not already exists)
@@ -366,19 +364,30 @@ def verify_input_files(arg_dict):
             os.path.abspath(arg_dict["input_rna_exp"]))
 
     # check if input germline calls (CPSR) exist
-    #if not arg_dict["input_germline"] is None:
-    #    if not os.path.exists(os.path.abspath(arg_dict["input_germline"])):
-    #        err_msg = "Input file (" + \
-    #            str(arg_dict["input_germline"]) + ") does not exist"
-    #        error_message(err_msg, logger)
-    #    if not (os.path.abspath(arg_dict["input_germline"]).endswith(".tsv.gz")):
-    #        err_msg = "File with CPSR-classified germline calls  (" + os.path.abspath(
-    #            arg_dict["input_germline"]) + ") does not have the correct file extension (.json.gz)"
-    #        error_message(err_msg, logger)
-    #    input_germline_basename = os.path.basename(
-    #        str(arg_dict["input_germline"]))
-    #    input_germline_dir = os.path.dirname(
-    #        os.path.abspath(arg_dict["input_germline"]))
+    if not arg_dict["input_cpsr"] is None:
+        if not os.path.exists(os.path.abspath(arg_dict["input_cpsr"])):
+            err_msg = "Input file (" + \
+               str(arg_dict["input_cpsr"]) + ") does not exist"
+            error_message(err_msg, logger)
+        if not (os.path.abspath(arg_dict["input_cpsr"]).endswith(".tsv.gz")):
+            err_msg = "File with CPSR-classified germline calls  (" + os.path.abspath(
+               arg_dict["input_cpsr"]) + ") does not have the correct file extension (.tsv.gz)"
+            error_message(err_msg, logger)
+
+        if arg_dict["input_cpsr_yaml"] is None:
+            err_msg = "Input file with CPSR configuration settings (--input_cpsr_yaml) is missing"
+            error_message(err_msg, logger)
+        else:
+            check_file_exists(os.path.abspath(arg_dict["input_cpsr_yaml"]), strict = True, logger = logger)
+            input_germline_yaml_basename = os.path.basename(
+                str(arg_dict["input_cpsr_yaml"]))
+            input_germline_yaml_dir = os.path.dirname(
+                os.path.abspath(arg_dict["input_cpsr_yaml"]))
+
+        input_germline_basename = os.path.basename(
+            str(arg_dict["input_cpsr"]))
+        input_germline_dir = os.path.dirname(
+            os.path.abspath(arg_dict["input_cpsr"]))   
 
     vep_dir = verify_vep_cache(arg_dict, logger)
     refdata_assembly_dir = verify_refdata(arg_dict, logger, cpsr = False)
@@ -391,6 +400,8 @@ def verify_input_files(arg_dict):
       "rna_expression_dir": input_rna_expression_dir,
       "germline_dir": input_germline_dir,
       "germline_basename": input_germline_basename,
+      "germline_yaml_dir": input_germline_yaml_dir,
+      "germline_yaml_basename": input_germline_yaml_basename,
       "pon_vcf_dir": pon_vcf_dir,
       "refdata_assembly_dir": refdata_assembly_dir,
       "vep_dir": vep_dir,

diff --git a/pcgr/biomarker.py b/pcgr/biomarker.py
@@ -184,10 +184,12 @@ def match_csq_biomarker(transcript_csq_elements, variant_biomarkers, rec, princi
    principal_csq_hgvsc = False
    for csq_elem in transcript_csq_elements:
       (consequence, symbol, entrezgene, hgvsc, hgvsp, exon, feature_type, feature, biotype) = csq_elem.split(':')
-      #print(csq_elem)
 
-      if bool(re.search(r'^(missense|stop|start|inframe|splice_donor|protein|splice_acceptor|frameshift)', consequence)) is True:
+      #if bool(re.search(r'^(missense|stop|start|inframe|protein|splice_donor|splice_acceptor|frameshift)', consequence)) is True:
+      #   mut_protein = True
+      if bool(re.search(r'^(missense|stop|start|inframe|protein|frameshift)', consequence)) is True:
          mut_protein = True
+
 
       hgvsp_short = threeToOneAA(hgvsp)
 
@@ -228,12 +230,10 @@ def match_csq_biomarker(transcript_csq_elements, variant_biomarkers, rec, princi
 
          if len(codon_match) > 0:
             biomarker_key_codon = str(entrezgene) + '_' + str(codon_match[0])
-            #print("CODON\t" + str(biomarker_key_codon))
 
             ## match biomarkers annotated as "CODON" only for a given gene
             if biomarker_key_codon in variant_biomarkers['hgvsp']:
                hits_codon = variant_biomarkers['hgvsp'][biomarker_key_codon]
-               #print("CODON\t" + str(hits_codon))
                for chit in hits_codon:
                   if not chit['alteration_type'] == "CODON":
                      continue
@@ -273,8 +273,8 @@ def match_csq_biomarker(transcript_csq_elements, variant_biomarkers, rec, princi
       if entrezgene != "." and not rec.INFO.get('HGVSc') is None:
          hgvsc_elements = str(rec.INFO.get('HGVSc')).split(':')
          if len(hgvsc_elements) == 2:
-            hgvsc_biomarker_key = str(entrezgene) + '_' + str(hgvsc_elements[1])              
-            if hgvsc_biomarker_key in variant_biomarkers['hgvsc'].keys():                  
+            hgvsc_biomarker_key = str(entrezgene) + '_' + str(hgvsc_elements[1]) 
+            if hgvsc_biomarker_key in variant_biomarkers['hgvsc'].keys():
                hits_hgvsc = variant_biomarkers['hgvsc'][hgvsc_biomarker_key]
                for hit_hgvsc in hits_hgvsc:
                   hgvsc_hit = f"{hit_hgvsc['biomarker_source']}|{hit_hgvsc['variant_id']}|{hit_hgvsc['clinical_evidence_items']}"
@@ -289,7 +289,6 @@ def match_csq_biomarker(transcript_csq_elements, variant_biomarkers, rec, princi
       ## Match biomarkers indicated by exon number (and consequence) - "exon level" resolution
       if entrezgene != "." and principal_csq_entrezgene is True and exon != ".":
          exon_biomarker_key = str(entrezgene) + '_' + str(exon)
-         #print("EXON\t" + str(exon_biomarker_key))
          if exon_biomarker_key in variant_biomarkers['exon'].keys():
             hits_exon = variant_biomarkers['exon'][exon_biomarker_key]