From b60ea96757f8da1dde3b3d12671c79fd13a665d1 Mon Sep 17 00:00:00 2001 From: Sigve Nakken Date: Sun, 17 Dec 2023 14:59:12 +0100 Subject: [PATCH] fix panel match and gnomad col --- pcgr/cpsr.py | 9 +++++---- pcgr/main.py | 3 ++- pcgr/variant.py | 10 +++++++--- pcgrr/data-raw/data-raw.R | 4 ++-- pcgrr/data/data_coltype_defs.rda | Bin 1916 -> 1898 bytes scripts/cpsr_validate_input.py | 2 +- 6 files changed, 17 insertions(+), 11 deletions(-) diff --git a/pcgr/cpsr.py b/pcgr/cpsr.py index 4531f85c..aded3dc1 100755 --- a/pcgr/cpsr.py +++ b/pcgr/cpsr.py @@ -44,9 +44,9 @@ def get_args(): optional_other.add_argument('--version', action='version', version=str(utils.get_cpsr_version())) optional_other.add_argument('--no_reporting',action="store_true",help="Run functional variant annotation on VCF through VEP/vcfanno, omit classification/report generation (STEP 4), default: %(default)s") optional_other.add_argument('--retained_info_tags', dest ='retained_info_tags', default='None', help='Comma-separated string of VCF INFO tags from query VCF that should be kept in CPSR output TSV') - #optional_other.add_argument('--report_theme',choices = ['default','cerulean','journal','flatly','readable','spacelab','united','cosmo','lumen','paper','sandstone','simplex','yeti'], default = 'default', help='Visual report theme (rmarkdown), default: %(default)s' ) - #optional_other.add_argument('--report_nonfloating_toc', action='store_true', help='Do not float the table of contents (TOC) in output HTML report, default: %(default)s') - #optional_other.add_argument('--report_table_display', choices = ['full','light'], default='light', help="Set the level of detail/comprehensiveness in interactive datables of HTML report, very comprehensive (option 'full') or slim/focused ('light'), default: %(default)s") + optional_other.add_argument('--report_theme',choices = ['default','cerulean','journal','flatly','readable','spacelab','united','cosmo','lumen','paper','sandstone','simplex','yeti'], default = 'default', help='Visual report theme (rmarkdown), default: %(default)s' ) + optional_other.add_argument('--report_nonfloating_toc', action='store_true', help='Do not float the table of contents (TOC) in output HTML report, default: %(default)s') + optional_other.add_argument('--report_table_display', choices = ['full','light'], default='light', help="Set the level of detail/comprehensiveness in interactive datables of HTML report, very comprehensive (option 'full') or slim/focused ('light'), default: %(default)s") optional_other.add_argument('--ignore_noncoding', action='store_true',dest='ignore_noncoding',default=False,help='Ignore non-coding (i.e. non protein-altering) variants in report, default: %(default)s') optional_other.add_argument("--debug", action="store_true", help="Print full commands to log") optional_other.add_argument("--pcgrr_conda", default="pcgrr", help="pcgrr conda env name (default: %(default)s)") @@ -282,7 +282,8 @@ def run_cpsr(conf_options, cpsr_paths): output_pass_vcf2tsv_gz, pcgr_db_dir = cpsr_paths["db_dir"], logger = logger) variant_set = variant.clean_annotations(variant_set, yaml_data, germline = True, logger = logger) variant_set.to_csv(output_pass_tsv_gz, sep="\t", compression="gzip", index=False) - utils.remove(output_pass_vcf2tsv_gz) + if not debug: + utils.remove(output_pass_vcf2tsv_gz) logger.info('Finished cpsr-summarise') diff --git a/pcgr/main.py b/pcgr/main.py index a65bcc55..3a0ec50d 100755 --- a/pcgr/main.py +++ b/pcgr/main.py @@ -452,7 +452,8 @@ def run_pcgr(pcgr_paths, conf_options): variant_set = variant.set_allelic_support(variant_set, allelic_support_tags = yaml_data["conf"]['somatic_snv']['allelic_support']) variant_set = variant.clean_annotations(variant_set, yaml_data, germline = False, logger = logger) variant_set.to_csv(output_pass_tsv_gz, sep="\t", compression="gzip", index=False) - utils.remove(output_pass_vcf2tsv_gz) + if not debug: + utils.remove(output_pass_vcf2tsv_gz) if yaml_data["conf"]['assay_properties']['type'] == 'WGS' or yaml_data["conf"]['assay_properties']['type'] == 'WES': # check that output file exist diff --git a/pcgr/variant.py b/pcgr/variant.py index bbd9c64e..aeb5ccc7 100644 --- a/pcgr/variant.py +++ b/pcgr/variant.py @@ -81,6 +81,7 @@ def append_annotations(vcf2tsv_gz_fname: str, pcgr_db_dir: str, logger): ## check number of variants with Ensembl gene ID's num_recs_with_entrez_hits = vcf2tsv_df["ENTREZGENE"].notna().sum() + #print(str(num_recs_with_entrez_hits)) ## merge variant set with ClinVar trait and variant origin annotations if num_recs_with_clinvar_hits > 0: if os.path.exists(clinvar_tsv_fname): @@ -133,7 +134,6 @@ def append_annotations(vcf2tsv_gz_fname: str, pcgr_db_dir: str, logger): usecols=["entrezgene","name"]) gene_xref_df = gene_xref_df[gene_xref_df['entrezgene'].notnull()].drop_duplicates() gene_xref_df["entrezgene"] = gene_xref_df["entrezgene"].astype("int64").astype("string") - #print(gene_xref_df.head) gene_xref_df.rename(columns = {'entrezgene':'ENTREZGENE', 'name':'GENENAME'}, inplace = True) vcf2tsv_df = vcf2tsv_df.merge(gene_xref_df, left_on=["ENTREZGENE"], right_on=["ENTREZGENE"], how="left") vcf2tsv_df["ENTREZGENE"] = vcf2tsv_df['ENTREZGENE'].str.replace("\\.[0-9]{1,}$", "", regex = True) @@ -253,8 +253,12 @@ def clean_annotations(variant_set: pd.DataFrame, yaml_data: dict, germline: bool variant_set['EFFECT_PREDICTIONS'] = variant_set['EFFECT_PREDICTIONS'].str.replace("\\.&|\\.$", "NA&", regex = True) variant_set['EFFECT_PREDICTIONS'] = variant_set['EFFECT_PREDICTIONS'].str.replace("&$", "", regex = True) variant_set['EFFECT_PREDICTIONS'] = variant_set['EFFECT_PREDICTIONS'].str.replace("&", ", ", regex = True) - variant_set.loc[variant_set['CLINVAR_CONFLICTED'] == 1, "CLINVAR_CONFLICTED"] = True - variant_set.loc[variant_set['CLINVAR_CONFLICTED'] != 1, "CLINVAR_CONFLICTED"] = False + variant_set['clinvar_conflicted_bool'] = True + variant_set.loc[variant_set['CLINVAR_CONFLICTED'] == 1, "clinvar_conflicted_bool"] = True + variant_set.loc[variant_set['CLINVAR_CONFLICTED'] != 1, "clinvar_conflicted_bool"] = False + variant_set.drop('CLINVAR_CONFLICTED', inplace=True, axis=1) + variant_set.rename(columns = {'clinvar_conflicted_bool':'CLINVAR_CONFLICTED'}, inplace = True) + if not {'VCF_SAMPLE_ID'}.issubset(variant_set.columns): variant_set['VCF_SAMPLE_ID'] = str(yaml_data['sample_id']) diff --git a/pcgrr/data-raw/data-raw.R b/pcgrr/data-raw/data-raw.R index 04d53667..4faf189b 100755 --- a/pcgrr/data-raw/data-raw.R +++ b/pcgrr/data-raw/data-raw.R @@ -64,11 +64,11 @@ usethis::use_data(color_palette, overwrite = T) #-----evidence types---------# evidence_types <- c("predictive","prognostic","diagnostic", "oncogenic","predisposing","functional") -usethis::use_data(evidence_types) +usethis::use_data(evidence_types, overwrite = T) #-----evidence levels---------# evidence_levels <- c("any","A_B","C_D_E") -usethis::use_data(evidence_levels) +usethis::use_data(evidence_levels, overwrite = T) #-----input column names/types-----# diff --git a/pcgrr/data/data_coltype_defs.rda b/pcgrr/data/data_coltype_defs.rda index 2e7197c1b53eeb7b131fa8c7d738f6c440e130f1..fd1eff8b5cdb8c4681d687993819e471f4645b6a 100644 GIT binary patch delta 1891 zcmV-p2b}o)4(bknLRx4!F+o`-Q&~uP-hTieSpT5^|Npx{01yB_|Nrp6@86&h0005t z34eO~+~#KJb?5*900uzi;RYI>r5>OF9-tZk00000002rxlO~fUi1iu(000Km(dd8x z05}E_gwqhfAQ=EKjWT4yFc8o}l_{WUqfG&z00006fB5}L4A?jBz;btZX?9PwNC)mk zh$12>X<(?Br!0zAvceQ8T$NO{Nfk&sVa;jNl>X<}!tK|ym2`7-;Jbrqy6&{$x|(dq zYnIZ=`rhoVG*tG^;LC6DspoV?=Tl{&&wbHoVI>HEh!ntkoFV~vq@ZB6((1%G)c?InV0x~LsaiPFe z2}MEC0g@n~TB3pqks%MYB=4yp84(5oDx%9SSrcw4Q0UAiH^mqzN&pq?)~8TB)#7NX zS=ms3<<3r#l5;p(88tn&YI;WLn>D1Llhqged$AhLJ!Rx9m&3Y@smr>hO0Tu8nsifk ziltD|os*~BTR+Q>l>V2)A@+LYmXzf5GKCsy?EOn%$?$0QGKqMd}p&_Jb z8lb(>nHtp%YJ{~(4(2bfhh`Q}9~(V~vw}qQ`7rjr`{F9>S@Kg%n8|DT-$?fRw@d8# znHrwuM|n%7mTkJ@m8ZpuZp4PvKIya5I(N>eT@Cr&&1FZ8-7VXmHJL2j{)6%1$5$?Y zE-t?}#xXp3(ZcRco-|V`jOm3;-nW*uB5v;~z|k1)-TLlpPr)?abIEDOC(!sej`nb}WL{5qUiiB+yznh4?0A=Kj!H^Y zGtuV6Sf>Zzx822#E-IggFH`8}@yt7aqscd0@SNQtEcADs1AVIDZGtZxLMixpL+8%KHOYR)XJO6OG3L=rKPU4qZcNBY)n%M zODQQ*l`>4F#qh)_h2=6b#YJmoMK7B+jJht)NpWW+o$MPj%YkWSm|j&qjhG`PO2UPT zehW)3_IXY^7`ZTWjXNRJzhDf8Xk9&qKcN zcYEBFy2edB*I}}1nI444CQQ_SrmgRF<<_waa%^^kL3S?B(yUTq?Mfh_ww#68U3|xLtnI$}U2{#6W1I2}lY?N2``c$UXxS z23XY?5FsPwm30M#qY^fZs20eR?FAlx{x+^DpDN1gSh`%Y%_++kGE%#-jyPSiu-(g= zjNLis%X{k0XD2QjDqFe7X6!HK)o&S`+^uzM#kT65w!>Aa(Y@T<7h(FHmzqpDx^$&B z-)yk0ANWI@2>)J|Pp|MVr-atV3KoZE_6bK+4FN=?)gmXa#f4XCnl(Kjy=tI{;GPJq^2Ov2>q?Y86 zx@ZimGXTg*3~`X5i5P~3r6Ly~Y{63?$gs*&m6(H{xK0NOOa$+H}qX|}P+ zwNM1yHH~2%cYsha{3cV$6S>`PICV`>L5bHS5O74P@RjeBB(x468<5-8f*HjAlUedVj+2}ac0vk*4sausz1X+YoXS4FyGFh2rMeP2z>6sa d7u?L^C=hdQilAXoK%9TY+>uTcBoZEXpTK?KrEUNK literal 1916 zcmV-?2ZQ)RT4*^jL0KkKS)t^Q`~V*W|DgZ>|Gz*05CA{_|M9=?-=H7>00H0$e|=Aw zQ$Xkd01kkC1I5O{4O8_f^#BHsPyhe`000000mh7(7)Br%O$Grp8U{?7GGs7-!&IBp z%}8hf00006fB*mh0!=DOG-3>xn1ItD!88U#O$H%|4^RlAl=7q0+d_|129r%RXa<@VrLhA;%_`1>kyRw0mw#?yzucHNTv!(1!!#oAhCsqL8C_E#P_)o*8K;jrXwZ-gWM>>g-`+V$E}AD zYLVh-s#)5oa_1*TNzCDBWYqTAspL0I*{vk}-nhTp*@)I@=3YX{d^;%0oVzMis{Ji& z)1sTSRVs#z?43S>+5UWeQ~KWwhv@T?T2qs~lql0zYx@CPHk7RmtzjOK7}bl2w+JGAOb?c z1!M+L*V6gkghkYt3QP4SFmowSRJg*)!H6pxvZ%1yQRB%<5`l;&$RxI z9CdQy;_LLjF^S{Oju&!m<4R>woiM4J&Fa>qP2I&f8YUUJ7s^U`^htJUoM(0NT}{?q zoU-W9WuNYro|5v=_ZP8ucDtI>>rFSD@>+3;^F9sZxtuJS7n9xBx-QJ_JPS&D9wpmJ z$w^9vdOaA66yW;S`Z%%0#Z&9$>wMh)IfrC=lXbqi&C(Li{^Ou;x2`odb~TzWgw{&D z%FG_aSHj%1;Ga*g*3H$~>1SIe&c@8>E1Ym${7bT_8aQ@MrY^KyyPau9_gWSzb+mhK zwxi`XlhDRd-nSKO>ENb*_sz;~HEQ8!Yxy5i^D{o)E+=g=rt*@|uG-YJwbqnkedsS4bs_|@$2Mr|8`&n% z>fM7poG`nitePzOJ91xzyp-KMoFwa`BwlM_`lZH-cst$~#W=A3RLi8=soGlI*t-hs zTP#DH4hC0nMboqBdO0ShiJvy?n2+PDJ615eMM|nu%|qeL!tEXoR!`CJexEK9dVFWb zd?!@u=T0jn{c2wl#|hivdnH=yms;JdUiD|*W5I^LD?J>ask>yX&tjdZ^lbR6boa_d z+U2{d`Y`7vc5?UFba(WfiGK2`Axq;b!kN5HB3u*#Fu*XVAYpJ|gH$AzBqaeRsiV}j z)M(Qz_16Cd^wm>_?=2kKHe}-2n#^*uhpNl6#>WPY+*ICOn)p=DqjtMaP5Awmt|`k* zuBDetmRY4aV#Z2WW--SLwpJUtb5Wb8Jo#^ZS*+yc!(~f$IPBep{MxPIGnb4elozm1de|K*YkkfZ`KpHhaZgD;0T zRSJY=0T1A+6x(6i_!&SuhU=(i1@(d#K?oH>%Ql+@KtE!rZjUISLIIJ=+|m_7@G1s> zwqui&vUn%2>LHb-$sl=I0B1y!joX;co- zK*LX{0W$xjCV}Sw;4#nR8)ag}j|FBJ&lN+otgJ@^bch8znq4s)<@V={btHj_*CI$9 z5h{Eod*ul&1Bb@sHud0!bDBidg^3jdlZqyi2t7#>>LJjQ2h#N>v*dpbCp3^dWRNK& z4rwGgOWlin`OKs7^R#Q+%3Gl$s!0ndk_Gom%t$JA`{YOqKoBe+_`8xR!i0qnB!A#e CytvW; diff --git a/scripts/cpsr_validate_input.py b/scripts/cpsr_validate_input.py index 26ddb2c5..b90cf86a 100755 --- a/scripts/cpsr_validate_input.py +++ b/scripts/cpsr_validate_input.py @@ -230,7 +230,7 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, custom_bed, pcgr_directory, geno ## awk command to ignore secondary finding records while keeping records that belong to target (and that can potentially ## be part of the secondary findings list) - awk_command = "awk 'BEGIN{FS=\"\\t\"}{if($4 !~ /ACMG_SF/ || ($4 ~ /ACMG_SF/ && $4 ~ /" + str(ge_panel_identifier) + "/))print;}'" + awk_command = "awk 'BEGIN{FS=\"\\t\"}{if($4 !~ /ACMG_SF/ || ($4 ~ /ACMG_SF/ && $4 ~ /" + str(ge_panel_identifier) + ":/))print;}'" if gwas_findings == 0 and secondary_findings == 1: check_subprocess(logger, f'bgzip -dc {target_bed_gz} | egrep -v "(\|tag\|)" >> {virtual_panels_tmp_bed}', debug) elif gwas_findings == 0 and secondary_findings == 0: