diff --git a/pcgr/cpsr.py b/pcgr/cpsr.py index 4531f85c..aded3dc1 100755 --- a/pcgr/cpsr.py +++ b/pcgr/cpsr.py @@ -44,9 +44,9 @@ def get_args(): optional_other.add_argument('--version', action='version', version=str(utils.get_cpsr_version())) optional_other.add_argument('--no_reporting',action="store_true",help="Run functional variant annotation on VCF through VEP/vcfanno, omit classification/report generation (STEP 4), default: %(default)s") optional_other.add_argument('--retained_info_tags', dest ='retained_info_tags', default='None', help='Comma-separated string of VCF INFO tags from query VCF that should be kept in CPSR output TSV') - #optional_other.add_argument('--report_theme',choices = ['default','cerulean','journal','flatly','readable','spacelab','united','cosmo','lumen','paper','sandstone','simplex','yeti'], default = 'default', help='Visual report theme (rmarkdown), default: %(default)s' ) - #optional_other.add_argument('--report_nonfloating_toc', action='store_true', help='Do not float the table of contents (TOC) in output HTML report, default: %(default)s') - #optional_other.add_argument('--report_table_display', choices = ['full','light'], default='light', help="Set the level of detail/comprehensiveness in interactive datables of HTML report, very comprehensive (option 'full') or slim/focused ('light'), default: %(default)s") + optional_other.add_argument('--report_theme',choices = ['default','cerulean','journal','flatly','readable','spacelab','united','cosmo','lumen','paper','sandstone','simplex','yeti'], default = 'default', help='Visual report theme (rmarkdown), default: %(default)s' ) + optional_other.add_argument('--report_nonfloating_toc', action='store_true', help='Do not float the table of contents (TOC) in output HTML report, default: %(default)s') + optional_other.add_argument('--report_table_display', choices = ['full','light'], default='light', help="Set the level of detail/comprehensiveness in interactive datables of HTML report, very comprehensive (option 'full') or slim/focused ('light'), default: %(default)s") optional_other.add_argument('--ignore_noncoding', action='store_true',dest='ignore_noncoding',default=False,help='Ignore non-coding (i.e. non protein-altering) variants in report, default: %(default)s') optional_other.add_argument("--debug", action="store_true", help="Print full commands to log") optional_other.add_argument("--pcgrr_conda", default="pcgrr", help="pcgrr conda env name (default: %(default)s)") @@ -282,7 +282,8 @@ def run_cpsr(conf_options, cpsr_paths): output_pass_vcf2tsv_gz, pcgr_db_dir = cpsr_paths["db_dir"], logger = logger) variant_set = variant.clean_annotations(variant_set, yaml_data, germline = True, logger = logger) variant_set.to_csv(output_pass_tsv_gz, sep="\t", compression="gzip", index=False) - utils.remove(output_pass_vcf2tsv_gz) + if not debug: + utils.remove(output_pass_vcf2tsv_gz) logger.info('Finished cpsr-summarise') diff --git a/pcgr/main.py b/pcgr/main.py index a65bcc55..3a0ec50d 100755 --- a/pcgr/main.py +++ b/pcgr/main.py @@ -452,7 +452,8 @@ def run_pcgr(pcgr_paths, conf_options): variant_set = variant.set_allelic_support(variant_set, allelic_support_tags = yaml_data["conf"]['somatic_snv']['allelic_support']) variant_set = variant.clean_annotations(variant_set, yaml_data, germline = False, logger = logger) variant_set.to_csv(output_pass_tsv_gz, sep="\t", compression="gzip", index=False) - utils.remove(output_pass_vcf2tsv_gz) + if not debug: + utils.remove(output_pass_vcf2tsv_gz) if yaml_data["conf"]['assay_properties']['type'] == 'WGS' or yaml_data["conf"]['assay_properties']['type'] == 'WES': # check that output file exist diff --git a/pcgr/variant.py b/pcgr/variant.py index bbd9c64e..aeb5ccc7 100644 --- a/pcgr/variant.py +++ b/pcgr/variant.py @@ -81,6 +81,7 @@ def append_annotations(vcf2tsv_gz_fname: str, pcgr_db_dir: str, logger): ## check number of variants with Ensembl gene ID's num_recs_with_entrez_hits = vcf2tsv_df["ENTREZGENE"].notna().sum() + #print(str(num_recs_with_entrez_hits)) ## merge variant set with ClinVar trait and variant origin annotations if num_recs_with_clinvar_hits > 0: if os.path.exists(clinvar_tsv_fname): @@ -133,7 +134,6 @@ def append_annotations(vcf2tsv_gz_fname: str, pcgr_db_dir: str, logger): usecols=["entrezgene","name"]) gene_xref_df = gene_xref_df[gene_xref_df['entrezgene'].notnull()].drop_duplicates() gene_xref_df["entrezgene"] = gene_xref_df["entrezgene"].astype("int64").astype("string") - #print(gene_xref_df.head) gene_xref_df.rename(columns = {'entrezgene':'ENTREZGENE', 'name':'GENENAME'}, inplace = True) vcf2tsv_df = vcf2tsv_df.merge(gene_xref_df, left_on=["ENTREZGENE"], right_on=["ENTREZGENE"], how="left") vcf2tsv_df["ENTREZGENE"] = vcf2tsv_df['ENTREZGENE'].str.replace("\\.[0-9]{1,}$", "", regex = True) @@ -253,8 +253,12 @@ def clean_annotations(variant_set: pd.DataFrame, yaml_data: dict, germline: bool variant_set['EFFECT_PREDICTIONS'] = variant_set['EFFECT_PREDICTIONS'].str.replace("\\.&|\\.$", "NA&", regex = True) variant_set['EFFECT_PREDICTIONS'] = variant_set['EFFECT_PREDICTIONS'].str.replace("&$", "", regex = True) variant_set['EFFECT_PREDICTIONS'] = variant_set['EFFECT_PREDICTIONS'].str.replace("&", ", ", regex = True) - variant_set.loc[variant_set['CLINVAR_CONFLICTED'] == 1, "CLINVAR_CONFLICTED"] = True - variant_set.loc[variant_set['CLINVAR_CONFLICTED'] != 1, "CLINVAR_CONFLICTED"] = False + variant_set['clinvar_conflicted_bool'] = True + variant_set.loc[variant_set['CLINVAR_CONFLICTED'] == 1, "clinvar_conflicted_bool"] = True + variant_set.loc[variant_set['CLINVAR_CONFLICTED'] != 1, "clinvar_conflicted_bool"] = False + variant_set.drop('CLINVAR_CONFLICTED', inplace=True, axis=1) + variant_set.rename(columns = {'clinvar_conflicted_bool':'CLINVAR_CONFLICTED'}, inplace = True) + if not {'VCF_SAMPLE_ID'}.issubset(variant_set.columns): variant_set['VCF_SAMPLE_ID'] = str(yaml_data['sample_id']) diff --git a/pcgrr/data-raw/data-raw.R b/pcgrr/data-raw/data-raw.R index 04d53667..4faf189b 100755 --- a/pcgrr/data-raw/data-raw.R +++ b/pcgrr/data-raw/data-raw.R @@ -64,11 +64,11 @@ usethis::use_data(color_palette, overwrite = T) #-----evidence types---------# evidence_types <- c("predictive","prognostic","diagnostic", "oncogenic","predisposing","functional") -usethis::use_data(evidence_types) +usethis::use_data(evidence_types, overwrite = T) #-----evidence levels---------# evidence_levels <- c("any","A_B","C_D_E") -usethis::use_data(evidence_levels) +usethis::use_data(evidence_levels, overwrite = T) #-----input column names/types-----# diff --git a/pcgrr/data/data_coltype_defs.rda b/pcgrr/data/data_coltype_defs.rda index 2e7197c1..fd1eff8b 100644 Binary files a/pcgrr/data/data_coltype_defs.rda and b/pcgrr/data/data_coltype_defs.rda differ diff --git a/scripts/cpsr_validate_input.py b/scripts/cpsr_validate_input.py index 26ddb2c5..b90cf86a 100755 --- a/scripts/cpsr_validate_input.py +++ b/scripts/cpsr_validate_input.py @@ -230,7 +230,7 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, custom_bed, pcgr_directory, geno ## awk command to ignore secondary finding records while keeping records that belong to target (and that can potentially ## be part of the secondary findings list) - awk_command = "awk 'BEGIN{FS=\"\\t\"}{if($4 !~ /ACMG_SF/ || ($4 ~ /ACMG_SF/ && $4 ~ /" + str(ge_panel_identifier) + "/))print;}'" + awk_command = "awk 'BEGIN{FS=\"\\t\"}{if($4 !~ /ACMG_SF/ || ($4 ~ /ACMG_SF/ && $4 ~ /" + str(ge_panel_identifier) + ":/))print;}'" if gwas_findings == 0 and secondary_findings == 1: check_subprocess(logger, f'bgzip -dc {target_bed_gz} | egrep -v "(\|tag\|)" >> {virtual_panels_tmp_bed}', debug) elif gwas_findings == 0 and secondary_findings == 0: