Skip to content

Commit

Permalink
fix CSQ patterns and biomarker retrieval
Browse files Browse the repository at this point in the history
  • Loading branch information
sigven committed Dec 19, 2023
1 parent b60ea96 commit a3796c6
Show file tree
Hide file tree
Showing 12 changed files with 171 additions and 79 deletions.
2 changes: 1 addition & 1 deletion pcgr/annoutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ def assign_cds_exon_intron_annotations(csq_record):
csq_record['LOSS_OF_FUNCTION'] = True

## Don't list LoF as True if consequence is assigned as missense
if re.search(pcgr_vars.CSQ_MISSENSE_PATTERN, csq_record['Consequence']) is not None:
if re.search(r'^missense_variant$', csq_record['Consequence']) is not None:
if csq_record['LOSS_OF_FUNCTION'] is True:
csq_record['LOSS_OF_FUNCTION'] = False

Expand Down
2 changes: 1 addition & 1 deletion pcgr/cpsr.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def get_args():
optional_classification.add_argument('--pop_gnomad',choices = ['afr','amr','eas','sas','asj','nfe','fin','global'], default='nfe', help='Population source in gnomAD (non-cancer subset) used for variant frequency assessment (ACMG classification), default: %(default)s')
optional_classification.add_argument('--maf_upper_threshold', type = float, default = 0.9, dest = 'maf_upper_threshold',help='Upper MAF limit (gnomAD global population frequency) for variants to be included in the report, default: %(default)s')
optional_classification.add_argument('--classify_all', action='store_true',dest='classify_all',help='Provide CPSR variant classifications (TIER 1-5) also for variants with existing ClinVar classifications in output TSV, default: %(default)s')
optional_classification.add_argument('--clinvar_report_noncancer', action='store_true', help='Report also ClinVar-classified variants attributed to phenotypes/conditions NOT related to cancer, default: %(default)s')
optional_classification.add_argument('--clinvar_report_noncancer', action='store_true', help='Report also ClinVar-classified variants attributed to phenotypes/conditions NOT directly related to tumor development, default: %(default)s')

optional_vcfanno.add_argument('--vcfanno_n_proc', default = 4, type = int, help="Number of vcfanno processes (option '-p' in vcfanno), default: %(default)s")

Expand Down
10 changes: 5 additions & 5 deletions pcgr/pcgr_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,9 @@
}

CSQ_MISSENSE_PATTERN = r"^missense_variant"
CSQ_CODING_PATTERN = r"^(stop_(lost|gained)|start_lost|frameshift_|missense_|splice_(donor|acceptor)|protein_altering|inframe_)"
CSQ_CODING_SILENT_PATTERN = r"^(stop_(lost|gained)|start_lost|frameshift_|missense_|splice_(donor|acceptor)|protein_altering|inframe_|synonymous|(start|stop)_retained)"
CSQ_NULL_PATTERN = r"^(stop_gained|frameshift_)"
CSQ_SPLICE_REGION_PATTERN = r"^(splice_|intron_variant)"
CSQ_SPLICE_DONOR_PATTERN = r"^(splice_region_variant|splice_donor_variant|splice_donor_region_variant|splice_donor_5th_base_variant)"
CSQ_CODING_PATTERN = r"(stop_(lost|gained)|start_lost|frameshift_|missense_|splice_(donor|acceptor)|protein_altering|inframe_)"
CSQ_CODING_SILENT_PATTERN = r"(stop_(lost|gained)|start_lost|frameshift_|missense_|splice_(donor|acceptor)|protein_altering|inframe_|synonymous|(start|stop)_retained)"
CSQ_NULL_PATTERN = r"(stop_gained|frameshift_)"
CSQ_SPLICE_REGION_PATTERN = r"(splice_|intron_variant)"
CSQ_SPLICE_DONOR_PATTERN = r"(splice_region_variant|splice_donor_variant|splice_donor_region_variant|splice_donor_5th_base_variant)"

1 change: 1 addition & 0 deletions pcgrr/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ export(dbsnp_germline_status)
export(deduplicate_eitems)
export(detect_vcf_sample_name)
export(df_string_replace)
export(expand_biomarker_items)
export(filter_eitems_by_site)
export(filter_read_support)
export(generate_annotation_link)
Expand Down
108 changes: 108 additions & 0 deletions pcgrr/R/biomarkers.R
Original file line number Diff line number Diff line change
Expand Up @@ -988,3 +988,111 @@ log_var_eitem_stats <- function(var_eitems = NULL,
}
}
}

#' Function that expands biomarker evidence items with variant annotations
#'
#' @param callset list object with 'variant' and 'biomarker_evidence' data
#' frames
#' @param variant_origin 'somatic' or 'germline'
#' @param target_genes data frame with target genes of interest
#'
#' @export
#'
expand_biomarker_items <- function(
callset = NULL,
variant_origin = "somatic",
target_genes = NULL){

if("variant" %in% names(callset) &
"biomarker_evidence" %in% names(callset)){

variant_properties <-
c("VAR_ID",
"GENOMIC_CHANGE",
"GENOME_VERSION",
"SAMPLE_ID",
"SYMBOL",
"ENTREZGENE",
"CONSEQUENCE",
"PROTEIN_CHANGE",
"MUTATION_HOTSPOT",
"CDS_CHANGE",
"LOSS_OF_FUNCTION",
"HGVSc",
"HGVSp",
"REFSEQ",
"OFFICIAL_GENENAME",
"PREDICTED_EFFECT",
"PROTEIN_DOMAIN",
"DBSNP",
"CLINVAR",
"COSMIC",
"VEP_ALL_CSQ")

if(variant_origin == "germline"){
variant_properties <- c(
variant_properties,
"CLINVAR_CLASSIFICATION",
"CPSR_CLASSIFICATION"
)
}
if(variant_origin == "somatic"){
variant_properties <- c(
variant_properties,
"DP_TUMOR",
"AF_TUMOR",
"DP_CONTROL",
"AF_CONTROL"
)
}

## check col existence callset[['variant]], variant_properties

for (type in c(pcgrr::evidence_types,
"all")) {
for (elevel in c("any", "A_B", "C_D_E")) {
if(NROW(callset[['biomarker_evidence']][[type]][[elevel]]) > 0){
callset[['biomarker_evidence']][[type]][[elevel]] <-
callset[['biomarker_evidence']][[type]][[elevel]] |>
dplyr::left_join(
dplyr::select(
callset[['variant']],
variant_properties),
by = c("VAR_ID")) |>
dplyr::arrange(
.data$EVIDENCE_LEVEL,
.data$PROTEIN_CHANGE,
dplyr::desc(
.data$RATING))

if(variant_origin == "germline"){
callset[['biomarker_evidence']][[type]][[elevel]] <-
callset[['biomarker_evidence']][[type]][[elevel]] |>
dplyr::filter(
(!is.na(CLINVAR_CLASSIFICATION) &
stringr::str_detect(
tolower(CLINVAR_CLASSIFICATION), "pathogenic")) |
(is.na(CLINVAR_CLASSIFICATION) &
!is.na(CPSR_CLASSIFICATION) &
stringr::str_detect(
tolower(CPSR_CLASSIFICATION), "pathogenic"))
)

if(NROW(callset[['biomarker_evidence']][[type]][[elevel]]) > 0 &
is.data.frame(target_genes) &
NROW(target_genes) > 0 &
"ENTREZGENE" %in% colnames(target_genes)){
callset[['biomarker_evidence']][[type]][[elevel]] <-
callset[['biomarker_evidence']][[type]][[elevel]] |>
dplyr::semi_join(target_genes, by = "ENTREZGENE")

}
}
}
}
}
}

return(callset)

}
74 changes: 17 additions & 57 deletions pcgrr/R/input_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,54 +54,10 @@ load_somatic_snv_indel <- function(
pcgrr::append_tfbs_annotation() |>
pcgrr::append_cancer_gene_evidence(ref_data = ref_data)

for (type in c("diagnostic", "prognostic",
"oncogenic", "functional",
"predictive","predisposition")) {
for (elevel in c("any", "A_B", "C_D_E")) {
if(NROW(callset[['biomarker_evidence']][[type]][[elevel]]) > 0){
callset[['biomarker_evidence']][[type]][[elevel]] <-
callset[['biomarker_evidence']][[type]][[elevel]] |>
dplyr::left_join(
dplyr::select(
callset[['variant']],
c("VAR_ID",
"GENOMIC_CHANGE",
"GENOME_VERSION",
"SYMBOL",
"ENTREZGENE",
"CONSEQUENCE",
"PROTEIN_CHANGE",
"MUTATION_HOTSPOT",
"CDS_CHANGE",
"LOSS_OF_FUNCTION",
"HGVSc",
"HGVSp",
"REFSEQ",
"OFFICIAL_GENENAME",
"PREDICTED_EFFECT",
"PROTEIN_DOMAIN",
"TARGETED_DRUGS",
"DBSNP",
"CLINVAR",
"COSMIC",
"VEP_ALL_CSQ",
"TCGA_FREQUENCY",
"DP_TUMOR",
"DP_CONTROL",
"AF_TUMOR",
"AF_CONTROL")
),
by = c("VAR_ID")
) |>
dplyr::arrange(
.data$EVIDENCE_LEVEL,
.data$PROTEIN_CHANGE,
dplyr::desc(
.data$RATING))
}
}
}

callset <-
pcgrr::expand_biomarker_items(
callset = callset,
variant_origin = "somatic")

return(callset)

Expand Down Expand Up @@ -199,7 +155,10 @@ load_dna_variants <- function(
results <- list()
results[['variant']] <- calls
results[['biomarker_evidence']] <- list()
results[['biomarker_evidence']][['all']] <- data.frame()
results[['biomarker_evidence']][['all']] <- list()
for (elevel in pcgrr::evidence_levels) {
results[['biomarker_evidence']][['all']][[elevel]] <- data.frame()
}

for (type in pcgrr::evidence_types) {
results[['biomarker_evidence']][[type]] <- list()
Expand Down Expand Up @@ -270,7 +229,7 @@ load_dna_variants <- function(
)

if(NROW(biomarker_set) > 0){
results[['biomarker_evidence']][['all']] <-
results[['biomarker_evidence']][['all']][['any']] <-
as.data.frame(
biomarker_set |>
dplyr::select(
Expand All @@ -279,9 +238,9 @@ load_dna_variants <- function(
) |>
dplyr::distinct() |>
tidyr::separate_rows(
.data$BIOMARKER_MATCH, sep=",") |>
"BIOMARKER_MATCH", sep=",") |>
tidyr::separate(
.data$BIOMARKER_MATCH,
"BIOMARKER_MATCH",
into = c("BIOMARKER_SOURCE",
"VARIANT_ID",
"EVIDENCE_ITEMS",
Expand Down Expand Up @@ -311,9 +270,9 @@ load_dna_variants <- function(
TRUE ~ as.character('other')
)) |>
tidyr::separate_rows(
.data$EVIDENCE_ITEMS, sep="&") |>
"EVIDENCE_ITEMS", sep="&") |>
tidyr::separate(
.data$EVIDENCE_ITEMS,
"EVIDENCE_ITEMS",
into = c("EVIDENCE_ID",
"PRIMARY_SITE",
"CLINICAL_SIGNIFICANCE",
Expand Down Expand Up @@ -347,16 +306,17 @@ load_dna_variants <- function(
"MOLECULAR_PROFILE_TYPE",
"RATING",
"EVIDENCE_DIRECTION")
), by = c("EVIDENCE_ID")
), by = c("EVIDENCE_ID"),
relationship = "many-to-many"
) |>
dplyr::distinct()
)

if(NROW(results[['biomarker_evidence']][['all']]) > 0){
if(NROW(results[['biomarker_evidence']][['all']][['any']]) > 0){

for (type in pcgrr::evidence_types) {
results[['biomarker_evidence']][[type]][["any"]] <-
results[['biomarker_evidence']][['all']] |>
results[['biomarker_evidence']][['all']][['any']] |>
dplyr::filter(
.data$VARIANT_ORIGIN == variant_origin &
.data$EVIDENCE_TYPE == stringr::str_to_title(type))
Expand Down
5 changes: 5 additions & 0 deletions pcgrr/R/report.R
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,11 @@ init_germline_content <- function(){
rep[["clin_eitem"]][[evidence_type]][[level]] <-
data.frame()
}
rep[['clin_eitem']][['all']] <- list()
for(level in pcgrr::evidence_levels){
rep[["clin_eitem"]][['all']][[level]] <-
data.frame()
}
}

for (cl in c("v_stat",
Expand Down
11 changes: 5 additions & 6 deletions pcgrr/data-raw/data-raw.R
Original file line number Diff line number Diff line change
Expand Up @@ -410,14 +410,13 @@ cancer_phenotypes_regex <-
"|neurofibro|keratoacan|nevus|brca|polyposis|myelodysplastic|cowden",
"|gardner|noonan|fanconi|carney|bullosa|schwanno|li-fraumeni|xeroderma",
"|leiomyom|muir-|nijmegen|neoplasia|trichoepithelioma|brooke|turcot",
"|exostoses|lynch|drash|wilm|perlman|fibrofolliculomas|hippel|hamartom",
"|bloom|werner|peutz|legius|tuberous|exostosis|angiomyolipoma",
"|lymphoproliferative|stat3|teratoma|thrombocytop|tp63|wiskott|weaver",
"|exostos|lynch|drash|wilm|perlman|fibrofolliculomas|hippel|hamartom",
"|bloom|werner|peutz|tuberous|angiomyolipoma",
"|lymphoproliferative|stat3|teratoma|thrombocytop|tp63|weaver",
"|pheochromo|gorlin|telangiectasia|hemangiom|osteochondro|response",
"|polg-related|ras-associated|dyskeratosis",
"|waardenburg|beckwidth|birt-hogg|costello|diamond|cardio-facio|frasier",
"|hirschsprung|hydrocephalus|hyperparathyroidism|immunodeficiency",
"|infantile myofibromatosis|leopard|proteus|rothmund|russel)")
"|waardenburg|beckwidth|birt-hogg|diamond|frasier",
"|infantile myofibromatosis|proteus|rothmund|russel)")
usethis::use_data(cancer_phenotypes_regex, overwrite = T)


Expand Down
Binary file modified pcgrr/data/cancer_phenotypes_regex.rda
Binary file not shown.
4 changes: 2 additions & 2 deletions pcgrr/man/assign_germline_popfreq_status.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 23 additions & 0 deletions pcgrr/man/expand_biomarker_items.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 3 additions & 7 deletions scripts/cpsr.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,7 @@ if(!is.null(cps_report)){
cpsr::write_cpsr_output(
cps_report,
output_format = 'xlsx')
saveRDS(cps_report, file=file.path(
cps_report$settings$output_dir,
paste0(cps_report$settings$sample_id,
".cpsr.grch38.rds")))
#cpsr::write_cpsr_output(
# cps_report,
# output_format = 'html')
cpsr::write_cpsr_output(
cps_report,
output_format = 'html')
}

0 comments on commit a3796c6

Please sign in to comment.