Skip to content

Commit

Permalink
Merge pull request #245 from sigven/chr_fix_patch
Browse files Browse the repository at this point in the history
Ensure proper exclusion of variants on non-nuclear chromosomes/scaffolds in pcgrr
  • Loading branch information
sigven authored Jul 7, 2024
2 parents c2d47f0 + e65f858 commit 7ba4b76
Show file tree
Hide file tree
Showing 13 changed files with 88 additions and 50 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ PCGR originates from the [Norwegian Cancer Genomics Consortium (NCGC)](http://ca

### Top News

- *July 2024*: **2.0.1 release**
- patch with bug fix for mitochondrial input variants ([pr245](https://github.com/sigven/pcgr/pull/245))
- [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html)

- *June 2024*: **2.0.0 release**
- Details in [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html)
- Massive reference data bundle upgrade, new report layout, oncogenicity classification++
Expand Down
10 changes: 5 additions & 5 deletions pcgr/annoutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ def threeToOneAA(aa_change):
return aa_change


def assign_cds_exon_intron_annotations(csq_record):
def assign_cds_exon_intron_annotations(csq_record, logger):


csq_record['CODING_STATUS'] = 'noncoding'
Expand Down Expand Up @@ -274,13 +274,13 @@ def assign_cds_exon_intron_annotations(csq_record):
cds_pos = cds_pos_full.split('-')[0]
if cds_pos.isdigit():
cds_pos = int(cds_pos)
else:
print('BALLE1 - ' + str(cds_pos_full) + ' - ' + str(cds_pos))
#else:
# logger.warning(f'Could not determine variant CDS position from VEP annotation - ({csq_record["CDS_position"]})')
else:
if cds_pos_full.isdigit():
cds_pos = int(cds_pos_full)
else:
print('BALLE2 - ' + str(cds_pos_full) + ' - ' + str(cds_pos))
#else:
# logger.warning(f'Could not determine variant CDS position from VEP annotation - ({csq_record["CDS_position"]})')

if int(cds_pos) > -1 and int(cds_pos) <= int(cds_length):
csq_record['CDS_RELATIVE_POSITION'] = float(cds_pos/cds_length)
Expand Down
2 changes: 1 addition & 1 deletion pcgr/vep.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def get_csq_record_annotations(csq_fields, varkey, logger, vep_csq_fields_map, t
csq_record['SYMBOL'] = transcript_xref_map[ensembl_transcript_id]['SYMBOL']

# Assign coding status, protein change, coding sequence change, last exon/intron status etc as VCF info tags
csq_record = assign_cds_exon_intron_annotations(csq_record)
csq_record = assign_cds_exon_intron_annotations(csq_record, logger)

return(csq_record)

Expand Down
2 changes: 1 addition & 1 deletion pcgrr/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ export(dbsnp_germline_status)
export(deduplicate_eitems)
export(detect_vcf_sample_name)
export(df_string_replace)
export(exclude_non_chrom_variants)
export(expand_biomarker_items)
export(export_quarto_evars)
export(filter_eitems_by_site)
Expand All @@ -42,7 +43,6 @@ export(get_clin_assocs_cna)
export(get_dt_tables)
export(get_excel_sheets)
export(get_genome_obj)
export(get_ordinary_chromosomes)
export(get_prevalent_site_signatures)
export(get_valid_chromosomes)
export(het_af_germline_status)
Expand Down
6 changes: 4 additions & 2 deletions pcgrr/R/input_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ load_somatic_cna <- function(
.data$ACTIONABILITY_TIER,
dplyr::desc(.data$TISSUE_ASSOC_RANK),
dplyr::desc(.data$GLOBAL_ASSOC_RANK)) |>
pcgrr::order_variants(pos_var = 'SEGMENT_START')
pcgrr::order_variants(pos_var = 'SEGMENT_START') |>
pcgrr::exclude_non_chrom_variants()

pcgrr::log4r_info(
"Generating data frame with hyperlinked variant/gene annotations")
Expand Down Expand Up @@ -193,7 +194,8 @@ load_somatic_snv_indel <- function(
)
)) |>
dplyr::select(-c("tmp_HGVSc","ENST")) |>
pcgrr::order_variants(pos_var = 'POS')
pcgrr::order_variants(pos_var = 'POS') |>
pcgrr::exclude_non_chrom_variants()


## Tumor-only input
Expand Down
14 changes: 5 additions & 9 deletions pcgrr/R/mutational_signatures.R
Original file line number Diff line number Diff line change
Expand Up @@ -675,9 +675,7 @@ generate_report_data_rainfall <- function(variant_set,
build = NULL) {

pcg_report_rainfall <- pcgrr::init_rainfall_content()
if (NROW(variant_set) == 0) {
return(pcg_report_rainfall)
}


invisible(assertthat::assert_that
(assertthat::is.flag(autosomes),
Expand All @@ -695,6 +693,10 @@ generate_report_data_rainfall <- function(variant_set,
"') not allowed, available reference build values are:",
"'grch37' or 'grch38'")))

if (NROW(variant_set) == 0) {
return(pcg_report_rainfall)
}

pcgrr::log4r_info("------")
pcgrr::log4r_info(paste0("Calculating data for rainfall plot"))

Expand Down Expand Up @@ -745,12 +747,6 @@ generate_report_data_rainfall <- function(variant_set,
stringr::str_replace(.data$MUTATION_TYPE,
":[A-Z]>[A-Z]$", ""),
as.character(.data$MUTATION_TYPE))) |>
# dplyr::mutate(
# MUTATION_TYPE =
# dplyr::if_else(stringr::str_detect(.data$MUTATION_TYPE, "^A>"),
# stringr::str_replace(.data$MUTATION_TYPE,
# "^[A-Z]>[A-Z]:", ""),
# as.character(.data$MUTATION_TYPE))) |>
pcgrr::sort_chromosomal_segments()

bsg <- get_genome_obj(build)
Expand Down
47 changes: 30 additions & 17 deletions pcgrr/R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ get_valid_chromosomes <- function(vcf_data_df,
#' @return vcf_df data frame with mutations from nuclear chromosomes only
#'
#' @export
get_ordinary_chromosomes <- function(vcf_df, chrom_var = "CHROM") {
exclude_non_chrom_variants <- function(vcf_df, chrom_var = "CHROM") {
invisible(assertthat::assert_that(
is.data.frame(vcf_df),
msg = "Argument 'vcf_df' must be of type data.frame"))
Expand All @@ -268,15 +268,19 @@ get_ordinary_chromosomes <- function(vcf_df, chrom_var = "CHROM") {
dplyr::mutate(
!!rlang::sym(chrom_var) := as.character(!!rlang::sym(chrom_var)))
n_before_exclusion <- nrow(vcf_df)
nuc_chromosomes_df <- data.frame(c(as.character(seq(1:22)), "X", "Y"),
stringsAsFactors = F)
nuc_chromosomes_df <- data.frame(
c(as.character(seq(1:22)), "X", "Y"),
stringsAsFactors = F)
colnames(nuc_chromosomes_df) <- c(chrom_var)
vcf_df <- dplyr::semi_join(vcf_df, nuc_chromosomes_df, by = chrom_var)
vcf_df <- dplyr::semi_join(
vcf_df, nuc_chromosomes_df, by = chrom_var)
n_after_exclusion <- nrow(vcf_df)
pcgrr::log4r_info(
paste0("Excluding ",
n_before_exclusion - n_after_exclusion,
" variants from non-nuclear chromosomes/scaffolds"))
if(n_before_exclusion - n_after_exclusion > 0){
pcgrr::log4r_info(
paste0("Excluding n = ",
n_before_exclusion - n_after_exclusion,
" variant(s) from non-nuclear chromosomes/scaffolds"))
}
return(vcf_df)

}
Expand All @@ -292,18 +296,27 @@ get_ordinary_chromosomes <- function(vcf_df, chrom_var = "CHROM") {
#' @export
order_variants <- function(
vcf_df, chrom_var = "CHROM", pos_var = "POS") {

stopifnot(is.data.frame(vcf_df) &
chrom_var %in% colnames(vcf_df) &
pos_var %in% colnames(vcf_df))
if (nrow(vcf_df) == 0)return(vcf_df)
vcf_df |>
dplyr::mutate(!!rlang::sym(chrom_var) :=
factor(!!rlang::sym(chrom_var),
ordered = T,
levels = c(as.character(seq(1:22)), "X", "Y"))) |>
dplyr::arrange(!!rlang::sym(chrom_var), !!rlang::sym(pos_var)) |>
dplyr::mutate(!!rlang::sym(chrom_var) :=
as.character(!!rlang::sym(chrom_var)))
if (nrow(vcf_df) == 0){
return(vcf_df)
}
vcf_df <- vcf_df |>
dplyr::mutate(
!!rlang::sym(chrom_var) :=
factor(!!rlang::sym(chrom_var),
ordered = T,
levels = c(as.character(seq(1:22)), "X", "Y"))) |>
dplyr::arrange(
!!rlang::sym(chrom_var),
!!rlang::sym(pos_var)) |>
dplyr::mutate(
!!rlang::sym(chrom_var) :=
as.character(!!rlang::sym(chrom_var)))

return(vcf_df)
}


Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions pcgrr/pkgdown/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ PCGR originates from the [Norwegian Cancer Genomics Consortium (NCGC)](http://ca

### Top News

- *July 2024*: **2.0.1 release**
- patch with bug fix for mitochondrial input variants ([pr245](https://github.com/sigven/pcgr/pull/245))
- [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html)

- *June 2024*: **2.0.0 release**
- Details in [CHANGELOG](http://sigven.github.io/pcgr/articles/CHANGELOG.html)
- Massive reference data bundle upgrade, new report layout, oncogenicity classification++
Expand Down
5 changes: 5 additions & 0 deletions pcgrr/vignettes/CHANGELOG.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@ sigven <- user("sigven")
pdiakumis <- user("pdiakumis")
```

## v2.0.1

- Date: **2024-07-07**
- Fixed bug for chrM variants in input - not properly annotated by VEP, and not correctly processed in `pcgrr`. Any mitochondrial variants found in input VCF are now removed during VCF pre-processing.

## v2.0.0

- Date: **2024-06-26**
Expand Down
8 changes: 4 additions & 4 deletions pcgrr/vignettes/annotation_resources.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,11 @@ __Genomic biomarkers__

Genomic biomarkers included in PCGR are limited to the following:

* Evidence items for specific markers in CIViC must be *accepted* (*submitted* evidence items are not considered)
* Markers reported at the gene level (e.g. __BRAF mutation__, __BRCA1 oncogenic mutation__)
* Markers reported at the variant level (e.g. __BRAF p.V600E__)
* Evidence items for specific markers in CIViC must be *accepted* (*submitted* evidence items are not considered or shown)
* Markers reported at the exact variant level (e.g. __BRAF p.V600E__, __MET c.3028+1G>T, __g.7:140753336A>T__)
* Markers reported at the codon level (e.g. __KRAS p.G12__)
* Markers reported at the exon/gene level (e.g. __KIT exon 11 mutation__, __BRCA1/2 oncogenic mutations__)
* Markers reported at the exon level (e.g. __KIT exon 11 mutation__, __EGFR exon 19 deletion__)
* Markers reported at the gene level (e.g. __BRAF mutation__, __TP53 loss-of-function mutation__, __BRCA1 oncogenic mutation__)
* Within the [Cancer bioMarkers database (CGI)](https://www.cancergenomeinterpreter.org/biomarkers), only markers collected from FDA/NCCN guidelines, scientific literature, and clinical trials are included (markers collected from conference abstracts etc. are not included)
* Copy number gains/losses

Expand Down
9 changes: 6 additions & 3 deletions scripts/cpsr_validate_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,11 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, custom_bed, refdata_assembly_dir
cmd_vcf1 = f'bcftools view {input_vcf} | bgzip -cf > {temp_files["vcf_2"]} && tabix -p vcf {temp_files["vcf_2"]} && ' + \
f'bcftools sort --temp-dir {output_dir} -Oz {temp_files["vcf_2"]} > {temp_files["vcf_3"]} 2> {bcftools_simplify_log}' + \
f' && tabix -p vcf {temp_files["vcf_3"]}'
logger.info('Extracting variants on autosomal/sex/mito chromosomes only (1-22,X,Y, M/MT) with bcftools')
# Keep only autosomal/sex/mito chrom (handle hg38 and hg19), sub chr prefix
chrom_to_keep = [str(x) for x in [*range(1,23), 'X', 'Y', 'M', 'MT']]
logger.info('Extracting variants on autosomal/sex/mito chromosomes only (1-22,X,Y) with bcftools')
# Keep only autosomal/sex chromosomes, sub chr prefix
# Note: M/MT variants are skipped - requires additional cache/handling from VEP,
# see e.g. https://github.com/Ensembl/ensembl-vep/issues/464
chrom_to_keep = [str(x) for x in [*range(1,23), 'X', 'Y',]]
chrom_to_keep = ','.join([*['chr' + chrom for chrom in chrom_to_keep], *[chrom for chrom in chrom_to_keep]])
cmd_vcf2 = f'bcftools view --regions {chrom_to_keep} {temp_files["vcf_3"]} | sed \'s/^chr//\' > {temp_files["vcf_1"]}'

Expand All @@ -198,6 +200,7 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, custom_bed, refdata_assembly_dir
command_decompose = f'vt decompose -s {temp_files["vcf_1"]} > {temp_files["vcf_4"]} 2> {vt_decompose_log}'
check_subprocess(logger, command_decompose, debug)
else:
logger.info('All sites seem to be decomposed - skipping decomposition of multiallelic sites')
command_copy = f'cp {temp_files["vcf_1"]} {temp_files["vcf_4"]}'
check_subprocess(logger, command_copy, debug)

Expand Down
21 changes: 16 additions & 5 deletions scripts/pcgr_validate_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,13 +192,15 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, output_dir, sample_id, keep_unco
variant_id = f"{rec.CHROM}:{POS}_{rec.REF}->{alt}"
multiallelic_list.append(variant_id)

logger.info('Extracting variants on autosomal/sex/mito chromosomes only (1-22,X,Y, M/MT) with bcftools')
logger.info('Extracting variants on autosomal/sex chromosomes only (1-22,X,Y) with bcftools')
# bgzip + tabix required for sorting
cmd_vcf1 = f'bcftools view {input_vcf} | bgzip -cf > {temp_files["vcf_2"]} && tabix -p vcf {temp_files["vcf_2"]} && ' + \
f'bcftools sort --temp-dir {output_dir} -Oz {temp_files["vcf_2"]} > {temp_files["vcf_3"]} 2> {bcftools_simplify_log} && ' + \
f'tabix -p vcf {temp_files["vcf_3"]}'
# Keep only autosomal/sex/mito chrom (handle hg38 and hg19), remove FORMAT metadata lines, keep cols 1-8, sub chr prefix
chrom_to_keep = [str(x) for x in [*range(1,23), 'X', 'Y', 'M', 'MT']]
# Keep only autosomal/sex chrom, remove FORMAT metadata lines, keep cols 1-8, sub chr prefix
# Note: any M/MT variants listed in input are skipped - requires additional cache/handling from VEP,
# see e.g. https://github.com/Ensembl/ensembl-vep/issues/464
chrom_to_keep = [str(x) for x in [*range(1,23), 'X', 'Y']]
chrom_to_keep = ','.join([*['chr' + chrom for chrom in chrom_to_keep], *[chrom for chrom in chrom_to_keep]])
cmd_vcf2 = f'bcftools view --regions {chrom_to_keep} {temp_files["vcf_3"]} | egrep -v \'^##FORMAT=\' ' + \
f'| cut -f1-8 | sed \'s/^chr//\' > {temp_files["vcf_1"]}'
Expand All @@ -215,7 +217,7 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, output_dir, sample_id, keep_unco
command_decompose = f'vt decompose -s {temp_files["vcf_1"]} > {validated_vcf} 2> {vt_decompose_log}'
check_subprocess(logger, command_decompose, debug)
else:
logger.info('All sites seem to be decomposed - skipping decomposition!')
logger.info('All sites seem to be decomposed - skipping decomposition of multiallelic sites')
check_subprocess(logger, f'cp {temp_files["vcf_1"]} {validated_vcf}', debug)

# need to keep uncompressed copy for vcf2maf.pl if selected
Expand All @@ -230,8 +232,17 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, output_dir, sample_id, keep_unco
i = i + 1
if len(vcf.seqnames) == 0 or i == 0:
logger.info('')
logger.info("Input VCF contains NO valid variants after VCF cleaning - quitting workflow")
logger.info("Input VCF contains NO valid variants on autosomal/sex chromosomes after VCF cleaning - quitting workflow")
logger.info('')

if not debug:
remove_file(temp_files["vcf_1"])
remove_file(temp_files["vcf_2"])
remove_file(temp_files["vcf_3"])
remove_file(temp_files["vcf_2"] + str('.tbi'))
remove_file(temp_files["vcf_3"] + str('.tbi'))
remove_file(bcftools_simplify_log)
remove_file(vt_decompose_log)
exit(1)

if not debug:
Expand Down

0 comments on commit 7ba4b76

Please sign in to comment.