diff --git a/.github/.dockstore.yml b/.github/.dockstore.yml index 035d1cc84..9ac974da2 100644 --- a/.github/.dockstore.yml +++ b/.github/.dockstore.yml @@ -141,6 +141,7 @@ workflows: filters: branches: - main + - kj_sv_cleanvcf tags: - /.*/ diff --git a/dockerfiles/sv-pipeline/Dockerfile b/dockerfiles/sv-pipeline/Dockerfile index d4f9aa687..5d9c759c7 100644 --- a/dockerfiles/sv-pipeline/Dockerfile +++ b/dockerfiles/sv-pipeline/Dockerfile @@ -70,13 +70,9 @@ RUN plink2 || true # -Compile StitchFragmentedCNVs Java program # -Compile StitchFragmentedCNVs unit tests # -Compile VCFParser unit tests -# -Compile and test CleanVCFPart1 Java program -# -Compile and test CleanVCFPart1 unit tests ENV STITCH_JAR="/opt/sv-pipeline/java/build/StitchFragmentedCNVs.jar" ARG STITCH_UNIT_TEST_JAR="/opt/sv-pipeline/java/build/StitchFragmentedCNVsUnitTest.jar" ARG VCF_PARSER_UNIT_TEST_JAR="/opt/sv-pipeline/java/build/VCFParserUnitTest.jar" -ENV CLEAN_VCF_PART_1_JAR="/opt/sv-pipeline/java/build/CleanVCFPart1.jar" -ARG CLEAN_VCF_PART_1_UNIT_TEST_JAR="/opt/sv-pipeline/java/build/CleanVCFPart1UnitTest.jar" ARG BUILD_DEPS="openjdk-8-jdk" ARG DEBIAN_FRONTEND=noninteractive RUN export APT_TRANSIENT_PACKAGES=$(diff_of_lists.sh "$BUILD_DEPS" $APT_REQUIRED_PACKAGES) && \ @@ -97,14 +93,6 @@ RUN export APT_TRANSIENT_PACKAGES=$(diff_of_lists.sh "$BUILD_DEPS" $APT_REQUIRED echo "Running VCFParserUnitTest..." && \ java -enableassertions -jar $VCF_PARSER_UNIT_TEST_JAR && \ rm -rf build/classes/* $VCF_PARSER_UNIT_TEST_JAR && \ - javac -d build/classes org/broadinstitute/svpipeline/CleanVCFPart1.java org/broadinstitute/svpipeline/VCFParser.java && \ - jar cfe build/CleanVCFPart1.jar "org.broadinstitute.svpipeline.CleanVCFPart1" -C build/classes . && \ - rm -rf build/classes/* && \ - javac -d build/classes org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java org/broadinstitute/svpipeline/CleanVCFPart1.java org/broadinstitute/svpipeline/VCFParser.java && \ - jar cfe build/CleanVCFPart1UnitTest.jar "org.broadinstitute.svpipeline.CleanVCFPart1UnitTest" -C build/classes . && \ - echo "Running CleanVCFPart1UnitTest..." 
&& \ - java -enableassertions -jar $CLEAN_VCF_PART_1_UNIT_TEST_JAR && \ - rm -rf build/classes/* $CLEAN_VCF_PART_1_UNIT_TEST_JAR && \ apt-get -qqy remove --purge $APT_TRANSIENT_PACKAGES && \ apt-get -qqy autoremove --purge && \ apt-get -qqy clean && \ diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index b68084b91..0b92b966f 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -12,8 +12,8 @@ "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2024-10-25-v0.29-beta-5ea22a52", "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-10-25-v0.29-beta-5ea22a52", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2024-10-25-v0.29-beta-5ea22a52", - "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-11-15-v1.0-488d7cb0", - "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-11-15-v1.0-488d7cb0", + "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-170ec7c38b004f02639b5f82270113b74162fe79", + "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-170ec7c38b004f02639b5f82270113b74162fe79", "wham_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/wham:2024-10-25-v0.29-beta-5ea22a52", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -28,5 +28,5 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2024-10-25-v0.29-beta-5ea22a52", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", - "denovo": "us.gcr.io/broad-dsde-methods/gatk-sv/denovo:2024-11-15-v1.0-488d7cb0" + "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-170ec7c38b004f02639b5f82270113b74162fe79" } \ No newline at end of file diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py deleted file mode 100644 index b7da153cb..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Remove CNVs that are improperly genotyped by depth because they are nested -within a real CNV -""" - -import logging -import pybedtools -import pysam -import sys -import json - -from collections import defaultdict - -SVTYPE = "SVTYPE" -BLANK_SAMPLES = "blanksample" - - -class SVType: - DUP = "DUP" - DEL = "DEL" - - -class VariantFormatTypes: - # Predicted copy state - RD_CN = "RD_CN" - # Classes of evidence supporting final genotype - EV = "EV" - - -class VCFReviser: - def __init__(self): - self.rd_cn = {} - self.sample_indices_dict = {} - self.sample_list = [] - - def _update_rd_cn(self, variant, sample_indices): - self.rd_cn[variant.id] = {s: variant.samples[s][VariantFormatTypes.RD_CN] for s in sample_indices} - - @staticmethod - def get_wider(f): - # f[1] : first interval start - # f[2] : first interval end - # f[7] : second interval start - # f[8] : second interval end - if int(f[2]) - int(f[1]) >= int(f[8]) - int(f[7]): - return f[0:6], f[6:12] - else: - return f[6:12], f[0:6] - - @staticmethod - def get_coverage(wider, narrower): - n_start = int(narrower[1]) - n_stop = int(narrower[2]) - w_start = 
int(wider[1]) - w_stop = int(wider[2]) - - coverage = 0 - if w_start <= n_stop and n_start <= w_stop: - intersection_size = min(n_stop, w_stop) - max(n_start, w_start) - coverage = intersection_size / (n_stop - n_start) - return coverage - - def get_geno_normal_revise(self, vcf_file, bed_file): - overlap_test_text = defaultdict(dict) - with pysam.VariantFile(vcf_file, "r") as f: - header = f.header - i = -1 - for sample in header.samples: - i += 1 - self.sample_indices_dict[sample] = i - self.sample_list.append(sample) - - logging.info("Filtering intersect results") - bed = pybedtools.BedTool(bed_file) - for interval in bed.intervals: - wider, narrower = self.get_wider(interval.fields) - # wider and narrower are lists/tuples with the following fields: - # [0] : contig - # [1] : start position - # [2] : end position - # [3] : variant ID - # [4] : SV type - # [5] : comma-delimited sample lists, or BLANK_SAMPLES if none - if wider[5] == BLANK_SAMPLES: - continue - - coverage = self.get_coverage(wider, narrower) - if coverage >= 0.5: - wider_samples = set(wider[5].split(",")) - narrower_samples = set(narrower[5].split(",")) - non_common_samples = [self.sample_indices_dict[s] for s in wider_samples - narrower_samples] - for x in non_common_samples: - vid = narrower[3] - overlap_test_text[vid][x] = (wider[3], wider[4]) - - # Determine for which vid/sample pairs we need RD_CN - # Substantially reduces memory - logging.info('Getting revised variant IDs') - revise_vids = defaultdict(set) - for var_id, samples_dict in overlap_test_text.items(): - for sample_index, v in samples_dict.items(): - # v[0] : variant ID - # v[1] : SV type - if v[1] == SVType.DUP or v[1] == SVType.DEL: - revise_vids[var_id].add(sample_index) - revise_vids[v[0]].add(sample_index) - - logging.info('Getting RD_CN/EV') - for variant in f: - if variant.id in revise_vids: - sample_indices = revise_vids[variant.id] - self._update_rd_cn(variant, sample_indices) - - logging.info('Generating geno_normal_revise_dict') - geno_normal_revise_dict = {} - for var_id, samples_dict in overlap_test_text.items(): - for sample_index, v in samples_dict.items(): - # v[0] : variant ID - # v[1] : SV type - new_val = None - if sample_index not in revise_vids[v[0]]: - sys.stderr.write("{} {}\n".format(sample_index, v[0])) - if v[1] == SVType.DUP and \ - self.rd_cn[var_id][sample_index] == 2 and \ - self.rd_cn[v[0]][sample_index] == 3: - new_val = 1 - elif v[1] == SVType.DEL and \ - self.rd_cn[var_id][sample_index] == 2 \ - and self.rd_cn[v[0]][sample_index] == 1: - new_val = 3 - - if new_val: - if var_id not in geno_normal_revise_dict: - geno_normal_revise_dict[var_id] = {} - sample_id = self.sample_list[sample_index] - geno_normal_revise_dict[var_id][sample_id] = new_val - - return geno_normal_revise_dict - - -def main(args): - logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) - logging.info('Starting script') - reviser = VCFReviser() - filtered_vcf = args[1] - intersected_bed = args[2] - geno_normal_revise_dict = reviser.get_geno_normal_revise(filtered_vcf, intersected_bed) - logging.info('Dumping dictionary') - sys.stdout.write(json.dumps(geno_normal_revise_dict)) - logging.info('Done') - - -if __name__ == '__main__': - main(sys.argv) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py deleted file mode 100644 index e63b890cd..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py 
+++ /dev/null @@ -1,82 +0,0 @@ -""" -Remove CNVs that are improperly genotyped by depth because they are nested -within a real CNV -""" - -import os -import logging -import pysam -import sys -from pathlib import Path -import json -import gzip - -SVTYPE = "SVTYPE" -BLANK_SAMPLES = "B" - - -class SVType: - DUP = "DUP" - DEL = "DEL" - - -class VariantFormatTypes: - # Predicted copy state - RD_CN = "RD_CN" - # Classes of evidence supporting final genotype - EV = "EV" - - -def modify_variants(dict_file_gz, vcf, multi_cnvs): - logging.info('Loading dictionary') - with gzip.open(dict_file_gz, 'rt') as f: - geno_normal_revise_dict = json.load(f) - - logging.info('Filtering variants') - with pysam.VariantFile(vcf, "r") as f_in: - header = f_in.header - sys.stdout.write(str(header)) - with open(multi_cnvs, "w") as multi_cnvs_f: - variants = f_in.fetch() - for variant in variants: - if variant.id in geno_normal_revise_dict: - for sample_id in geno_normal_revise_dict[variant.id]: - o = variant.samples[sample_id] - o.update({"GT": (0, 1)}) - o.update({"GQ": o["RD_GQ"]}) - - if variant.stop - variant.start >= 1000: - if variant.info[SVTYPE] in [SVType.DEL, SVType.DUP]: - is_del = variant.info[SVTYPE] == SVType.DEL - for k, v in variant.samples.items(): - rd_cn = v[VariantFormatTypes.RD_CN] - if rd_cn is None: - continue - if (is_del and rd_cn > 3) or \ - (not is_del and (rd_cn < 1 or rd_cn > 4)): - multi_cnvs_f.write(variant.id + "\n") - break - - sys.stdout.write(str(variant)) - - -def ensure_file(filename): - filename = os.path.join(".", filename) - filename = Path(filename) - if filename.exists(): - os.remove(filename) - return filename.name - - -def main(args): - logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) - logging.info('Starting script') - multi_cnvs_filename = ensure_file("multi.cnvs.txt") - dict_file_gz = args[1] - vcf_file = args[2] - modify_variants(dict_file_gz, vcf_file, multi_cnvs_filename) - logging.info('Done') - - -if __name__ == '__main__': - main(sys.argv) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part2.sh b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part2.sh deleted file mode 100755 index 5d4827fa5..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part2.sh +++ /dev/null @@ -1,171 +0,0 @@ -#!/bin/bash -# -# clean_vcf_part2.sh -# - -set -euo pipefail - -##bgzipped combined vcf from clean vcf part1b.sh## -normal_revise_vcf_gz=$1 -##whitelist of split ids for parallelization## -whitelist=$2 -##list of multiallelic CNVs from 1b## -multi_cnv=$3 -##output filename## -outputfile=$4 - -export LC_ALL=C - -##subset vcf to whitelist samples## -bcftools view $normal_revise_vcf_gz -S $whitelist --no-update \ - | gzip \ - > subset.vcf.gz - - -##create new bed with updated genotypes### -zcat subset.vcf.gz \ - | awk 'BEGIN{FS=OFS="\t"}{if (substr($1,1,1)=="#" || $5=="" || $5=="") print}' \ - | svtk vcf2bed stdin stdout \ - | sed 1d \ - | sort -k4,4 \ - | gzip \ - > int.afternormalfix.bed.gz - -##Find overlapping depth based variants and reassign depth based; note this is necessary because depth call >5kb genotypes are 100% driven by depth ## -##generate a sample list based on depth for depth overlap check below. 
Necessary because genotype is capped at 1/1 and by direction (i.e no dels in dups)## -##grab all samples per variant with a non normal copy state## -zcat subset.vcf.gz \ - | awk 'BEGIN{FS=OFS="\t"}{if (substr($1,1,1)=="#") print; else if ($5=="" || $5=="") {$1=$3; print}}' \ - | vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \ - | awk 'BEGIN{FS=OFS="\t"} NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1,header[j],$j }' \ - | sort -k1,1 \ - | awk 'BEGIN{FS=OFS="\t"} \ - {if (($1~"DEL" && $3<2 && $3!=".") || ($1~"DUP" && $3>2 && $3!=".")) a[$1]=a[$1]?a[$1]","$2:$2} \ - END{for (i in a) print i,a[i]}' \ - | sort -k1,1 \ - > afternormal.combined.RD_CN.list.txt - -##get a list of samples for actual variants not just those with aberrant copy states## -zcat int.afternormalfix.bed.gz \ - | awk 'BEGIN{FS=OFS="\t"} $6!="" {split($6,samples,","); for (i in samples) print $4"@"samples[i]}' \ - | sort -k1,1 \ - > fullvar.afternormal.list.txt - -##create bed with anything that has abnormal copy state## -##do not compress all.bed because of bedtools bug: https://github.com/arq5x/bedtools2/issues/643## -zcat int.afternormalfix.bed.gz \ - | cut -f1-5 \ - | awk 'BEGIN{FS=OFS="\t"}{if($3-$2>=5000)print $0}' \ - | join -1 4 -2 1 -t ' ' - afternormal.combined.RD_CN.list.txt \ - | awk 'BEGIN {FS=OFS="\t"} \ - $6!=""{split($6,samples,","); \ - for (i in samples) {s = samples[i]; print $2"_"s,$3,$4,$1,$5,s,$1"@"s}}' \ - | sort -k7,7 \ - | join -t ' ' -a 1 -1 7 -2 1 -e "NA" -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.1 - fullvar.afternormal.list.txt \ - > all.bed - -##intersect variants and always set larger to left## -bedtools intersect -wa -wb -a all.bed -b all.bed \ - | awk 'BEGIN{FS=OFS="\t"} \ - {if ($8=="NA") { if ($16!="NA") print $9,$10,$11,$12,$13,$14,$15,$1,$2,$3,$4,$5,$6,$7; } \ - else if ($4!=$12) { - if ($3-$2>=$11-$10) print $1,$2,$3,$4,$5,$6,$7,$9,$10,$11,$12,$13,$14,$15; \ - else if ($16!="NA") print $9,$10,$11,$12,$13,$14,$15,$1,$2,$3,$4,$5,$6,$7; \ - else print $1,$2,$3,$4,$5,$6,$7,$9,$10,$11,$12,$13,$14,$15;}}' \ - | sort -k7,7 \ - | uniq \ - | gzip \ - > bed.overlap.txt.gz - -zcat subset.vcf.gz \ - | grep '^#' \ - > vcf_header -zcat bed.overlap.txt.gz \ - | awk '{print $4; print $11}' \ - | sort -u \ - > overlap.events -echo '~~~' >> overlap.events -zcat subset.vcf.gz \ - | grep -v '^#' \ - | sort -k3,3 \ - | join -t ' ' -1 1 -2 3 overlap.events - \ - | awk 'BEGIN{FS=OFS="\t"}{tmp=$2;$2=$3;$3=tmp;print}' \ - | sort -k2n,2 \ - > overlap.events.vcf - -##get info for each variant## -zcat bed.overlap.txt.gz \ - | awk 'BEGIN{FS=OFS="\t"}{print $7; print $14;}' \ - | sort -u \ - > combined.bed -for var in EV RD_CN GT -do - echo '~~~' >> combined.bed - cat vcf_header overlap.events.vcf \ - | vcftools --vcf - --stdout --extract-FORMAT-info ${var} \ - | awk 'BEGIN{FS=OFS="\t"} \ - NR==1{for (i=3;i<=NF;i++) header[i]=$i} \ - NR>1 {for (j=3;j<=NF;j++) print $1"@"header[j],$j}' \ - | sort -k1,1 \ - | join -t ' ' -1 1 -2 1 combined.bed - \ - > combined.bed.tmp - mv combined.bed.tmp combined.bed -done - -zcat bed.overlap.txt.gz \ - | join -t ' ' -1 7 -2 1 - combined.bed \ - | cut -f2- \ - | sort -k13,13 \ - | join -t ' ' -1 13 -2 1 - combined.bed \ - | cut -f2- \ - | awk 'BEGIN{FS=OFS="\t"}{print $3-$2,$9-$8,$0}' \ - | sort -k1nr,1 -k2nr,2 \ - | cut -f3- \ - | gzip \ - > all.combined.bed.gz - -zcat all.combined.bed.gz \ - | awk -v multiCNVFile=$multi_cnv ' \ - function makeRevision( id, val ) { reviseCN[id] = val; if ( val == 2 ) wasRevisedToNormal[id] = 1; } \ - BEGIN \ - {FS=OFS="\t"; \ 
- while ( getline < multiCNVFile ) multiCNV[$0] = 1; \ - close(multiCNVFile)} \ - {chr_sample1 = $1; start1 = $2; stop1 = $3; ev1 = $4; svtype1 = $5; sample1 = $6; \ - chr_sample2 = $7; start2 = $8; stop2 = $9; ev2 = $10; svtype2 = $11; sample2 = $12; \ - support1 = $13; RD_CN1 = $14; GT1 = $15; support2 = $16; RD_CN2 = $17; GT2 = $18; \ - id1 = ev1"@"sample1; id2 = ev2"@"sample2; \ - length1 = stop1 - start1; length2 = stop2 - start2; \ - if ( id1 in wasRevisedToNormal ) next; \ - if ( id1 in reviseCN ) RD_CN1 = reviseCN[id1]; \ - if ( id2 in reviseCN ) RD_CN2 = reviseCN[id2]; \ - overlap = (stop1 < stop2 ? stop1 : stop2) - (start1 > start2 ? start1 : start2); \ - smallOverlap50 = overlap/length2 > .5; \ - largeOverlap50 = overlap/length1 > .5; \ - ##Call where smaller depth call is being driven by larger## \ - if ( support1 ~ /RD/ && support1 != "RD" && support2 == "RD" && smallOverlap50 && !(ev1 in multiCNV) ) { \ - if ( RD_CN1 == 0 ) makeRevision(id2, RD_CN2 + 2); \ - else if ( RD_CN1 == 1 ) makeRevision(id2, RD_CN2 + RD_CN1); \ - else if ( RD_CN1 > 1 ) { newCN = RD_CN2 - RD_CN1 + 2; if ( newCN < 0 ) newCN = 0; makeRevision(id2, newCN); } } \ - ##Smaller CNV driving larger CNV genotype## \ - else if ( support1 == "RD" && support2 ~ /RD/ && support2 != "RD" && smallOverlap50 && !(ev2 in multiCNV) && GT2 != "0/0" && largeOverlap50 ) { \ - if ( RD_CN2 == 0 ) makeRevision(id1, RD_CN1 + 2); \ - else if ( RD_CN2 == 1 ) makeRevision(id1, RD_CN1 + RD_CN2); \ - else if ( RD_CN2 > 1 ) { newCN = RD_CN1 - RD_CN2 + 2; if ( newCN < 0 ) newCN = 0; makeRevision(id1, newCN); } } \ - ##Depth only calls where smaller call is being driven by larger## \ - else if ( support1 == "RD" && support2 == "RD" && smallOverlap50 && svtype1 == svtype2 && !(ev1 in multiCNV) ) { \ - if ( RD_CN1 == 0 && RD_CN1 != RD_CN2 ) makeRevision(id2, RD_CN2 + 2); \ - else if ( RD_CN1 == 1 && RD_CN1 > RD_CN2 ) makeRevision(id2, 1); \ - else if ( RD_CN1 > 1 && RD_CN1 < RD_CN2 ) { newCN = RD_CN2 - RD_CN1 + 2; if ( newCN < 0 ) newCN = 0; makeRevision(id2, newCN); } \ - else makeRevision(id2, 2); } \ - ##Any other time a larger call is driving a smaller call## \ - else if ( support1 ~ /RD/ && smallOverlap50 && length2 > 5000 && !(ev1 in multiCNV) ) { \ - if ( RD_CN1 == 0 ) makeRevision(id2, RD_CN2 + 2); \ - else if ( RD_CN1 == 1 ) makeRevision(id2, RD_CN2 + RD_CN1); \ - else if ( RD_CN1 > 1 ) { newCN = RD_CN2 - RD_CN1 + 2; if ( newCN < 0 ) newCN = 0; makeRevision(id2, newCN); } } } \ - END \ - {for ( id in reviseCN ) print id,reviseCN[id]; }' \ - | sed 's/@/ /' \ - | sort \ - | awk 'BEGIN{FS=OFS="\t"}{if ($3<0) $3=0; print $0}' \ - > $outputfile diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py deleted file mode 100755 index ad2b744a5..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python - -import argparse -import sys -import svtk.utils as svu - - -def process_features_for_size1(features_for_size1, redundant_multiallelics): - for intersection in sorted(features_for_size1, key=lambda x: int(x[9]) - int(x[8]), reverse=True): - b_len = int(intersection.fields[9]) - int(intersection.fields[8]) - overlap = int(intersection.fields[14]) - small_coverage = overlap / b_len - if small_coverage > 0.50: - if intersection.fields[3] not in redundant_multiallelics: - 
redundant_multiallelics.add(intersection.fields[10]) - - -def main(): - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument('multiallelic_filename') - parser.add_argument('fout') - args = parser.parse_args() - - print("finding redundant overlapping sites", file=sys.stderr) - multiallelic_bed = svu.vcf2bedtool(args.multiallelic_filename, include_filters=True) - - redundant_multiallelics = set() - # feature fields: - # [1] : first interval start - # [2] : first interval end - # [3] : first interval variant ID - # [8] : second interval start - # [9] : second interval end - # [10] : second interval variant ID - self_inter = multiallelic_bed.intersect(multiallelic_bed, wo=True)\ - .filter(lambda feature: feature[3] != feature[10]) \ - .filter(lambda feature: (int(feature[2]) - int(feature[1])) >= (int(feature[9]) - int(feature[8]))) \ - .sort(sizeD=True) - current_size1 = -1 - features_for_size1 = [] - for feature in self_inter: - size1 = int(feature[2]) - int(feature[1]) - if size1 != current_size1: - process_features_for_size1(features_for_size1, redundant_multiallelics) - features_for_size1 = [] - - current_size1 = size1 - features_for_size1.append(feature) - - process_features_for_size1(features_for_size1, redundant_multiallelics) - print("identified {} redundant multiallelic sites".format(len(redundant_multiallelics)), file=sys.stderr) - with open(args.fout, "w") as list_file: - for vid in redundant_multiallelics: - print(vid, file=list_file) - - -if __name__ == '__main__': - main() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py deleted file mode 100755 index a5bf42982..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py +++ /dev/null @@ -1,191 +0,0 @@ -#!/usr/bin/env python - -import argparse -from collections import Counter -import gzip -import pysam -import sys -import svtk.utils as svu - - -def main(): - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument('revise_vcf_lines', type=argparse.FileType('r')) - parser.add_argument('normal_revise_vcf') - parser.add_argument('famfile', type=argparse.FileType('r')) - parser.add_argument('sexchr_revise') - parser.add_argument('multi_geno_ids_txt') - parser.add_argument('outlier_samples_list', type=argparse.FileType('r')) - parser.add_argument('out_prefix') - parser.add_argument('--threads_per_file', required=False, default=2, type=int) - args = parser.parse_args() - - # load the revised lines and index by ID - with pysam.VariantFile(args.revise_vcf_lines, threads=args.threads_per_file) as revise_vcf: - header2 = revise_vcf.header - revised_lines_by_id = {record.id: record for record in revise_vcf} - print("loaded {} revised lines".format(len(revised_lines_by_id)), file=sys.stderr) - - outlier_samples = set([line.rstrip() for line in args.outlier_samples_list if not line.isspace()]) - print("loaded {} outlier samples".format(len(outlier_samples)), file=sys.stderr) - - male_samples = set() - for line in args.famfile: - if line.isspace(): - continue - fields = line.rstrip().split("\t") - if fields[4] == '1': - male_samples.add(fields[1]) - print("identified {} male samples".format(len(male_samples)), file=sys.stderr) - - if args.sexchr_revise.endswith(".gz"): - sexchr_revise = {line.rstrip() for line in 
gzip.open(args.sexchr_revise, 'rt')} - else: - sexchr_revise = {line.rstrip() for line in open(args.sexchr_revise, 'rt')} - print("{} sites to revise on sex chromosomes".format(len(sexchr_revise)), file=sys.stderr) - - if args.multi_geno_ids_txt.endswith(".gz"): - multi_geno_ids = {line.rstrip() for line in gzip.open(args.multi_geno_ids_txt, 'rt')} - else: - multi_geno_ids = {line.rstrip() for line in open(args.multi_geno_ids_txt, 'rt')} - print("{} multiallelic sites".format(len(multi_geno_ids)), file=sys.stderr) - - NEW_HEADER_LINES = ['##ALT=', - '##FORMAT=', - '##FORMAT=', - '##INFO=', - '##FILTER='] - - with pysam.VariantFile(args.normal_revise_vcf) as normal_vcf: - - # # Add metadata lines for annotations - header1 = normal_vcf.header - - for f in NEW_HEADER_LINES: - header1.add_line(f) - header2.add_line(f) - - non_outlier_samples = {s for s in header1.samples if s not in outlier_samples} - vf_1 = max(len(non_outlier_samples) * 0.01, 2) - - biallelic_gts = {(1, 1), (0, 0), (0, 1), (None, None)} - - print("reformatting records", file=sys.stderr) - cleangq_filename = args.out_prefix + ".cleanGQ.vcf.gz" - multiallelic_filename = args.out_prefix + ".multiallelic.vcf.gz" - no_variant_samples_list_file = args.out_prefix + ".no_called_samples.list" - - with pysam.VariantFile(cleangq_filename, 'w', header=normal_vcf.header, threads=args.threads_per_file) as cleanqg_out, \ - pysam.VariantFile(multiallelic_filename, 'w', header=normal_vcf.header) as multiallelic_out, \ - open(no_variant_samples_list_file, 'w') as no_variant_samples_out: - for idx, record in enumerate(normal_vcf): - multi_del = False - multi_dup = False - gt4_copystate = False - gt5kb_dup = False - gt5kb_del = False - if (idx - 1) % 1000 == 0: - print("processed {} records".format(idx), file=sys.stderr) - if record.id in revised_lines_by_id: - record = revised_lines_by_id[record.id] - if record.info.get('SVTYPE', None) == 'DEL': - if abs(record.stop - record.pos) >= 1000: - sample_cn_map = {s: record.samples[s]['RD_CN'] for s in non_outlier_samples} - if len([s for s in sample_cn_map if (sample_cn_map[s] is not None and sample_cn_map[s] > 3)]) > vf_1: - multi_del = True - gts = [record.samples[s]['GT'] for s in non_outlier_samples] - if any(gt not in biallelic_gts for gt in gts): - gt5kb_del = True - if abs(record.stop - record.pos) >= 5000: - if not multi_del: - gt5kb_del = True - - if record.info.get('SVTYPE', None) == 'DUP': - if abs(record.stop - record.pos) >= 1000: - sample_cn_map = {s: record.samples[s]['RD_CN'] for s in non_outlier_samples} - if sum(1 for s in sample_cn_map if sample_cn_map[s] is not None and sample_cn_map[s] > 4) > vf_1: - multi_dup = True - if sum(1 for x in Counter(sample_cn_map.values()) if x is not None and (x < 1 or x > 4)) > 4: - gt4_copystate = True - if sum(1 for s in sample_cn_map if sample_cn_map[s] is not None and - (sample_cn_map[s] < 1 or sample_cn_map[s] > 4) and gt4_copystate) > vf_1: - multi_dup = True - gts = [record.samples[s]['GT'] for s in non_outlier_samples] - if any(gt not in biallelic_gts for gt in gts): - gt5kb_dup = True - if abs(record.stop - record.pos) >= 5000: - if not multi_dup: - gt5kb_dup = True - - if gt5kb_del: - for sample_obj in record.samples.itervalues(): - # Leave no-calls - if sample_obj['GT'] == (None, None): - continue - if not sample_obj['GQ'] is None and \ - (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] >= 2): - sample_obj['GT'] = (0, 0) - elif not sample_obj['GQ'] is None and \ - (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] == 1): - 
sample_obj['GT'] = (0, 1) - elif not sample_obj['GQ'] is None: - sample_obj['GT'] = (1, 1) # RD_CN 0 DEL - - if gt5kb_dup: - for sample_obj in record.samples.itervalues(): - # Leave no-calls - also causes bug that skips multiallelic genotypes for a biallelic variant - if sample_obj['GT'] == (None, None): - continue - if not sample_obj['GQ'] is None and \ - (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] <= 2): - sample_obj['GT'] = (0, 0) - elif not sample_obj['GQ'] is None and \ - (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] == 3): - sample_obj['GT'] = (0, 1) - elif not sample_obj['GQ'] is None: - sample_obj['GT'] = (1, 1) # RD_CN > 3 DUP - - if record.id in multi_geno_ids: - record.info['PESR_GT_OVERDISPERSION'] = True - - if multi_del or multi_dup: - record.filter.add('MULTIALLELIC') - for j, sample in enumerate(record.samples): - record.samples[sample]['GT'] = None - record.samples[sample]['GQ'] = None - record.samples[sample]['CN'] = record.samples[sample]['RD_CN'] - record.samples[sample]['CNQ'] = record.samples[sample]['RD_GQ'] - - if len(record.filter) > 1 and 'PASS' in record.filter: - del record.filter['PASS'] - - if 'MULTIALLELIC' in record.filter and ('' in record.alts or '' in record.alts): - record.alts = ('',) - record.info['SVTYPE'] = 'CNV' - - if record.id in sexchr_revise: - for sample in record.samples: - if sample in male_samples: - cn = record.samples[sample]['RD_CN'] - if cn is not None and int(cn) > 0: - cn = int(cn) - record.samples[sample]['RD_CN'] = cn - 1 - if 'CN' in record.samples[sample]: - record.samples[sample]['CN'] = cn - 1 # the old script didn't do this but I think it should - - cleanqg_out.write(record) - - if 'MULTIALLELIC' in record.filter: - multiallelic_out.write(record) - - if len(svu.get_called_samples(record)) == 0: - print(record.id, file=no_variant_samples_out) - - print("done", file=sys.stderr) - - -if __name__ == '__main__': - main() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py new file mode 100644 index 000000000..d04e0a781 --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py @@ -0,0 +1,101 @@ +#!/bin/python + +import argparse +import pysam +import gzip + + +VAR_GQ = 'varGQ' +MULTIALLELIC = 'MULTIALLELIC' +UNRESOLVED = 'UNRESOLVED' +HIGH_SR_BACKGROUND = 'HIGH_SR_BACKGROUND' +BOTHSIDES_SUPPORT = 'BOTHSIDES_SUPPORT' +REVISED_EVENT = 'REVISED_EVENT' +EV_VALUES = ['SR', 'PE', 'SR,PE', 'RD', 'BAF', 'RD,BAF'] + + +def read_last_column(file_path): + result_set = set() + with open(file_path, 'r') as f: + for line in f: + if line.strip(): + columns = line.strip().split() + result_set.add(columns[-1]) + return result_set + + +def process_record(record, fail_set, pass_set): + record = process_varGQ(record) + record = process_multiallelic(record) + record = process_unresolved(record) + record = process_noisy(record, fail_set) + record = process_bothsides_support(record, pass_set) + return record + + +def process_varGQ(record): + if VAR_GQ in record.info: + var_gq = record.info[VAR_GQ] + if isinstance(var_gq, list): + var_gq = var_gq[0] + del record.info[VAR_GQ] + record.qual = var_gq + return record + + +def process_multiallelic(record): + if MULTIALLELIC in record.info: + del record.info[MULTIALLELIC] + return record + + +def process_unresolved(record): + if UNRESOLVED in record.info: + del record.info[UNRESOLVED] + record.filter.add(UNRESOLVED) + return record + + +def process_noisy(record, 
fail_set): + if record.id in fail_set: + record.info[HIGH_SR_BACKGROUND] = True + return record + + +def process_bothsides_support(record, pass_set): + if record.id in pass_set: + record.info[BOTHSIDES_SUPPORT] = True + return record + + +if __name__ == '__main__': + # Parse arguments + parser = argparse.ArgumentParser(description='CleanVcf preprocessing.') + parser.add_argument('-V', '--input', dest='input_vcf', required=True, help='Input VCF file') + parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF file') + parser.add_argument('--fail-list', required=True, help='File with variants failing the background test') + parser.add_argument('--pass-list', required=True, help='File with variants passing both sides') + args = parser.parse_args() + + # Read input files + fail_set = read_last_column(args.fail_list) + pass_set = read_last_column(args.pass_list) + if args.input_vcf.endswith('.gz'): + vcf_in = pysam.VariantFile(gzip.open(args.input_vcf, 'rt')) + else: + vcf_in = pysam.VariantFile(args.input_vcf) + + # Open output file + if args.output_vcf.endswith('.gz'): + vcf_out = pysam.VariantFile(args.output_vcf, 'wz', header=vcf_in.header) + else: + vcf_out = pysam.VariantFile(args.output_vcf, 'w', header=vcf_in.header.copy()) + + # Process records + for record in vcf_in: + record = process_record(record, fail_set, pass_set) + vcf_out.write(record) + + # Close files + vcf_in.close() + vcf_out.close() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/replace_ev_numeric_code_with_string.py b/src/sv-pipeline/04_variant_resolution/scripts/replace_ev_numeric_code_with_string.py index b7d611d41..69c7d16b6 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/replace_ev_numeric_code_with_string.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/replace_ev_numeric_code_with_string.py @@ -61,7 +61,10 @@ def main(): if args.fout in '- stdout'.split(): fout = sys.stdout else: - fout = open(args.fout, 'w') + if args.fout.endswith(".gz"): + fout = gzip.open(args.fout, 'wt') + else: + fout = open(args.fout, 'w') while True: line = vcf.readline().rstrip() diff --git a/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java b/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java deleted file mode 100644 index 5637154bb..000000000 --- a/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java +++ /dev/null @@ -1,316 +0,0 @@ -package org.broadinstitute.svpipeline; - -import java.io.*; -import java.nio.charset.StandardCharsets; -import java.util.*; -import java.util.regex.Pattern; -import org.broadinstitute.svpipeline.VCFParser.*; - -public class CleanVCFPart1 { - private static final ByteSequence[] EV_VALS = { - null, - new ByteSequence("RD"), - new ByteSequence("PE"), - new ByteSequence("RD,PE"), - new ByteSequence("SR"), - new ByteSequence("RD,SR"), - new ByteSequence("PE,SR"), - new ByteSequence("RD,PE,SR") - }; - private static final ByteSequence FORMAT_LINE = new ByteSequence("FORMAT"); - private static final ByteSequence ID_KEY = new ByteSequence("ID"); - private static final ByteSequence EV_VALUE = new ByteSequence("EV"); - private static final ByteSequence TYPE_KEY = new ByteSequence("Type"); - private static final ByteSequence STRING_VALUE = new ByteSequence("String"); - private static final ByteSequence NUMBER_KEY = new ByteSequence("Number"); - private static final ByteSequence SVTYPE_KEY = new ByteSequence("SVTYPE"); - private static final ByteSequence ME_VALUE = new ByteSequence(":ME"); - private static 
final ByteSequence LT_VALUE = new ByteSequence("<"); - private static final ByteSequence GT_VALUE = new ByteSequence(">"); - private static final ByteSequence N_VALUE = new ByteSequence("N"); - private static final ByteSequence END_KEY = new ByteSequence("END"); - private static final ByteSequence VARGQ_KEY = new ByteSequence("varGQ"); - private static final ByteSequence MULTIALLELIC_KEY = new ByteSequence("MULTIALLELIC"); - private static final ByteSequence UNRESOLVED_KEY = new ByteSequence("UNRESOLVED"); - private static final ByteSequence HIGH_SR_BACKGROUND_KEY = new ByteSequence("HIGH_SR_BACKGROUND"); - private static final ByteSequence BOTHSIDES_SUPPORT_KEY = new ByteSequence("BOTHSIDES_SUPPORT"); - private static final ByteSequence DEL_VALUE = new ByteSequence("DEL"); - private static final ByteSequence DUP_VALUE = new ByteSequence("DUP"); - private static final ByteSequence RDCN_VALUE = new ByteSequence("RD_CN"); - private static final ByteSequence MISSING_VALUE = new ByteSequence("."); - private static final ByteSequence MISSING_GENOTYPE = new ByteSequence("./."); - private static final ByteSequence GT_REF_REF = new ByteSequence("0/0"); - private static final ByteSequence GT_REF_ALT = new ByteSequence("0/1"); - private static final ByteSequence GT_ALT_ALT = new ByteSequence("1/1"); - - private static final int MIN_ALLOSOME_EVENT_SIZE = 5000; - - public static void main( final String[] args ) { - if ( args.length != 8 ) { - System.err.println("Usage: java org.broadinstitute.svpipeline.CleanVCFPart1 " + - "INPUTVCFFILE PEDIGREES XCHR YCHR NOISYEVENTS BOTHSIDES SAMPLESOUT REVISEDEVENTSOUT"); - System.exit(1); - } - final VCFParser parser = new VCFParser(args[0]); - final ByteSequence xChrName = new ByteSequence(args[2]); - final ByteSequence yChrName = new ByteSequence(args[3]); - final Set noisyEvents = readLastColumn(args[4]); - final Set bothsidesSupportEvents = readLastColumn(args[5]); - try ( final OutputStream os - = new BufferedOutputStream(new FileOutputStream(FileDescriptor.out)); - final OutputStream osSamples = new BufferedOutputStream(new FileOutputStream(args[6])); - final OutputStream osRevEvents = new BufferedOutputStream(new FileOutputStream(args[7])) ) { - int[] sexForSample = null; - while ( parser.hasMetadata() ) { - final Metadata metadata = parser.nextMetaData(); - if ( metadata instanceof ColumnHeaderMetadata ) { - final ColumnHeaderMetadata cols = ((ColumnHeaderMetadata)metadata); - final List colNames = cols.getValue(); - final int nCols = colNames.size(); - for ( int idx = 9; idx < nCols; ++idx ) { - colNames.get(idx).write(osSamples); - osSamples.write('\n'); - } - sexForSample = readPedFile(args[1], cols.getValue()); - os.write(("##INFO=\n") - .getBytes(StandardCharsets.UTF_8)); - os.write("##FILTER=\n" - .getBytes(StandardCharsets.UTF_8)); - os.write(("##INFO=\n") - .getBytes(StandardCharsets.UTF_8)); - } else if ( metadata instanceof KeyAttributesMetadata ) { - final KeyAttributesMetadata keyAttrs = (KeyAttributesMetadata)metadata; - if ( keyAttrs.getKey().equals(FORMAT_LINE) ) { - final List kvs = keyAttrs.getValue(); - final int nKVs = kvs.size(); - if ( nKVs > 2 ) { - final KeyValue kv0 = kvs.get(0); - final KeyValue kv1 = kvs.get(1); - final KeyValue kv2 = kvs.get(2); - if ( kv0.getKey().equals(ID_KEY) && kv0.getValue().equals(EV_VALUE) ) { - if ( kv1.getKey().equals(NUMBER_KEY) ) { - kvs.set(1, new KeyValue(NUMBER_KEY, MISSING_VALUE)); - } - if ( kv2.getKey().equals(TYPE_KEY) ) { - kvs.set(2, new KeyValue(TYPE_KEY, STRING_VALUE)); - } - } - } - } - } 
- metadata.write(os); - } - if ( sexForSample == null ) { - throw new RuntimeException("header line with sample names is missing."); - } - while ( parser.hasRecord() ) { - final Record record = parser.nextRecord(); - - // replace the numeric EV value with a text value - final int evIdx = record.getFormat().indexOf(EV_VALUE); - if ( evIdx >= 0 ) { - for ( final CompoundField genotypeVals : record.getGenotypes() ) { - genotypeVals.set(evIdx, EV_VALS[genotypeVals.get(evIdx).asInt()]); - } - } - - // move the SVTYPE to the ALT field (except for MEs) - final InfoField info = record.getInfo(); - final ByteSequence svType = info.get(SVTYPE_KEY); - if ( !record.getAlt().contains(ME_VALUE) ) { - if ( svType != null ) { - record.setAlt(new ByteSequence(LT_VALUE, svType, GT_VALUE)); - } - } - record.setRef(N_VALUE); - - // move varGQ info field to quality column - final ByteSequence varGQ = info.get(VARGQ_KEY); - if ( varGQ != null ) { - record.setQuality(varGQ); - info.remove(VARGQ_KEY); - } - - // remove MULTIALLELIC flag, if present - info.remove(MULTIALLELIC_KEY); - - // remove UNRESOLVED flag and add it as a filter - if ( info.containsKey(UNRESOLVED_KEY) ) { - record.getFilter().add(UNRESOLVED_KEY); - info.remove(UNRESOLVED_KEY); - } - - // mark noisy events - if ( noisyEvents.contains(record.getID()) ) { - record.getInfo().put(HIGH_SR_BACKGROUND_KEY, null); - } - - // mark bothsides support - if ( bothsidesSupportEvents.contains(record.getID()) ) { - record.getInfo().put(BOTHSIDES_SUPPORT_KEY, null); - } - - // fix genotypes on allosomes - final boolean isY; - if ( (isY = yChrName.equals(record.getChromosome())) || - xChrName.equals(record.getChromosome())) { - final List genotypes = record.getGenotypes(); - final int rdCNIndex = record.getFormat().indexOf(RDCN_VALUE); - final ByteSequence end = info.get(END_KEY); - boolean adjustMale = false; - final boolean isDel; - if ( ((isDel = DEL_VALUE.equals(svType)) || DUP_VALUE.equals(svType)) && rdCNIndex >= 0 && end != null && - end.asInt() + 1 - record.getPosition() > MIN_ALLOSOME_EVENT_SIZE ) { - adjustMale = isRevisableEvent(genotypes, rdCNIndex, sexForSample, isY); - if ( adjustMale ) { - record.getID().write(osRevEvents); - osRevEvents.write('\n'); - } - } - CompoundField emptyGenotype = null; - final int nSamples = genotypes.size(); - for ( int sampleIdx = 0; sampleIdx < nSamples; ++sampleIdx ) { - final int sampleSex = sexForSample[sampleIdx]; - final CompoundField genotype = genotypes.get(sampleIdx); - if ( sampleSex == 1 ) { - if ( adjustMale ) { - final ByteSequence rdCN = genotype.get(rdCNIndex); - if ( rdCN.equals(MISSING_VALUE) ) { - continue; - } - final int rdCNVal = rdCN.asInt(); - genotype.set(rdCNIndex, new ByteSequence(Integer.toString(rdCNVal + 1))); - if ( isDel ) { - if ( rdCNVal >= 1 ) genotype.set(0, GT_REF_REF); - else if ( rdCNVal == 0 ) genotype.set(0, GT_REF_ALT); - } else { - if ( rdCNVal <= 1 ) genotype.set(0, GT_REF_REF); - else if ( rdCNVal == 2 ) genotype.set(0, GT_REF_ALT); - else genotype.set(0, GT_ALT_ALT); - } - } - } else if ( sampleSex == 2 ) { - if ( isY ) { - if ( emptyGenotype == null ) { - emptyGenotype = new CompoundField(MISSING_GENOTYPE, ':'); - int nFields = genotype.size(); - while ( --nFields > 0 ) { - emptyGenotype.add(MISSING_VALUE); - } - emptyGenotype.getValue(); // performance hack to put the pieces together - } - genotypes.set(sampleIdx, emptyGenotype); - } - } else { - genotype.set(0, MISSING_GENOTYPE); - } - } - } - - record.write(os); - } - } catch ( final IOException ioe ) { - throw new 
RuntimeException("Can't write to stdout", ioe); - } - } - - private static boolean isRevisableEvent( final List genotypes, - final int rdCNIndex, - final int[] sexForColumn, - final boolean isY ) { - // We're going to calculate the median rdCN values for males and females. - // We only care if the median is 0, 1, 2, or something larger, so we'll use 4 bins to - // sum up the counts: all values >2 go into the last bucket. - final int[] maleCounts = new int[4]; - final int[] femaleCounts = new int[4]; - final int nSamples = genotypes.size(); - for ( int sampleIdx = 0; sampleIdx < nSamples; ++sampleIdx ) { - final ByteSequence rdCN = genotypes.get(sampleIdx).get(rdCNIndex); - if ( MISSING_VALUE.equals(rdCN) ) { - continue; - } - int rdCNVal = rdCN.asInt(); - if ( rdCNVal > 2 ) { - rdCNVal = 3; - } - final int sampleSex = sexForColumn[sampleIdx]; - if ( sampleSex == 1 ) { - maleCounts[rdCNVal] += 1; - } else if ( sampleSex == 2 ) { - femaleCounts[rdCNVal] += 1; - } - } - final double maleMedian = calcMedian(maleCounts); - double femaleMedian = calcMedian(femaleCounts); - return maleMedian == 1. && (isY ? femaleMedian == 0. : femaleMedian == 2.); - } - - // visible for testing - static double calcMedian( final int[] counts ) { - final double target = (counts[0] + counts[1] + counts[2] + counts[3]) / 2.; - if ( target == 0. ) { - return Double.NaN; - } - int total = 0; - for ( int iii = 0; iii < 4; ++iii ) { - total += counts[iii]; - if ( total == target ) { - return iii + .5; - } else if ( total > target ) { - return (double)iii; - } - } - throw new IllegalStateException("we should never reach this statement"); - } - - private static Set readLastColumn( final String filename ) { - final Set values = new HashSet<>(); - try { - final BufferedReader neRdr = - new BufferedReader(new InputStreamReader(new FileInputStream(filename))); - String line; - while ( (line = neRdr.readLine()) != null ) { - final String lastCol = line.substring(line.lastIndexOf('\t') + 1); - values.add(new ByteSequence(lastCol)); - } - } catch ( final IOException ioe ) { - throw new RuntimeException("can't read table file " + filename); - } - return values; - } - - private static int[] readPedFile( final String pedFilename, List sampleNames ) { - final int nCols = sampleNames.size() - 9; - final Map sexForSampleMap = new HashMap<>(2*nCols); - final int[] sexForSample = new int[nCols]; - try { - final BufferedReader pedRdr = - new BufferedReader(new InputStreamReader(new FileInputStream(pedFilename))); - final Pattern tabPattern = Pattern.compile("\\t"); - String line; - while ( (line = pedRdr.readLine()) != null ) { - if ( line.startsWith("#") ) continue; - final Scanner scanner = new Scanner(line).useDelimiter(tabPattern); - scanner.next(); // family ignored - final String sampleName = scanner.next(); - scanner.next(); // mom ignored - scanner.next(); // pop ignored - final int sex = scanner.nextInt(); - sexForSampleMap.put(new ByteSequence(sampleName), sex); - } - } catch ( final IOException ioe ) { - throw new RuntimeException("can't read " + pedFilename, ioe); - } - for ( int col = 0; col < nCols; ++col ) { - final ByteSequence sampleName = sampleNames.get(col + 9); - final Integer sex = sexForSampleMap.get(sampleName); - if ( sex == null ) { - throw new RuntimeException("can't determine sex for sample " + sampleName); - } - sexForSample[col] = sex; - } - return sexForSample; - } -} diff --git a/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java 
b/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java deleted file mode 100644 index 77a6b5658..000000000 --- a/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java +++ /dev/null @@ -1,40 +0,0 @@ -package org.broadinstitute.svpipeline; - -public class CleanVCFPart1UnitTest { - public static void main( final String[] args ) { - testAsserts(); - testMedianCalculation(); - System.out.println("OK"); - } - - public static void testAsserts() { - boolean caughtIt = false; - try { - assert(false); - } catch ( final AssertionError ae ) { - caughtIt = true; - } - if ( !caughtIt ) { - throw new AssertionError("assertions aren't turned on, so you're not testing anything."); - } - } - - public static void testMedianCalculation() { - final int[] counts = new int[4]; - assert(Double.isNaN(CleanVCFPart1.calcMedian(counts))); - counts[0] = 1; - assert(CleanVCFPart1.calcMedian(counts) == 0.0); - counts[1] = 1; - assert(CleanVCFPart1.calcMedian(counts) == 0.5); - counts[2] = 1; - assert(CleanVCFPart1.calcMedian(counts) == 1.0); - counts[3] = 1; - assert(CleanVCFPart1.calcMedian(counts) == 1.5); - counts[2] = 2; - assert(CleanVCFPart1.calcMedian(counts) == 2.0); - counts[3] = 4; - assert(CleanVCFPart1.calcMedian(counts) == 2.5); - counts[3] = 5; - assert(CleanVCFPart1.calcMedian(counts) == 3.0); - } -} diff --git a/wdl/CalcAF.wdl b/wdl/CalcAF.wdl index cbc124e2a..064c3b28a 100644 --- a/wdl/CalcAF.wdl +++ b/wdl/CalcAF.wdl @@ -1,7 +1,6 @@ version 1.0 import "Structs.wdl" -import "CleanVcf5.wdl" as cleanvcf5 import "TasksMakeCohortVcf.wdl" as tmc workflow CalcAF { diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index ab228ccf7..42974547b 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -21,8 +21,6 @@ workflow CleanVcf { Int min_records_per_shard_step1 Int samples_per_step2_shard Int? max_samples_per_shard_step3 - Int clean_vcf1b_records_per_shard - Int clean_vcf5_records_per_shard File HERVK_reference File LINE1_reference @@ -47,6 +45,7 @@ workflow CleanVcf { File? resolve_complex_merged_vcf File? genotype_complex_merged_vcf + String gatk_docker String linux_docker String sv_base_mini_docker String sv_pipeline_docker @@ -60,28 +59,16 @@ workflow CleanVcf { # overrides for CleanVcfContig RuntimeAttr? runtime_override_clean_vcf_1a + RuntimeAttr? runtime_override_clean_vcf_1b RuntimeAttr? runtime_override_clean_vcf_2 RuntimeAttr? runtime_override_clean_vcf_3 RuntimeAttr? runtime_override_clean_vcf_4 - RuntimeAttr? runtime_override_clean_vcf_5_scatter - RuntimeAttr? runtime_override_clean_vcf_5_make_cleangq - RuntimeAttr? runtime_override_clean_vcf_5_find_redundant_multiallelics - RuntimeAttr? runtime_override_clean_vcf_5_polish + RuntimeAttr? runtime_override_clean_vcf_5 RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup RuntimeAttr? runtime_attr_format RuntimeAttr? runtime_override_rescue_me_dels - # Clean vcf 1b - RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b - RuntimeAttr? runtime_attr_override_sort_bed_1b - RuntimeAttr? runtime_attr_override_intersect_bed_1b - RuntimeAttr? runtime_attr_override_build_dict_1b - RuntimeAttr? runtime_attr_override_scatter_1b - RuntimeAttr? runtime_attr_override_filter_vcf_1b - RuntimeAttr? runtime_override_concat_vcfs_1b - RuntimeAttr? runtime_override_cat_multi_cnvs_1b - RuntimeAttr? runtime_override_preconcat_step1 RuntimeAttr? runtime_override_hail_merge_step1 RuntimeAttr? runtime_override_fix_header_step1 @@ -91,11 +78,8 @@ workflow CleanVcf { RuntimeAttr? 
runtime_override_fix_header_drc RuntimeAttr? runtime_override_split_vcf_to_clean - RuntimeAttr? runtime_override_combine_step_1_sex_chr_revisions RuntimeAttr? runtime_override_split_include_list RuntimeAttr? runtime_override_combine_clean_vcf_2 - RuntimeAttr? runtime_override_combine_revised_4 - RuntimeAttr? runtime_override_combine_multi_ids_4 RuntimeAttr? runtime_override_drop_redundant_cnvs RuntimeAttr? runtime_override_combine_step_1_vcfs RuntimeAttr? runtime_override_sort_drop_redundant_cnvs @@ -134,32 +118,26 @@ workflow CleanVcf { outlier_samples_list=outlier_samples_list, use_hail=use_hail, gcs_project=gcs_project, - clean_vcf1b_records_per_shard=clean_vcf1b_records_per_shard, - clean_vcf5_records_per_shard=clean_vcf5_records_per_shard, ploidy_table=CreatePloidyTableFromPed.out, HERVK_reference=HERVK_reference, LINE1_reference=LINE1_reference, chr_x=chr_x, chr_y=chr_y, + gatk_docker=gatk_docker, linux_docker=linux_docker, sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, runtime_override_clean_vcf_1a=runtime_override_clean_vcf_1a, + runtime_override_clean_vcf_1b=runtime_override_clean_vcf_1b, runtime_override_clean_vcf_2=runtime_override_clean_vcf_2, runtime_override_clean_vcf_3=runtime_override_clean_vcf_3, runtime_override_clean_vcf_4=runtime_override_clean_vcf_4, - runtime_override_clean_vcf_5_scatter=runtime_override_clean_vcf_5_scatter, - runtime_override_clean_vcf_5_make_cleangq=runtime_override_clean_vcf_5_make_cleangq, - runtime_override_clean_vcf_5_find_redundant_multiallelics=runtime_override_clean_vcf_5_find_redundant_multiallelics, - runtime_override_clean_vcf_5_polish=runtime_override_clean_vcf_5_polish, + runtime_override_clean_vcf_5=runtime_override_clean_vcf_5, runtime_override_stitch_fragmented_cnvs=runtime_override_stitch_fragmented_cnvs, runtime_override_final_cleanup=runtime_override_final_cleanup, runtime_override_split_vcf_to_clean=runtime_override_split_vcf_to_clean, - runtime_override_combine_step_1_sex_chr_revisions=runtime_override_combine_step_1_sex_chr_revisions, runtime_override_split_include_list=runtime_override_split_include_list, runtime_override_combine_clean_vcf_2=runtime_override_combine_clean_vcf_2, - runtime_override_combine_revised_4=runtime_override_combine_revised_4, - runtime_override_combine_multi_ids_4=runtime_override_combine_multi_ids_4, runtime_override_preconcat_step1=runtime_override_preconcat_step1, runtime_override_hail_merge_step1=runtime_override_hail_merge_step1, runtime_override_fix_header_step1=runtime_override_fix_header_step1, @@ -167,14 +145,8 @@ workflow CleanVcf { runtime_override_hail_merge_drc=runtime_override_hail_merge_drc, runtime_override_fix_header_drc=runtime_override_fix_header_drc, runtime_override_drop_redundant_cnvs=runtime_override_drop_redundant_cnvs, - runtime_attr_override_subset_large_cnvs_1b=runtime_attr_override_subset_large_cnvs_1b, - runtime_attr_override_sort_bed_1b=runtime_attr_override_sort_bed_1b, - runtime_attr_override_intersect_bed_1b=runtime_attr_override_intersect_bed_1b, - runtime_attr_override_build_dict_1b=runtime_attr_override_build_dict_1b, - runtime_attr_override_scatter_1b=runtime_attr_override_scatter_1b, - runtime_attr_override_filter_vcf_1b=runtime_attr_override_filter_vcf_1b, - runtime_override_concat_vcfs_1b=runtime_override_concat_vcfs_1b, - runtime_override_cat_multi_cnvs_1b=runtime_override_cat_multi_cnvs_1b, + runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs, + 
runtime_override_sort_drop_redundant_cnvs=runtime_override_sort_drop_redundant_cnvs, runtime_attr_format=runtime_attr_format, runtime_override_rescue_me_dels=runtime_override_rescue_me_dels } diff --git a/wdl/CleanVcf1b.wdl b/wdl/CleanVcf1b.wdl deleted file mode 100644 index 691d0591c..000000000 --- a/wdl/CleanVcf1b.wdl +++ /dev/null @@ -1,353 +0,0 @@ -version 1.0 - -import "Structs.wdl" -import "CleanVcf5.wdl" as CleanVcf5 -import "TasksMakeCohortVcf.wdl" as MiniTasks - -workflow CleanVcf1b { - input { - File intermediate_vcf - String prefix - Int records_per_shard - - String sv_pipeline_docker - String sv_base_mini_docker - - RuntimeAttr? runtime_attr_override_subset_large_cnvs - RuntimeAttr? runtime_attr_override_sort_bed - RuntimeAttr? runtime_attr_override_intersect_bed - RuntimeAttr? runtime_attr_override_build_dict - RuntimeAttr? runtime_attr_override_scatter - RuntimeAttr? runtime_attr_override_filter_vcf - RuntimeAttr? runtime_override_concat_vcfs - RuntimeAttr? runtime_override_cat_multi_cnvs - } - - call SubsetLargeCNVs { - input: - vcf=intermediate_vcf, - prefix="~{prefix}.subset_large_cnvs", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_override_subset_large_cnvs - } - - call Vcf2Bed { - input: - vcf=SubsetLargeCNVs.out, - prefix="~{prefix}.subset_large_cnvs", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_override_subset_large_cnvs - } - - call SortBed { - input: - bed=Vcf2Bed.out, - prefix="~{prefix}.subset_large_cnvs.sorted", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_attr_override_sort_bed - } - - call BedtoolsIntersect { - input: - bed=SortBed.out, - prefix="~{prefix}.bedtools_intersect", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_attr_override_intersect_bed - } - - call BuildGenoNormalReviseDictionary { - input: - filtered_vcf=SubsetLargeCNVs.out, - intersect_bed=BedtoolsIntersect.out, - prefix="~{prefix}.geno_normal_revise", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_override_build_dict - } - - call MiniTasks.ScatterVcf { - input: - vcf=intermediate_vcf, - records_per_shard=records_per_shard, - prefix="~{prefix}.scatter_vcf", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_override_scatter - } - - scatter ( i in range(length(ScatterVcf.shards)) ) { - call FilterVcf { - input: - intermediate_vcf=ScatterVcf.shards[i], - dictionary_json_gz=BuildGenoNormalReviseDictionary.out, - prefix="~{prefix}.filter_vcf.shard_~{i}", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_override_filter_vcf - } - } - - call MiniTasks.ConcatVcfs as ConcatCleanVcf1bShards { - input: - vcfs=FilterVcf.out, - naive=true, - sort_vcf_list=true, - outfile_prefix="~{prefix}.concat_vcfs", - sv_base_mini_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_concat_vcfs - } - - call MiniTasks.CatUncompressedFiles as ConcatMultiCnvs { - input: - shards=FilterVcf.multi_cnvs, - outfile_name="~{prefix}.multi.cnvs.txt", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_cat_multi_cnvs - } - - output { - File normal = ConcatCleanVcf1bShards.concat_vcf - File multi = ConcatMultiCnvs.outfile - } -} - -task SubsetLargeCNVs { - input { - File vcf - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - bcftools view --no-version \ - -i '(INFO/SVTYPE=="DEL" || INFO/SVTYPE=="DUP") && INFO/SVLEN>=5000' \ - ~{vcf} \ - | bgzip \ - > ~{prefix}.vcf.gz - >>> - output { - File out = "~{prefix}.vcf.gz" - } -} - -task Vcf2Bed { - input { - File vcf - String prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size(vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - svtk vcf2bed --no-header ~{vcf} stdout \ - | awk -F'\t' -v OFS='\t' '{if ($6=="") $6="blanksample";print $0}' \ - | gzip -1 \ - > ~{prefix}.bed.gz - >>> - output { - File out = "~{prefix}.bed.gz" - } -} - -task SortBed { - input { - File bed - String prefix - String sv_base_mini_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(bed, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 10.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_base_mini_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - mkdir tmp - zcat ~{bed} \ - | sort -T tmp -k1,1 -k2,2n \ - | gzip -1 \ - > ~{prefix}.bed.gz - >>> - output { - File out = "~{prefix}.bed.gz" - } -} - -task BedtoolsIntersect { - input { - File bed - String prefix - String sv_base_mini_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size(bed, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 10.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_base_mini_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - bedtools intersect -sorted -wa -wb -a <(zcat ~{bed}) -b <(zcat ~{bed}) \ - | awk -F'\t' -v OFS='\t' '$4!=$10 && $5!=$11' \ - | gzip -1 \ - > ~{prefix}.bed.gz - >>> - output { - File out = "~{prefix}.bed.gz" - } -} - -task BuildGenoNormalReviseDictionary { - input { - File filtered_vcf - File intersect_bed - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([filtered_vcf, intersect_bed], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - python /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py ~{filtered_vcf} ~{intersect_bed} \ - | gzip -1 \ - > ~{prefix}.json.gz - >>> - output { - File out = "~{prefix}.json.gz" - } -} - -task FilterVcf { - input { - File intermediate_vcf - File dictionary_json_gz - String prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size([intermediate_vcf, dictionary_json_gz], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - python /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py ~{dictionary_json_gz} ~{intermediate_vcf} \ - | bgzip \ - > ~{prefix}.vcf.gz - mv multi.cnvs.txt ~{prefix}.multi.cnvs.txt - >>> - output { - File out = "~{prefix}.vcf.gz" - File multi_cnvs = "~{prefix}.multi.cnvs.txt" - } -} diff --git a/wdl/CleanVcf5.wdl b/wdl/CleanVcf5.wdl deleted file mode 100644 index f9be79029..000000000 --- a/wdl/CleanVcf5.wdl +++ /dev/null @@ -1,273 +0,0 @@ -version 1.0 - -import "Structs.wdl" -import "TasksMakeCohortVcf.wdl" as tasks - -workflow CleanVcf5 { - input { - File normal_revise_vcf - File revise_vcf_lines - File ped_file - File sex_chr_revise - File multi_ids - File? outlier_samples_list - - String prefix - String contig - Int records_per_shard - - File? make_clean_gq_script - File? find_redundant_sites_script - - String sv_base_mini_docker - String sv_pipeline_docker - - Int? threads_per_task - RuntimeAttr? runtime_attr_override_scatter - RuntimeAttr? runtime_attr_override_make_cleangq - RuntimeAttr? runtime_attr_override_find_redundant_multiallelics - RuntimeAttr? 
runtime_attr_override_polish - } - - call tasks.ScatterVcf { - input: - vcf=normal_revise_vcf, - records_per_shard = records_per_shard, - prefix = "~{prefix}.scatter_vcf", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_override_scatter - } - - scatter ( i in range(length(ScatterVcf.shards)) ) { - call MakeCleanGQ { - input: - revise_vcf_lines=revise_vcf_lines, - normal_revise_vcf=ScatterVcf.shards[i], - ped_file=ped_file, - sex_chr_revise=sex_chr_revise, - multi_ids=multi_ids, - outlier_samples_list=outlier_samples_list, - make_clean_gq_script=make_clean_gq_script, - prefix="~{prefix}.make_clean_gq.shard_~{i}", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_override_make_cleangq - } - } - - call FindRedundantMultiallelics { - input: - multiallelic_vcfs=MakeCleanGQ.multiallelic_vcf, - find_redundant_sites_script=find_redundant_sites_script, - prefix="~{prefix}.find_redundant_multiallelics", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_override_find_redundant_multiallelics - } - - call Polish { - input: - clean_gq_vcfs=MakeCleanGQ.clean_gq_vcf, - no_sample_lists=MakeCleanGQ.no_sample_list, - redundant_multiallelics_list=FindRedundantMultiallelics.redundant_multiallelics_list, - prefix="~{prefix}.polish", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_override_polish - } - - output { - File polished=Polish.polished - } -} - -task MakeCleanGQ { - input { - File revise_vcf_lines - File normal_revise_vcf - File ped_file - File sex_chr_revise - File multi_ids - File? outlier_samples_list - File? make_clean_gq_script - String prefix - Int? threads = 2 - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size( - select_all([revise_vcf_lines, normal_revise_vcf, ped_file, sex_chr_revise, multi_ids, outlier_samples_list]), - "GB") - Float base_disk_gb = 10.0 - - RuntimeAttr runtime_default = object { - mem_gb: 16, - disk_gb: ceil(base_disk_gb + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -eu -o pipefail - - ~{if defined(outlier_samples_list) then "ln ~{outlier_samples_list} outliers.txt" else "touch outliers.txt"} - - # put the revise lines into a normal VCF format - bcftools view -h ~{normal_revise_vcf} > header.txt - cat header.txt <(zcat ~{revise_vcf_lines} | grep . 
| tr " " "\t") | bgzip -c > revise.vcf.lines.vcf.gz - - python3 ~{default="/opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py" make_clean_gq_script} \ - --threads_per_file ~{threads} \ - revise.vcf.lines.vcf.gz \ - ~{normal_revise_vcf} \ - ~{ped_file} \ - ~{sex_chr_revise} \ - ~{multi_ids} \ - outliers.txt \ - ~{prefix} - - bcftools view -G -O z ~{prefix}.multiallelic.vcf.gz > ~{prefix}.multiallelic.sites.vcf.gz - tabix ~{prefix}.cleanGQ.vcf.gz - >>> - - output { - File clean_gq_vcf=prefix + ".cleanGQ.vcf.gz" - File clean_gq_vcf_idx=prefix + ".cleanGQ.vcf.gz.tbi" - File multiallelic_vcf=prefix + ".multiallelic.sites.vcf.gz" - File no_sample_list = prefix + ".no_called_samples.list" - } -} - -task FindRedundantMultiallelics { - input { - Array[File] multiallelic_vcfs - File? find_redundant_sites_script - String prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size(multiallelic_vcfs, "GB") - Float base_disk_gb = 10.0 - - RuntimeAttr runtime_default = object { - mem_gb: 16, - disk_gb: ceil(base_disk_gb + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - VCFS="~{write_lines(multiallelic_vcfs)}" - cat $VCFS | awk -F '/' '{print $NF"\t"$0}' | sort -k1,1V | awk '{print $2}' > vcfs_sorted.list - bcftools concat --no-version --output-type z --file-list vcfs_sorted.list --output multiallelic.vcf.gz - - python3 ~{default="/opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py" find_redundant_sites_script} \ - multiallelic.vcf.gz \ - ~{prefix}.list - - >>> - - output { - File redundant_multiallelics_list="~{prefix}.list" - } -} - - -task Polish { - input { - Array[File] clean_gq_vcfs - Array[File] no_sample_lists - File redundant_multiallelics_list - String prefix - String sv_pipeline_docker - Int threads = 2 - RuntimeAttr? 
runtime_attr_override - } - - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size(clean_gq_vcfs, "GB") - Float base_disk_gb = 10.0 - - RuntimeAttr runtime_default = object { - mem_gb: 16, - disk_gb: ceil(base_disk_gb + input_size * 5.0), - cpu_cores: 4, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - - VCFS="~{write_lines(clean_gq_vcfs)}" - cat $VCFS | awk -F '/' '{print $NF"\t"$0}' | sort -k1,1V | awk '{print $2}' > vcfs_sorted.list - cat ~{redundant_multiallelics_list} ~{sep=" " no_sample_lists} > ids_to_remove.list - bcftools concat --no-version --output-type u --file-list vcfs_sorted.list | \ - bcftools view --no-version \ - --exclude 'ID=@ids_to_remove.list' \ - --output-type z -o polished.need_reheader.vcf.gz --threads ~{threads} - - # replace multiallelic genotypes for CNVs with homref - bcftools +setGT polished.need_reheader.vcf.gz -- \ - -t q \ - -n c:'1/1' \ - -i '(INFO/SVTYPE="DEL" | INFO/SVTYPE="DUP") & (FMT/GT~"[2-9]" | FMT/GT~"[1-9][0-9]+") & FMT/RD_CN>3' > polished.need_reheader.regenotyped.vcf - - bgzip polished.need_reheader.regenotyped.vcf - - # do the last bit of header cleanup - bcftools view -h polished.need_reheader.regenotyped.vcf.gz > original_header.vcf - cat original_header.vcf | fgrep '##fileformat' > new_header.vcf - cat original_header.vcf \ - | egrep -v "CIPOS|CIEND|RMSSTD|EVENT|INFO=> new_header.vcf - # Don't sort contigs lexicographically, which would result in incorrect chr1, chr10, chr11, ... ordering - cat original_header.vcf | fgrep '##contig' >> new_header.vcf - cat original_header.vcf | fgrep '#CHROM' >> new_header.vcf - bcftools reheader polished.need_reheader.regenotyped.vcf.gz -h new_header.vcf -o ~{prefix}.vcf.gz - >>> - - output { - File polished="~{prefix}.vcf.gz" - } -} diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 5800be909..69a1e068e 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -3,294 +3,182 @@ version 1.0 import "Structs.wdl" import "TasksMakeCohortVcf.wdl" as MiniTasks import "FormatVcfForGatk.wdl" as fvcf -import "CleanVcf1b.wdl" as c1b -import "CleanVcf5.wdl" as c5 import "HailMerge.wdl" as HailMerge workflow CleanVcfChromosome { - input { - File vcf - String contig - File background_list - File ped_file - File allosome_fai - String prefix - Int max_shards_per_chrom_step1 - File bothsides_pass_list - Int min_records_per_shard_step1 - Int samples_per_step2_shard - Int clean_vcf1b_records_per_shard - Int clean_vcf5_records_per_shard - Int? clean_vcf5_threads_per_task - File? outlier_samples_list - Int? max_samples_per_shard_step3 - - File HERVK_reference - File LINE1_reference - - File ploidy_table - String chr_x - String chr_y - - File? 
svtk_to_gatk_script # For debugging - - Boolean use_hail - String? gcs_project - - String linux_docker - String sv_base_mini_docker - String sv_pipeline_docker - - # overrides for local tasks - RuntimeAttr? runtime_override_clean_vcf_1a - RuntimeAttr? runtime_override_clean_vcf_2 - RuntimeAttr? runtime_override_clean_vcf_3 - RuntimeAttr? runtime_override_clean_vcf_4 - RuntimeAttr? runtime_override_clean_vcf_5_scatter - RuntimeAttr? runtime_override_clean_vcf_5_make_cleangq - RuntimeAttr? runtime_override_clean_vcf_5_find_redundant_multiallelics - RuntimeAttr? runtime_override_clean_vcf_5_polish - RuntimeAttr? runtime_override_stitch_fragmented_cnvs - RuntimeAttr? runtime_override_final_cleanup - RuntimeAttr? runtime_override_rescue_me_dels - RuntimeAttr? runtime_attr_add_high_fp_rate_filters - - # Clean vcf 1b - RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b - RuntimeAttr? runtime_attr_override_sort_bed_1b - RuntimeAttr? runtime_attr_override_intersect_bed_1b - RuntimeAttr? runtime_attr_override_build_dict_1b - RuntimeAttr? runtime_attr_override_scatter_1b - RuntimeAttr? runtime_attr_override_filter_vcf_1b - RuntimeAttr? runtime_override_concat_vcfs_1b - RuntimeAttr? runtime_override_cat_multi_cnvs_1b - - RuntimeAttr? runtime_override_preconcat_step1 - RuntimeAttr? runtime_override_hail_merge_step1 - RuntimeAttr? runtime_override_fix_header_step1 - - RuntimeAttr? runtime_override_preconcat_drc - RuntimeAttr? runtime_override_hail_merge_drc - RuntimeAttr? runtime_override_fix_header_drc - - # overrides for MiniTasks - RuntimeAttr? runtime_override_split_vcf_to_clean - RuntimeAttr? runtime_override_combine_step_1_sex_chr_revisions - RuntimeAttr? runtime_override_split_include_list - RuntimeAttr? runtime_override_combine_clean_vcf_2 - RuntimeAttr? runtime_override_combine_revised_4 - RuntimeAttr? runtime_override_combine_multi_ids_4 - RuntimeAttr? runtime_override_drop_redundant_cnvs - RuntimeAttr? runtime_override_combine_step_1_vcfs - RuntimeAttr? runtime_override_sort_drop_redundant_cnvs - RuntimeAttr? 
runtime_attr_format - - } - - call MiniTasks.SplitVcf as SplitVcfToClean { - input: - vcf=vcf, - contig=contig, - prefix="~{prefix}.shard_", - n_shards=max_shards_per_chrom_step1, - min_vars_per_shard=min_records_per_shard_step1, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_split_vcf_to_clean - } - - scatter ( i in range(length(SplitVcfToClean.vcf_shards)) ) { - call CleanVcf1a { - input: - vcf=SplitVcfToClean.vcf_shards[i], - prefix="~{prefix}.clean_vcf_1.shard_~{i}", - background_fail_list=background_list, - bothsides_pass_list=bothsides_pass_list, - ped_file=ped_file, - allosome_fai=allosome_fai, - chr_x=chr_x, - chr_y=chr_y, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_1a - } - } - - if (use_hail) { - call HailMerge.HailMerge as CombineStep1VcfsHail { - input: - vcfs=CleanVcf1a.intermediate_vcf, - prefix="~{prefix}.combine_step_1_vcfs", - gcs_project=gcs_project, - sv_base_mini_docker=sv_base_mini_docker, - sv_pipeline_docker=sv_pipeline_docker, - runtime_override_preconcat=runtime_override_preconcat_step1, - runtime_override_hail_merge=runtime_override_hail_merge_step1, - runtime_override_fix_header=runtime_override_fix_header_step1 - } - } - if (!use_hail) { - call MiniTasks.ConcatVcfs as CombineStep1Vcfs { - input: - vcfs=CleanVcf1a.intermediate_vcf, - vcfs_idx=CleanVcf1a.intermediate_vcf_idx, - naive=true, - generate_index=false, - outfile_prefix="~{prefix}.combine_step_1_vcfs", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_step_1_vcfs - } - } - - call MiniTasks.CatUncompressedFiles as CombineStep1SexChrRevisions { - input: - shards=CleanVcf1a.sex, - outfile_name="~{prefix}.combine_step_1_sex_chr_revisions.txt", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_step_1_sex_chr_revisions - } - - call c1b.CleanVcf1b { - input: - intermediate_vcf=select_first([CombineStep1Vcfs.concat_vcf, CombineStep1VcfsHail.merged_vcf]), - prefix="~{prefix}.clean_vcf_1b", - records_per_shard=clean_vcf1b_records_per_shard, - sv_pipeline_docker=sv_pipeline_docker, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override_subset_large_cnvs=runtime_attr_override_subset_large_cnvs_1b, - runtime_attr_override_sort_bed=runtime_attr_override_sort_bed_1b, - runtime_attr_override_intersect_bed=runtime_attr_override_intersect_bed_1b, - runtime_attr_override_build_dict=runtime_attr_override_build_dict_1b, - runtime_attr_override_scatter=runtime_attr_override_scatter_1b, - runtime_attr_override_filter_vcf=runtime_attr_override_filter_vcf_1b, - runtime_override_concat_vcfs=runtime_override_concat_vcfs_1b, - runtime_override_cat_multi_cnvs=runtime_override_cat_multi_cnvs_1b - } - - call MiniTasks.SplitUncompressed as SplitIncludeList { - input: - whole_file=CleanVcf1a.include_list[0], - lines_per_shard=samples_per_step2_shard, - shard_prefix="~{prefix}.split_include_list.", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_split_include_list - } - - scatter ( i in range(length(SplitIncludeList.shards)) ){ - call CleanVcf2 { - input: - normal_revise_vcf=CleanVcf1b.normal, - prefix="~{prefix}.clean_vcf_2.shard_~{i}", - include_list=SplitIncludeList.shards[i], - multi_cnvs=CleanVcf1b.multi, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_2 - } - } - - call MiniTasks.CatUncompressedFiles as CombineCleanVcf2 { - input: - shards=CleanVcf2.out, - 
outfile_name="~{prefix}.combine_clean_vcf_2.txt", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_clean_vcf_2 - } - - call CleanVcf3 { - input: - rd_cn_revise=CombineCleanVcf2.outfile, - max_samples_shard = max_samples_per_shard_step3, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_3 - } - - scatter ( i in range(length(CleanVcf3.shards)) ){ - call CleanVcf4 { - input: - rd_cn_revise=CleanVcf3.shards[i], - normal_revise_vcf=CleanVcf1b.normal, - prefix="~{prefix}.clean_vcf_4.shard_~{i}", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_4 - } - } - - call MiniTasks.CatUncompressedFiles as CombineRevised4 { - input: - shards=CleanVcf4.out, - outfile_name="~{prefix}.combine_revised_4.txt.gz", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_revised_4 - } - - call MiniTasks.CatUncompressedFiles as CombineMultiIds4 { - input: - shards=CleanVcf4.multi_ids, - outfile_name="~{prefix}.combine_multi_ids_4.txt.gz", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_multi_ids_4 - } - - call c5.CleanVcf5 { - input: - revise_vcf_lines=CombineRevised4.outfile, - normal_revise_vcf=CleanVcf1b.normal, - ped_file=ped_file, - sex_chr_revise=CombineStep1SexChrRevisions.outfile, - multi_ids=CombineMultiIds4.outfile, - outlier_samples_list=outlier_samples_list, - contig=contig, - prefix="~{prefix}.clean_vcf_5", - records_per_shard=clean_vcf5_records_per_shard, - threads_per_task=clean_vcf5_threads_per_task, - sv_pipeline_docker=sv_pipeline_docker, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override_scatter=runtime_override_clean_vcf_5_scatter, - runtime_attr_override_make_cleangq=runtime_override_clean_vcf_5_make_cleangq, - runtime_attr_override_find_redundant_multiallelics=runtime_override_clean_vcf_5_find_redundant_multiallelics, - runtime_attr_override_polish=runtime_override_clean_vcf_5_polish - } - - call DropRedundantCnvs { - input: - vcf=CleanVcf5.polished, - prefix="~{prefix}.drop_redundant_cnvs", - contig=contig, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_drop_redundant_cnvs - } - - if (use_hail) { - call HailMerge.HailMerge as SortDropRedundantCnvsHail { - input: - vcfs=[DropRedundantCnvs.out], - prefix="~{prefix}.drop_redundant_cnvs.sorted", - gcs_project=gcs_project, - reset_cnv_gts=true, - sv_base_mini_docker=sv_base_mini_docker, - sv_pipeline_docker=sv_pipeline_docker, - runtime_override_preconcat=runtime_override_preconcat_drc, - runtime_override_hail_merge=runtime_override_hail_merge_drc, - runtime_override_fix_header=runtime_override_fix_header_drc - } - } - if (!use_hail) { - call MiniTasks.SortVcf as SortDropRedundantCnvs { - input: - vcf=DropRedundantCnvs.out, - outfile_prefix="~{prefix}.drop_redundant_cnvs.sorted", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_sort_drop_redundant_cnvs - } - } - - call StitchFragmentedCnvs { - input: - vcf=select_first([SortDropRedundantCnvs.out, SortDropRedundantCnvsHail.merged_vcf]), - prefix="~{prefix}.stitch_fragmented_cnvs", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_stitch_fragmented_cnvs - } - - call RescueMobileElementDeletions { + input { + File vcf + String contig + File background_list + File bothsides_pass_list + File? 
outlier_samples_list + File ped_file + File allosome_fai + String prefix + + File HERVK_reference + File LINE1_reference + + File ploidy_table + String chr_x + String chr_y + + Boolean use_hail + String? gcs_project + + String gatk_docker + String linux_docker + String sv_base_mini_docker + String sv_pipeline_docker + + # overrides for local tasks + RuntimeAttr? runtime_attr_preprocess + RuntimeAttr? runtime_attr_revise_overlapping_cnv_gts + RuntimeAttr? runtime_attr_revise_overlapping_cnv_cns + RuntimeAttr? runtime_attr_revise_large_cnvs + RuntimeAttr? runtime_attr_revise_abnormal_allosomes + RuntimeAttr? runtime_attr_revise_multiallelics + RuntimeAttr? runtime_attr_postprocess + + RuntimeAttr? runtime_override_clean_vcf_1a + RuntimeAttr? runtime_override_clean_vcf_1b + RuntimeAttr? runtime_override_clean_vcf_2 + RuntimeAttr? runtime_override_clean_vcf_3 + RuntimeAttr? runtime_override_clean_vcf_4 + RuntimeAttr? runtime_override_clean_vcf_5 + RuntimeAttr? runtime_override_stitch_fragmented_cnvs + RuntimeAttr? runtime_override_final_cleanup + RuntimeAttr? runtime_override_rescue_me_dels + RuntimeAttr? runtime_attr_add_high_fp_rate_filters + + RuntimeAttr? runtime_override_preconcat_step1 + RuntimeAttr? runtime_override_hail_merge_step1 + RuntimeAttr? runtime_override_fix_header_step1 + + RuntimeAttr? runtime_override_preconcat_drc + RuntimeAttr? runtime_override_hail_merge_drc + RuntimeAttr? runtime_override_fix_header_drc + + # overrides for MiniTasks + RuntimeAttr? runtime_override_split_vcf_to_clean + RuntimeAttr? runtime_override_split_include_list + RuntimeAttr? runtime_override_combine_clean_vcf_2 + RuntimeAttr? runtime_override_drop_redundant_cnvs + RuntimeAttr? runtime_override_combine_step_1_vcfs + RuntimeAttr? runtime_override_sort_drop_redundant_cnvs + RuntimeAttr? 
runtime_attr_format + } + + call fvcf.FormatVcf as FormatVcfToClean { + input: + vcf=vcf, + ploidy_table=ploidy_table, + output_prefix="~{prefix}.formatted", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_format + } + + call CleanVcfPreprocess { + input: + vcf=FormatVcfToClean.out, + background_list=background_list, + bothsides_pass_list=bothsides_pass_list, + prefix="~{prefix}.preprocess", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_preprocess + } + + call CleanVcfReviseOverlappingCnvGts { + input: + vcf=CleanVcfPreprocess.out, + prefix="~{prefix}.revise_overlapping_cnv_gts", + gatk_docker=gatk_docker, + runtime_attr_override=runtime_attr_revise_overlapping_cnv_gts + } + + call CleanVcfReviseOverlappingCnvCns { + input: + vcf=CleanVcfReviseOverlappingCnvGts.out, + prefix="~{prefix}.revise_overlapping_cnv_cns", + gatk_docker=gatk_docker, + runtime_attr_override=runtime_attr_revise_overlapping_cnv_cns + } + + call CleanVcfReviseLargeCnvs { + input: + vcf=CleanVcfReviseOverlappingCnvGts.out, + outlier_samples_list=outlier_samples_list, + prefix="~{prefix}.revise_large_cnvs", + gatk_docker=gatk_docker, + runtime_attr_override=runtime_attr_revise_large_cnvs + } + + call CleanVcfReviseAbnormalAllosomes { + input: + vcf=CleanVcfReviseLargeCnvs.out, + prefix="~{prefix}.revise_abnormal_allosomes", + gatk_docker=gatk_docker, + runtime_attr_override=runtime_attr_revise_abnormal_allosomes + } + + call CleanVcfReviseMultiallelicCnvs { + input: + vcf=CleanVcfReviseAbnormalAllosomes.out, + prefix="~{prefix}.revise_multiallelic_cnvs", + gatk_docker=gatk_docker, + runtime_attr_override=runtime_attr_revise_multiallelics + } + + call CleanVcfPostprocess { + input: + vcf=CleanVcfReviseMultiallelicCnvs.out, + prefix="~{prefix}.postprocess", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_postprocess + } + + call DropRedundantCnvs { + input: + vcf=CleanVcfPostprocess.out, + prefix="~{prefix}.drop_redundant_cnvs", + contig=contig, + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_drop_redundant_cnvs + } + + if (use_hail) { + call HailMerge.HailMerge as SortDropRedundantCnvsHail { + input: + vcfs=[DropRedundantCnvs.out], + prefix="~{prefix}.drop_redundant_cnvs.sorted", + gcs_project=gcs_project, + reset_cnv_gts=true, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + runtime_override_preconcat=runtime_override_preconcat_drc, + runtime_override_hail_merge=runtime_override_hail_merge_drc, + runtime_override_fix_header=runtime_override_fix_header_drc + } + } + if (!use_hail) { + call MiniTasks.SortVcf as SortDropRedundantCnvs { + input: + vcf=DropRedundantCnvs.out, + outfile_prefix="~{prefix}.drop_redundant_cnvs.sorted", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_sort_drop_redundant_cnvs + } + } + + call StitchFragmentedCnvs { + input: + vcf=select_first([SortDropRedundantCnvs.out, SortDropRedundantCnvsHail.merged_vcf]), + prefix="~{prefix}.stitch_fragmented_cnvs", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_stitch_fragmented_cnvs + } + + call RescueMobileElementDeletions { input: vcf = StitchFragmentedCnvs.stitched_vcf_shard, prefix = "~{prefix}.rescue_me_dels", @@ -317,495 +205,577 @@ workflow CleanVcfChromosome { runtime_attr_override=runtime_override_final_cleanup } - call fvcf.FormatVcf { + call fvcf.FormatVcf as FormatVcfToOutput { input: 
vcf=FinalCleanup.final_cleaned_shard, ploidy_table=ploidy_table, args="--scale-down-gq", output_prefix="~{prefix}.final_format", - script=svtk_to_gatk_script, sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_attr_format } - - output { - File out = FormatVcf.out - File out_idx = FormatVcf.out_index - } + + output { + File out = FormatVcfToOutput.out + File out_idx = FormatVcfToOutput.out_index + } } - -task CleanVcf1a { - input { - File vcf - String prefix - File background_fail_list - File bothsides_pass_list - File ped_file - File allosome_fai - String chr_x - String chr_y - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size([vcf, background_fail_list, bothsides_pass_list], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - - touch ~{prefix}.includelist.txt - touch ~{prefix}.sexchr.revise.txt - - # outputs - # includelist.txt: the names of all the samples in the input vcf - # sexchr.revise.txt: the names of the events where genotypes got tweaked on allosomes - # stdout: a revised vcf - java -jar $CLEAN_VCF_PART_1_JAR \ - ~{vcf} \ - ~{ped_file} \ - ~{chr_x} \ - ~{chr_y} \ - ~{background_fail_list} \ - ~{bothsides_pass_list} \ - ~{prefix}.includelist.txt \ - ~{prefix}.sexchr.revise.txt \ - | bgzip \ - > ~{prefix}.vcf.gz - tabix ~{prefix}.vcf.gz - >>> - - output { - File include_list="~{prefix}.includelist.txt" - File sex="~{prefix}.sexchr.revise.txt" - File intermediate_vcf="~{prefix}.vcf.gz" - File intermediate_vcf_idx="~{prefix}.vcf.gz.tbi" - } +task CleanVcfPreprocess { + input { + File vcf + File background_list + File bothsides_pass_list + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" + + command <<< + set -euo pipefail + + if [ ! -f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + python /opt/sv-pipeline/04_variant_resolution/scripts/replace_ev_numeric_code_with_string.py \ + ~{vcf} \ + processed.vcf.gz + + zgrep '^##' processed.vcf.gz > header.txt + + cat <> header.txt + ##FILTER= + ##INFO= + ##INFO= + ##INFO= + EOF + + zgrep '^#CHROM' processed.vcf.gz >> header.txt + + bcftools view processed.vcf.gz | bcftools reheader -h header.txt | bgzip -c > processed.reheader.vcf.gz + + rm header.txt + + python /opt/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py \ + -V processed.reheader.vcf.gz \ + -O ~{output_vcf} \ + --fail-list ~{background_list} \ + --pass-list ~{bothsides_pass_list} + + tabix -p vcf ~{output_vcf} + >>> + + output { + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" + } } -task CleanVcf2 { - input { - File normal_revise_vcf - String prefix - File include_list - File multi_cnvs - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size([normal_revise_vcf, include_list, multi_cnvs], "GB") - Float base_disk_gb = 10.0 - Float input_disk_scale = 3.0 - RuntimeAttr runtime_default = object { - mem_gb: 2.0, - disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -eu -o pipefail - - bcftools index ~{normal_revise_vcf} - /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part2.sh \ - ~{normal_revise_vcf} \ - ~{include_list} \ - ~{multi_cnvs} \ - "~{prefix}.txt" - >>> - - output { - File out="~{prefix}.txt" - } +task CleanVcfReviseOverlappingCnvGts { + input { + File vcf + String prefix + String gatk_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: gatk_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" + + command <<< + set -euo pipefail + + if [ ! -f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseOverlappingCnvGts \ + -V ~{vcf} \ + -O ~{output_vcf} + >>> + + output { + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" + } } - -task CleanVcf3 { - input { - File rd_cn_revise - Int? max_samples_shard - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - Int max_samples_shard_ = select_first([max_samples_shard, 7000]) - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size(rd_cn_revise, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - python /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.py ~{rd_cn_revise} -s ~{max_samples_shard_} - # Ensure there is at least one shard - touch shards/out.0_0.txt - >>> - - output { - Array[File] shards = glob("shards/*") - } +task CleanVcfReviseOverlappingCnvCns { + input { + File vcf + String prefix + String gatk_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: gatk_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" + + command <<< + set -euo pipefail + + if [ ! -f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseOverlappingCnvCns \ + -V ~{vcf} \ + -O ~{output_vcf} + >>> + + output { + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" + } } +task CleanVcfReviseLargeCnvs { + input { + File vcf + File? outlier_samples_list + String prefix + String gatk_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: gatk_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" + + command <<< + set -euo pipefail + + if [ ! -f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseLargeCnvs \ + -V ~{vcf} \ + -O ~{output_vcf} \ + ~{if defined(outlier_samples_list) then "--outlier-samples ~{outlier_samples_list}" else "" } + >>> + + output { + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" + } +} -task CleanVcf4 { - input { - File rd_cn_revise - File normal_revise_vcf - String prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size([rd_cn_revise, normal_revise_vcf], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 2.0, - disk_gb: 50, - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } +task CleanVcfReviseAbnormalAllosomes { + input { + File vcf + String prefix + String gatk_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: gatk_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" + + command <<< + set -euo pipefail + + if [ ! -f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseAbnormalAllosomes \ + -V ~{vcf} \ + -O ~{output_vcf} + >>> + + output { + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" + } +} - command <<< - set -euo pipefail - python3 < record_end: - break - num_gt_over_2 = 0 - for sid in record.samples: - s = record.samples[sid] - # Pick best GT - if s['PE_GT'] is None: - continue - elif s['SR_GT'] is None: - gt = s['PE_GT'] - elif s['PE_GT'] > 0 and s['SR_GT'] == 0: - gt = s['PE_GT'] - elif s['PE_GT'] == 0: - gt = s['SR_GT'] - elif s['PE_GQ'] >= s['SR_GQ']: - gt = s['PE_GT'] - else: - gt = s['SR_GT'] - if gt > 2: - num_gt_over_2 += 1 - if num_gt_over_2 > max_vf: - multi_geno_ids.add(record.id) - vcf.close() - - multi_geno_ids = sorted(list(multi_geno_ids)) - with open("~{prefix}.multi_geno_ids.txt", "w") as f: - for vid in multi_geno_ids: - f.write(vid + "\n") - CODE - - bgzip ~{prefix}.revise_vcf_lines.txt - gzip ~{prefix}.multi_geno_ids.txt - >>> +task CleanVcfReviseMultiallelicCnvs { + input { + File vcf + String prefix + String gatk_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: gatk_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" + + command <<< + set -euo pipefail + + if [ ! 
-f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseMultiallelicCnvs \ + -V ~{vcf} \ + -O ~{output_vcf} + >>> + + output { + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" + } +} - output { - File out="~{prefix}.revise_vcf_lines.txt.gz" - File multi_ids="~{prefix}.multi_geno_ids.txt.gz" - } +task CleanVcfPostprocess { + input { + File vcf + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" + + command <<< + set -euo pipefail + + if [ ! -f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + bcftools annotate -x INFO/MULTIALLELIC,INFO/UNRESOLVED,INFO/EVENT,INFO/REVISED_EVENT,INFO/MULTI_CNV,INFO/varGQ ~{vcf} -o processed.vcf.gz -O z + + bcftools view -h processed.vcf.gz | grep -v -E "CIPOS|CIEND|RMSSTD|source|bcftools|GATKCommandLine|##FORMAT=|##ALT=|##INFO= header.txt + + bcftools reheader -h header.txt processed.vcf.gz -o ~{output_vcf} + + tabix -p vcf ~{output_vcf} + >>> + + output { + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" + } } task RescueMobileElementDeletions { - input { - File vcf - String prefix - File LINE1 - File HERVK - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(vcf, "GiB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75 + input_size * 1.5, - disk_gb: ceil(100.0 + input_size * 3.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - - python <.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_LINE1/' > manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv - bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{HERVK} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_HERVK/' >> manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv + bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{LINE1} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_LINE1/' > manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv + bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{HERVK} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_HERVK/' >> manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv - python <',) - if hash_MEI_DEL_reset[record.id] == 'overlap_HERVK': - record.alts = ('',) - fo.write(record) + if record.id in hash_MEI_DEL_reset.keys(): + del record.filter['UNRESOLVED'] + record.info['SVTYPE'] = 'DEL' + record.info['SVLEN'] = record.info['END2'] - record.start + record.stop = record.info['END2'] + record.info.pop("CHR2") + record.info.pop("END2") + record.info.pop("UNRESOLVED_TYPE") + if hash_MEI_DEL_reset[record.id] == 'overlap_LINE1': + record.alts = ('',) + if hash_MEI_DEL_reset[record.id] == 'overlap_HERVK': + record.alts = ('',) + fo.write(record) fin.close() fo.close() CODE - >>> + >>> - output { - File out = "~{prefix}.vcf.gz" - } + output { + File out = "~{prefix}.vcf.gz" + } } # Remove CNVs that are redundant with CPX events or other CNVs task DropRedundantCnvs { - input { - File vcf - String prefix - String contig - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(vcf, "GiB") - # disk is cheap, read/write speed is proportional to disk size, and disk IO is a significant time factor: - # in tests on large VCFs, memory usage is ~1.0 * input VCF size - # the biggest disk usage is at the end of the task, with input + output VCF on disk - Int cpu_cores = 2 # speed up compression / decompression of VCFs - RuntimeAttr runtime_default = object { - mem_gb: 3.75 + input_size * 1.5, - disk_gb: ceil(100.0 + input_size * 2.0), - cpu_cores: cpu_cores, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - /opt/sv-pipeline/04_variant_resolution/scripts/resolve_cpx_cnv_redundancies.py \ - ~{vcf} ~{prefix}.vcf.gz --temp-dir ./tmp - >>> - - output { - File out = "~{prefix}.vcf.gz" - } + input { + File vcf + String prefix + String contig + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(vcf, "GiB") + # disk is cheap, read/write speed is proportional to disk size, and disk IO is a significant time factor: + # in tests on large VCFs, memory usage is ~1.0 * input VCF size + # the biggest disk usage is at the end of the task, with input + output VCF on disk + Int cpu_cores = 2 # speed up compression / decompression of VCFs + RuntimeAttr runtime_default = object { + mem_gb: 3.75 + input_size * 1.5, + disk_gb: ceil(100.0 + input_size * 2.0), + cpu_cores: cpu_cores, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + /opt/sv-pipeline/04_variant_resolution/scripts/resolve_cpx_cnv_redundancies.py \ + ~{vcf} ~{prefix}.vcf.gz --temp-dir ./tmp + >>> + + output { + File out = "~{prefix}.vcf.gz" + } } # Stitch fragmented RD-only calls found in 100% of the same samples task StitchFragmentedCnvs { - input { - File vcf - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - Float mem_gb = select_first([runtime_override.mem_gb, runtime_default.mem_gb]) - Int java_mem_mb = ceil(mem_gb * 1000 * 0.8) - - runtime { - memory: "~{mem_gb} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - echo "First pass..." - java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 ~{vcf} \ - | bgzip \ - > tmp.vcf.gz - rm ~{vcf} - echo "Second pass..." - java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 tmp.vcf.gz \ - | bgzip \ - > ~{prefix}.vcf.gz - >>> - - output { - File stitched_vcf_shard = "~{prefix}.vcf.gz" - } + input { + File vcf + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(vcf, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 7.5, + disk_gb: ceil(10.0 + input_size * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + Float mem_gb = select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + Int java_mem_mb = ceil(mem_gb * 1000 * 0.8) + + runtime { + memory: "~{mem_gb} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + echo "First pass..." + java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 ~{vcf} \ + | bgzip \ + > tmp.vcf.gz + rm ~{vcf} + echo "Second pass..." + java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 tmp.vcf.gz \ + | bgzip \ + > ~{prefix}.vcf.gz + >>> + + output { + File stitched_vcf_shard = "~{prefix}.vcf.gz" + } } # Add FILTER status for pockets of variants with high FP rate: wham-only DELs and Scramble-only SVAs with HIGH_SR_BACKGROUND @@ -863,53 +833,53 @@ CODE # Final VCF cleanup task FinalCleanup { - input { - File vcf - String contig - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size(vcf, "GB") - Float base_disk_gb = 10.0 - Float base_mem_gb = 2.0 - Float input_mem_scale = 3.0 - Float input_disk_scale = 5.0 - RuntimeAttr runtime_default = object { - mem_gb: base_mem_gb + input_size * input_mem_scale, - disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -eu -o pipefail - - /opt/sv-pipeline/04_variant_resolution/scripts/rename_after_vcfcluster.py \ - --chrom ~{contig} \ - --prefix ~{prefix} \ - ~{vcf} stdout \ - | bcftools annotate --no-version -e 'SVTYPE=="CNV" && SVLEN<5000' -x INFO/MEMBERS -Oz -o ~{prefix}.vcf.gz - tabix ~{prefix}.vcf.gz - >>> - - output { - File final_cleaned_shard = "~{prefix}.vcf.gz" - File final_cleaned_shard_idx = "~{prefix}.vcf.gz.tbi" - } + input { + File vcf + String contig + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed + # generally assume working memory is ~3 * inputs + Float input_size = size(vcf, "GB") + Float base_disk_gb = 10.0 + Float base_mem_gb = 2.0 + Float input_mem_scale = 3.0 + Float input_disk_scale = 5.0 + RuntimeAttr runtime_default = object { + mem_gb: base_mem_gb + input_size * input_mem_scale, + disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -eu -o pipefail + + /opt/sv-pipeline/04_variant_resolution/scripts/rename_after_vcfcluster.py \ + --chrom ~{contig} \ + --prefix ~{prefix} \ + ~{vcf} stdout \ + | bcftools annotate --no-version -e 'SVTYPE=="CNV" && SVLEN<5000' -x INFO/MEMBERS -Oz -o ~{prefix}.vcf.gz + tabix ~{prefix}.vcf.gz + >>> + + output { + File final_cleaned_shard = "~{prefix}.vcf.gz" + File final_cleaned_shard_idx = "~{prefix}.vcf.gz.tbi" + } } \ No newline at end of file
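
Note (not part of the patch): the deleted CleanVcf4 heredoc above carries the per-sample "best genotype" rule that decides which evidence type (paired-end or split-read) a sample's genotype is taken from before counting multiallelic carriers. A minimal Python sketch of that rule follows, assuming the FORMAT keys shown in the diff (PE_GT, PE_GQ, SR_GT, SR_GQ); the function names and the dict-per-sample representation are illustrative, and the max variant-frequency cutoff the count is compared against is defined earlier in the heredoc, which is elided here.

from typing import Optional


def pick_best_gt(pe_gt: Optional[int], pe_gq: Optional[int],
                 sr_gt: Optional[int], sr_gq: Optional[int]) -> Optional[int]:
    """Return the genotype to use for a sample, or None if there is no PE genotype."""
    if pe_gt is None:
        return None                  # no PE genotype: sample is skipped
    if sr_gt is None:
        return pe_gt                 # only PE evidence available
    if pe_gt > 0 and sr_gt == 0:
        return pe_gt                 # PE non-ref while SR is ref: trust PE
    if pe_gt == 0:
        return sr_gt                 # PE is ref: fall back to SR
    return pe_gt if pe_gq >= sr_gq else sr_gt   # both non-ref: higher GQ wins


def count_multiallelic_samples(samples) -> int:
    """Count samples whose selected genotype exceeds 2, the cutoff used in the deleted script.

    `samples` is an iterable of dicts holding the four FORMAT keys; the original script
    compares this count against a max variant-frequency threshold before flagging the record.
    """
    count = 0
    for fmt in samples:
        gt = pick_best_gt(fmt.get("PE_GT"), fmt.get("PE_GQ"),
                          fmt.get("SR_GT"), fmt.get("SR_GQ"))
        if gt is not None and gt > 2:
            count += 1
    return count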
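
Note (not part of the patch): the RescueMobileElementDeletions task above rescues unresolved BND records as mobile-element deletions when more than 50% of the spanned interval lies within a LINE1 or HERVK reference interval (the awk `$NF>.5` filter on `bedtools coverage` output), then relabels the record as a DEL. The sketch below restates that rule in Python under stated assumptions: the interval-overlap helper stands in for bedtools and assumes non-overlapping reference intervals, `record` is a pysam-style VariantRecord as used in the task, and the exact symbolic ALT written by the task is elided in this diff.

def covered_fraction(start: int, end: int, ref_intervals) -> float:
    """Fraction of [start, end) covered by reference ME intervals (assumed non-overlapping)."""
    length = max(end - start, 1)
    covered = 0
    for r_start, r_end in ref_intervals:
        overlap = min(end, r_end) - max(start, r_start)
        if overlap > 0:
            covered += overlap
    return covered / length


def rescue_me_deletion(record, line1_intervals, hervk_intervals) -> bool:
    """Relabel a BND as a mobile-element DEL if >50% of it lies in LINE1 or HERVK,
    mirroring the filter/INFO edits made in the task above."""
    start, end2 = record.start, record.info["END2"]
    for intervals in (line1_intervals, hervk_intervals):
        if covered_fraction(start, end2, intervals) > 0.5:
            del record.filter["UNRESOLVED"]
            record.info["SVTYPE"] = "DEL"
            record.info["SVLEN"] = end2 - start
            record.stop = end2
            for key in ("CHR2", "END2", "UNRESOLVED_TYPE"):
                record.info.pop(key)
            # The task also rewrites record.alts to a mobile-element deletion allele
            # (LINE1 vs HERVK); the exact ALT string is not recoverable from this diff.
            return True
    return False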
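
Note (not part of the patch): every task in this diff resolves its runtime settings with the same pattern, `select_first([runtime_attr_override, runtime_default])` followed by a per-field `select_first([runtime_override.x, runtime_default.x])`, so a caller-supplied override wins only for the fields it actually sets. A small Python analogue of that resolution follows; the RuntimeAttr field names match the WDL struct, while the `resolve` helper is purely illustrative.

from dataclasses import dataclass, fields
from typing import Optional


@dataclass
class RuntimeAttr:
    mem_gb: Optional[float] = None
    disk_gb: Optional[int] = None
    cpu_cores: Optional[int] = None
    preemptible_tries: Optional[int] = None
    max_retries: Optional[int] = None
    boot_disk_gb: Optional[int] = None


def resolve(default: RuntimeAttr, override: Optional[RuntimeAttr]) -> RuntimeAttr:
    """Field-wise equivalent of select_first([runtime_override.x, runtime_default.x])."""
    chosen = override if override is not None else default
    resolved = {}
    for f in fields(RuntimeAttr):
        value = getattr(chosen, f.name)
        resolved[f.name] = value if value is not None else getattr(default, f.name)
    return RuntimeAttr(**resolved)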