From 095a2c7f85d450d17c75231463bb61cf85de1815 Mon Sep 17 00:00:00 2001 From: boasvdp Date: Tue, 16 Apr 2024 19:16:28 +0200 Subject: [PATCH 1/3] feat: annotate large deletions based on bed file --- Snakefile | 4 +- config/presets.yaml | 1 + files/mtb/resistance_regions_tbdb.bed | 73 +++++++++++++++++++ workflow/rules/choose_species.smk | 1 + workflow/rules/mtb_prepare_files.smk | 18 +++++ workflow/rules/mtb_typing.smk | 53 ++++++++++++++ .../scripts/postprocess_deletion_table.py | 68 +++++++++++++++++ 7 files changed, 215 insertions(+), 3 deletions(-) create mode 100644 files/mtb/resistance_regions_tbdb.bed create mode 100644 workflow/scripts/postprocess_deletion_table.py diff --git a/Snakefile b/Snakefile index 95dea80..8719d8e 100644 --- a/Snakefile +++ b/Snakefile @@ -24,11 +24,9 @@ localrules: aggregate_species, copy_sample_bam, copy_sample_vcf, + copy_deletion_vcf, copy_ref, - prepare_ab_table, - generate_ab_table_header, mtb_filter_res_table_positions, - postprocess_variant_table, mtb_make_json, audit_version_gatk, audit_version_biopython, diff --git a/config/presets.yaml b/config/presets.yaml index f9e3972..fdde7c6 100644 --- a/config/presets.yaml +++ b/config/presets.yaml @@ -5,6 +5,7 @@ mycobacterium_tuberculosis: resistance_variants_csv: /mnt/db/juno/variant-typing/mtb/reslist.csv resistance_variants_columns: ref_reslist,alt_reslist,gene,drug,RIVM,TP,FP,confidence,variant_common_name resistance_variants_ab_column: drug + resistance_deletions_bed: files/mtb/resistance_regions_tbdb.bed consensus: proximity_threshold: 12 AF_threshold: 0.8 diff --git a/files/mtb/resistance_regions_tbdb.bed b/files/mtb/resistance_regions_tbdb.bed new file mode 100644 index 0000000..8e230c1 --- /dev/null +++ b/files/mtb/resistance_regions_tbdb.bed @@ -0,0 +1,73 @@ +NC_000962.3 1 1524 Rv0001 dnaA isoniazid +NC_000962.3 4933 7267 Rv0005 gyrB levofloxacin,moxifloxacin +NC_000962.3 7068 9818 Rv0006 gyrA levofloxacin,moxifloxacin +NC_000962.3 13133 13911 Rv0010c Rv0010c isoniazid +NC_000962.3 490545 491793 Rv0407 fgd1 clofazimine,pretomanid,delamanid +NC_000962.3 574479 576790 Rv0486 mshA ethionamide,isoniazid +NC_000962.3 619500 620865 Rv0529 ccsA capreomycin,amikacin,kanamycin +NC_000962.3 656010 657739 Rv0565c Rv0565c ethionamide +NC_000962.3 731680 732406 Rv0635 hadA isoniazid +NC_000962.3 733853 734970 Rv0639 nusG rifampicin +NC_000962.3 759342 763325 Rv0667 rpoB rifampicin +NC_000962.3 763127 767320 Rv0668 rpoC rifampicin +NC_000962.3 775586 778680 Rv0676c mmpL5 clofazimine,bedaquiline +NC_000962.3 778477 779624 Rv0677c mmpS5 clofazimine,bedaquiline +NC_000962.3 778790 779487 Rv0678 mmpR5 clofazimine,bedaquiline +NC_000962.3 781126 781934 Rv0682 rpsL streptomycin +NC_000962.3 800106 801462 Rv0701 rplC linezolid +NC_000962.3 1253074 1254783 Rv1129c Rv1129c moxifloxacin,levofloxacin,rifampicin,isoniazid +NC_000962.3 1302606 1305501 Rv1173 fbiC clofazimine,pretomanid,delamanid +NC_000962.3 1364162 1365186 Rv1221 sigE pyrazinamide +NC_000962.3 1406081 1407604 Rv1258c Rv1258c streptomycin,isoniazid,pyrazinamide +NC_000962.3 1416181 1418048 Rv1267c embR ethambutol +NC_000962.3 1460802 1461290 Rv1305 atpE bedaquiline +NC_000962.3 1471498 1473382 EBG00000313325 rrs streptomycin,kanamycin,capreomycin,amikacin +NC_000962.3 1473408 1476795 EBG00000313339 rrl capreomycin,linezolid +NC_000962.3 1673148 1674183 Rv1483 fabG1 ethionamide,isoniazid +NC_000962.3 1673189 1675011 Rv1484 inhA ethionamide,isoniazid +NC_000962.3 1833247 1834987 Rv1630 rpsA pyrazinamide +NC_000962.3 1853358 1854388 Rv1644 tsnR linezolid +NC_000962.3 1917506 1918746 Rv1694 tlyA capreomycin +NC_000962.3 2062809 2065010 Rv1819c bacA streptomycin,kanamycin,capreomycin,amikacin +NC_000962.3 2101651 2103337 Rv1854c ndh ethionamide,delamanid,isoniazid +NC_000962.3 2153889 2156842 Rv1908c katG isoniazid +NC_000962.3 2167649 2170934 Rv1918c PPE35 pyrazinamide +NC_000962.3 2221719 2223825 Rv1979c Rv1979c clofazimine,bedaquiline +NC_000962.3 2288681 2290323 Rv2043c pncA pyrazinamide +NC_000962.3 2517915 2519365 Rv2245 kasA isoniazid +NC_000962.3 2714124 2715832 Rv2416c eis amikacin,kanamycin +NC_000962.3 2725899 2726780 Rv2428 ahpC isoniazid +NC_000962.3 2746135 2747798 Rv2447c folC para-aminosalicylic_acid +NC_000962.3 2782366 2786169 Rv2477c Rv2477c ethambutol,moxifloxacin,streptomycin,levofloxacin,kanamycin,rifampicin,amikacin +NC_000962.3 2859300 2860640 Rv2535c pepQ clofazimine,bedaquiline +NC_000962.3 2986639 2987615 Rv2671 ribD para-aminosalicylic_acid +NC_000962.3 2995772 2996737 Rv2680 Rv2680 capreomycin +NC_000962.3 2996539 2998055 Rv2681 Rv2681 capreomycin +NC_000962.3 3064515 3067372 Rv2752c Rv2752c ethambutol,moxifloxacin,levofloxacin,rifampicin,isoniazid +NC_000962.3 3067193 3068161 Rv2754c thyX para-aminosalicylic_acid +NC_000962.3 3073680 3074671 Rv2764c thyA para-aminosalicylic_acid +NC_000962.3 3086620 3087935 Rv2780 ald cycloserine +NC_000962.3 3338868 3339762 Rv2983 fbiD clofazimine,pretomanid,delamanid +NC_000962.3 3448253 3449991 Rv3083 Rv3083 ethionamide +NC_000962.3 3568401 3569280 Rv3197A whiB7 streptomycin,amikacin,kanamycin +NC_000962.3 3611959 3613847 Rv3236c Rv3236c pyrazinamide +NC_000962.3 3623159 3625110 Rv3244c lpqB bedaquiline,rifampicin +NC_000962.3 3624910 3626860 Rv3245c mtrB bedaquiline,rifampicin +NC_000962.3 3626663 3627924 Rv3246c mtrA bedaquiline,rifampicin +NC_000962.3 3640207 3641538 Rv3261 fbiA clofazimine,pretomanid,delamanid +NC_000962.3 3641335 3642881 Rv3262 fbiB clofazimine,pretomanid,delamanid +NC_000962.3 3840194 3841620 Rv3423c alr cycloserine +NC_000962.3 3877464 3879240 Rv3457c rpoA rifampicin +NC_000962.3 3986612 3987299 Rv3547 ddn delamanid,pretomanid +NC_000962.3 4038158 4041013 Rv3596c clpC1 pyrazinamide +NC_000962.3 4043862 4046428 Rv3601c panD pyrazinamide +NC_000962.3 4138202 4140002 Rv3696c glpK ethambutol,moxifloxacin,streptomycin,levofloxacin,rifampicin,isoniazid +NC_000962.3 4237683 4243147 Rv3793 embC ethambutol +NC_000962.3 4242947 4246517 Rv3794 embA ethambutol +NC_000962.3 4246314 4249810 Rv3795 embB ethambutol +NC_000962.3 4266953 4269124 Rv3805c aftB ethambutol +NC_000962.3 4268925 4270084 Rv3806c ubiA ethambutol +NC_000962.3 4326004 4330174 Rv3854c ethA ethionamide +NC_000962.3 4327328 4328199 Rv3855 ethR ethionamide +NC_000962.3 4338171 4338961 Rv3862c whiB6 capreomycin,amikacin,kanamycin +NC_000962.3 4407528 4408481 Rv3919c gid streptomycin \ No newline at end of file diff --git a/workflow/rules/choose_species.smk b/workflow/rules/choose_species.smk index 0904c19..72b371f 100644 --- a/workflow/rules/choose_species.smk +++ b/workflow/rules/choose_species.smk @@ -8,6 +8,7 @@ def choose_species(wildcards): OUT + "/mtb_typing/contamination_check/rrs_rrl_contamination/{sample}.tsv", OUT + "/mtb_typing/seq_exp_json/{sample}.json", OUT + "/mtb_typing/consensus/{sample}.fasta", + OUT + "/mtb_typing/annotated_deletions/{sample}.tsv", ] else: return OUT + "/typing_check/{sample}/no_typing_necessary.txt" diff --git a/workflow/rules/mtb_prepare_files.smk b/workflow/rules/mtb_prepare_files.smk index 7238ff5..0d13f0f 100644 --- a/workflow/rules/mtb_prepare_files.smk +++ b/workflow/rules/mtb_prepare_files.smk @@ -217,3 +217,21 @@ DB_NAME=$(basename {input.db_dir}) cd $WORKDIR $EXEC build -genbank -v $DB_NAME -config $CONFIG_NAME -dataDir . 2>&1>{log} """ + + +rule copy_deletion_vcf: + input: + vcf=INPUT + "/deletions/{sample}.vcf", + output: + vcf=OUT + "/mtb_typing/prepared_files/deletions/{sample}.vcf", + log: + OUT + "/log/copy_deletion_vcf/{sample}.log", + message: + "Copying deletion vcf for {wildcards.sample}" + threads: config["threads"]["bcftools"] + resources: + mem_gb=config["mem_gb"]["bcftools"], + shell: + """ +cp {input.vcf} {output.vcf} 2>&1>{log} + """ diff --git a/workflow/rules/mtb_typing.smk b/workflow/rules/mtb_typing.smk index d219e13..bac3849 100644 --- a/workflow/rules/mtb_typing.smk +++ b/workflow/rules/mtb_typing.smk @@ -237,6 +237,59 @@ python workflow/scripts/create_tb_json.py \ """ +rule mtb_deletions_to_table: + input: + vcf=OUT + "/mtb_typing/prepared_files/deletions/{sample}.vcf", + output: + OUT + "/mtb_typing/annotated_deletions/raw/{sample}.tsv", + conda: + "../envs/gatk_picard.yaml" + container: + "docker://broadinstitute/gatk:4.3.0.0" + log: + OUT + "/log/mtb_deletions_to_table/{sample}.log", + message: + "Convert deletion vcf to table for {wildcards.sample}" + threads: config["threads"]["gatk"] + resources: + mem_gb=config["mem_gb"]["gatk"], + shell: + """ +gatk VariantsToTable \ +-V {input.vcf} \ +--show-filtered \ +-F CHROM \ +-F POS \ +-F END \ +-O {output} 2>&1>{log} + """ + + +rule mtb_annotate_deletions: + input: + bed=OUT + "/mtb_typing/annotated_deletions/raw/{sample}.tsv", + resistance_deletions_bed=lambda wildcards: SAMPLES[wildcards.sample][ + "resistance_deletions_bed" + ], + output: + tsv=OUT + "/mtb_typing/annotated_deletions/{sample}.tsv", + log: + OUT + "/log/mtb_annotate_deletions/{sample}.log", + message: + "Annotating deletions with AMR for {wildcards.sample}" + threads: config["threads"]["other"] + resources: + mem_gb=config["mem_gb"]["other"], + shell: + """ +python workflow/scripts/postprocess_deletion_table.py \ +--input {input.bed} \ +--bed {input.resistance_deletions_bed} \ +--output {output} \ +2>&1>{log} + """ + + module consensus_workflow: config: config diff --git a/workflow/scripts/postprocess_deletion_table.py b/workflow/scripts/postprocess_deletion_table.py new file mode 100644 index 0000000..37acd1f --- /dev/null +++ b/workflow/scripts/postprocess_deletion_table.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 + +import pandas as pd + + +def check_deletions_in_bed(deletions, bed): + """ + Check if deletions are in the reference bed file + + Parameters + ---------- + deletions : pd.DataFrame + Deletions table, should contain the columns 'CHROM', 'POS', 'END' and 'interval' + bed : pd.DataFrame + Bed file, should contain the columns 'chrom', 'start', 'end', 'locus_tag', 'gene', 'drugs' and 'interval' + + Returns + ------- + pd.DataFrame + Annotated deletions table, containing the columns 'chrom', 'start', 'end', 'locus_tag', 'gene' and 'drugs' + + Notes + ----- + Compares genomic ranges stored as pd.Interval objects, with closed bounds + + """ + deletions_annotated = [] + for index, row in deletions.iterrows(): + for index2, row2 in bed.iterrows(): + if row['CHROM'] == row2['chrom']: + if row['interval'].overlaps(row2['interval']): + dictionary_overlap = { + 'chrom': row['CHROM'], + 'start': row['POS'], + 'end': row['END'], + 'locus_tag': row2['locus_tag'], + 'gene': row2['gene'], + 'drugs': row2['drugs'] + } + deletions_annotated.append(dictionary_overlap) + + return pd.DataFrame(deletions_annotated) + + +def main(args): + # Read the deletion table + deletions = pd.read_csv(args.input, sep='\t', header=0) + deletions['interval'] = deletions.apply(lambda x: pd.Interval(x['POS'], x['END'], closed='both'), axis=1) + + # Read the reference bed file + bed = pd.read_csv(args.bed, sep='\t', header=None, names=['chrom', 'start', 'end', 'locus_tag', 'gene', 'drugs']) + bed['interval'] = bed.apply(lambda x: pd.Interval(x['start'], x['end'], closed='both'), axis=1) + + annotated_deletions = check_deletions_in_bed(deletions, bed) + + # Write the annotatetd deletion table + annotated_deletions.to_csv(args.output, sep='\t', index=False) + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser(description='Filter deletions that are not in the reference genome') + parser.add_argument('--input', help='Deletion table', required=True) + parser.add_argument('--bed', help='Filtered deletion table', required=True) + parser.add_argument('--output', help='Output file', default='deletions_annotated.tsv') + args = parser.parse_args() + + main(args) From 83d143fcd472bb79be964fbfafb750a14cdb5ea6 Mon Sep 17 00:00:00 2001 From: boasvdp Date: Thu, 18 Apr 2024 09:45:00 +0200 Subject: [PATCH 2/3] fix: dont fail when no deletions are found --- .../scripts/postprocess_deletion_table.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/workflow/scripts/postprocess_deletion_table.py b/workflow/scripts/postprocess_deletion_table.py index 37acd1f..a550ead 100644 --- a/workflow/scripts/postprocess_deletion_table.py +++ b/workflow/scripts/postprocess_deletion_table.py @@ -45,16 +45,22 @@ def check_deletions_in_bed(deletions, bed): def main(args): # Read the deletion table deletions = pd.read_csv(args.input, sep='\t', header=0) - deletions['interval'] = deletions.apply(lambda x: pd.Interval(x['POS'], x['END'], closed='both'), axis=1) - # Read the reference bed file - bed = pd.read_csv(args.bed, sep='\t', header=None, names=['chrom', 'start', 'end', 'locus_tag', 'gene', 'drugs']) - bed['interval'] = bed.apply(lambda x: pd.Interval(x['start'], x['end'], closed='both'), axis=1) + if deletions.shape[0] == 0: + with open(args.output, 'w') as f: + f.write('chrom\tstart\tend\tlocus_tag\tgene\tdrugs\n') - annotated_deletions = check_deletions_in_bed(deletions, bed) - - # Write the annotatetd deletion table - annotated_deletions.to_csv(args.output, sep='\t', index=False) + else: + deletions['interval'] = deletions.apply(lambda x: pd.Interval(x['POS'], x['END'], closed='both'), axis=1) + + # Read the reference bed file + bed = pd.read_csv(args.bed, sep='\t', header=None, names=['chrom', 'start', 'end', 'locus_tag', 'gene', 'drugs']) + bed['interval'] = bed.apply(lambda x: pd.Interval(x['start'], x['end'], closed='both'), axis=1) + + annotated_deletions = check_deletions_in_bed(deletions, bed) + + # Write the annotatetd deletion table + annotated_deletions.to_csv(args.output, sep='\t', index=False) if __name__ == '__main__': import argparse From d22d6bb9957da83ede3dbdd12b6fc536c20d5396 Mon Sep 17 00:00:00 2001 From: boasvdp Date: Thu, 18 Apr 2024 09:46:42 +0200 Subject: [PATCH 3/3] style: rm dicts from samplesheet for readability --- config/presets.yaml | 18 ++++++++---------- workflow/rules/consensus.smk | 15 +++++---------- 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/config/presets.yaml b/config/presets.yaml index fdde7c6..7447590 100644 --- a/config/presets.yaml +++ b/config/presets.yaml @@ -6,11 +6,10 @@ mycobacterium_tuberculosis: resistance_variants_columns: ref_reslist,alt_reslist,gene,drug,RIVM,TP,FP,confidence,variant_common_name resistance_variants_ab_column: drug resistance_deletions_bed: files/mtb/resistance_regions_tbdb.bed - consensus: - proximity_threshold: 12 - AF_threshold: 0.8 - tlod_threshold: 10 - masking_depth: 10 + consensus_proximity_threshold: 12 + consensus_AF_threshold: 0.8 + consensus_tlod_threshold: 10 + consensus_masking_depth: 10 other: single_copy_bed: null count_mutations_bed: null @@ -18,8 +17,7 @@ other: resistance_variants_csv: null resistance_variants_columns: null resistance_variants_ab_column: null - consensus: - proximity_threshold: null - AF_threshold: null - tlod_threshold: null - masking_depth: null + consensus_proximity_threshold: null + consensus_AF_threshold: null + consensus_tlod_threshold: null + consensus_masking_depth: null diff --git a/workflow/rules/consensus.smk b/workflow/rules/consensus.smk index ff90b4c..fcd7055 100644 --- a/workflow/rules/consensus.smk +++ b/workflow/rules/consensus.smk @@ -23,8 +23,7 @@ rule mark_variants_by_proximity: log: "", params: - proximity_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus"][ - "proximity_threshold" + proximity_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_proximity_threshold" ], message: "Marking regions with variants too close together for {wildcards.sample}" @@ -51,8 +50,7 @@ rule subset_fixed_snps_from_vcf: log: "", params: - AF_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus"][ - "AF_threshold" + AF_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_AF_threshold" ], message: "Subsetting fixed snps from vcf for {wildcards.sample}" @@ -81,11 +79,9 @@ rule subset_low_confidence_variants_from_vcf: log: "", params: - AF_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus"][ - "AF_threshold" + AF_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_AF_threshold" ], - tlod_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus"][ - "tlod_threshold" + tlod_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_tlod_threshold" ], message: "Subsetting unfixed snps from vcf for {wildcards.sample}" @@ -171,8 +167,7 @@ rule mask_fasta_on_depth_from_bam: message: "Masking low depth regions of {wildcards.sample} consensus genome" params: - masking_depth=lambda wildcards: SAMPLES[wildcards.sample]["consensus"][ - "masking_depth" + masking_depth=lambda wildcards: SAMPLES[wildcards.sample]["consensus_masking_depth" ], threads: config["threads"]["bedtools"] resources: