From 095a2c7f85d450d17c75231463bb61cf85de1815 Mon Sep 17 00:00:00 2001
From: boasvdp <boasvdp@gmail.com>
Date: Tue, 16 Apr 2024 19:16:28 +0200
Subject: [PATCH 1/3] feat: annotate large deletions based on bed file

---
 Snakefile                                     |  4 +-
 config/presets.yaml                           |  1 +
 files/mtb/resistance_regions_tbdb.bed         | 73 +++++++++++++++++++
 workflow/rules/choose_species.smk             |  1 +
 workflow/rules/mtb_prepare_files.smk          | 18 +++++
 workflow/rules/mtb_typing.smk                 | 53 ++++++++++++++
 .../scripts/postprocess_deletion_table.py     | 68 +++++++++++++++++
 7 files changed, 215 insertions(+), 3 deletions(-)
 create mode 100644 files/mtb/resistance_regions_tbdb.bed
 create mode 100644 workflow/scripts/postprocess_deletion_table.py

diff --git a/Snakefile b/Snakefile
index 95dea80..8719d8e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -24,11 +24,9 @@ localrules:
     aggregate_species,
     copy_sample_bam,
     copy_sample_vcf,
+    copy_deletion_vcf,
     copy_ref,
-    prepare_ab_table,
-    generate_ab_table_header,
     mtb_filter_res_table_positions,
-    postprocess_variant_table,
     mtb_make_json,
     audit_version_gatk,
     audit_version_biopython,
diff --git a/config/presets.yaml b/config/presets.yaml
index f9e3972..fdde7c6 100644
--- a/config/presets.yaml
+++ b/config/presets.yaml
@@ -5,6 +5,7 @@ mycobacterium_tuberculosis:
   resistance_variants_csv: /mnt/db/juno/variant-typing/mtb/reslist.csv
   resistance_variants_columns: ref_reslist,alt_reslist,gene,drug,RIVM,TP,FP,confidence,variant_common_name
   resistance_variants_ab_column: drug
+  resistance_deletions_bed: files/mtb/resistance_regions_tbdb.bed
   consensus:
     proximity_threshold: 12
     AF_threshold: 0.8
diff --git a/files/mtb/resistance_regions_tbdb.bed b/files/mtb/resistance_regions_tbdb.bed
new file mode 100644
index 0000000..8e230c1
--- /dev/null
+++ b/files/mtb/resistance_regions_tbdb.bed
@@ -0,0 +1,73 @@
+NC_000962.3	1	1524	Rv0001	dnaA	isoniazid
+NC_000962.3	4933	7267	Rv0005	gyrB	levofloxacin,moxifloxacin
+NC_000962.3	7068	9818	Rv0006	gyrA	levofloxacin,moxifloxacin
+NC_000962.3	13133	13911	Rv0010c	Rv0010c	isoniazid
+NC_000962.3	490545	491793	Rv0407	fgd1	clofazimine,pretomanid,delamanid
+NC_000962.3	574479	576790	Rv0486	mshA	ethionamide,isoniazid
+NC_000962.3	619500	620865	Rv0529	ccsA	capreomycin,amikacin,kanamycin
+NC_000962.3	656010	657739	Rv0565c	Rv0565c	ethionamide
+NC_000962.3	731680	732406	Rv0635	hadA	isoniazid
+NC_000962.3	733853	734970	Rv0639	nusG	rifampicin
+NC_000962.3	759342	763325	Rv0667	rpoB	rifampicin
+NC_000962.3	763127	767320	Rv0668	rpoC	rifampicin
+NC_000962.3	775586	778680	Rv0676c	mmpL5	clofazimine,bedaquiline
+NC_000962.3	778477	779624	Rv0677c	mmpS5	clofazimine,bedaquiline
+NC_000962.3	778790	779487	Rv0678	mmpR5	clofazimine,bedaquiline
+NC_000962.3	781126	781934	Rv0682	rpsL	streptomycin
+NC_000962.3	800106	801462	Rv0701	rplC	linezolid
+NC_000962.3	1253074	1254783	Rv1129c	Rv1129c	moxifloxacin,levofloxacin,rifampicin,isoniazid
+NC_000962.3	1302606	1305501	Rv1173	fbiC	clofazimine,pretomanid,delamanid
+NC_000962.3	1364162	1365186	Rv1221	sigE	pyrazinamide
+NC_000962.3	1406081	1407604	Rv1258c	Rv1258c	streptomycin,isoniazid,pyrazinamide
+NC_000962.3	1416181	1418048	Rv1267c	embR	ethambutol
+NC_000962.3	1460802	1461290	Rv1305	atpE	bedaquiline
+NC_000962.3	1471498	1473382	EBG00000313325	rrs	streptomycin,kanamycin,capreomycin,amikacin
+NC_000962.3	1473408	1476795	EBG00000313339	rrl	capreomycin,linezolid
+NC_000962.3	1673148	1674183	Rv1483	fabG1	ethionamide,isoniazid
+NC_000962.3	1673189	1675011	Rv1484	inhA	ethionamide,isoniazid
+NC_000962.3	1833247	1834987	Rv1630	rpsA	pyrazinamide
+NC_000962.3	1853358	1854388	Rv1644	tsnR	linezolid
+NC_000962.3	1917506	1918746	Rv1694	tlyA	capreomycin
+NC_000962.3	2062809	2065010	Rv1819c	bacA	streptomycin,kanamycin,capreomycin,amikacin
+NC_000962.3	2101651	2103337	Rv1854c	ndh	ethionamide,delamanid,isoniazid
+NC_000962.3	2153889	2156842	Rv1908c	katG	isoniazid
+NC_000962.3	2167649	2170934	Rv1918c	PPE35	pyrazinamide
+NC_000962.3	2221719	2223825	Rv1979c	Rv1979c	clofazimine,bedaquiline
+NC_000962.3	2288681	2290323	Rv2043c	pncA	pyrazinamide
+NC_000962.3	2517915	2519365	Rv2245	kasA	isoniazid
+NC_000962.3	2714124	2715832	Rv2416c	eis	amikacin,kanamycin
+NC_000962.3	2725899	2726780	Rv2428	ahpC	isoniazid
+NC_000962.3	2746135	2747798	Rv2447c	folC	para-aminosalicylic_acid
+NC_000962.3	2782366	2786169	Rv2477c	Rv2477c	ethambutol,moxifloxacin,streptomycin,levofloxacin,kanamycin,rifampicin,amikacin
+NC_000962.3	2859300	2860640	Rv2535c	pepQ	clofazimine,bedaquiline
+NC_000962.3	2986639	2987615	Rv2671	ribD	para-aminosalicylic_acid
+NC_000962.3	2995772	2996737	Rv2680	Rv2680	capreomycin
+NC_000962.3	2996539	2998055	Rv2681	Rv2681	capreomycin
+NC_000962.3	3064515	3067372	Rv2752c	Rv2752c	ethambutol,moxifloxacin,levofloxacin,rifampicin,isoniazid
+NC_000962.3	3067193	3068161	Rv2754c	thyX	para-aminosalicylic_acid
+NC_000962.3	3073680	3074671	Rv2764c	thyA	para-aminosalicylic_acid
+NC_000962.3	3086620	3087935	Rv2780	ald	cycloserine
+NC_000962.3	3338868	3339762	Rv2983	fbiD	clofazimine,pretomanid,delamanid
+NC_000962.3	3448253	3449991	Rv3083	Rv3083	ethionamide
+NC_000962.3	3568401	3569280	Rv3197A	whiB7	streptomycin,amikacin,kanamycin
+NC_000962.3	3611959	3613847	Rv3236c	Rv3236c	pyrazinamide
+NC_000962.3	3623159	3625110	Rv3244c	lpqB	bedaquiline,rifampicin
+NC_000962.3	3624910	3626860	Rv3245c	mtrB	bedaquiline,rifampicin
+NC_000962.3	3626663	3627924	Rv3246c	mtrA	bedaquiline,rifampicin
+NC_000962.3	3640207	3641538	Rv3261	fbiA	clofazimine,pretomanid,delamanid
+NC_000962.3	3641335	3642881	Rv3262	fbiB	clofazimine,pretomanid,delamanid
+NC_000962.3	3840194	3841620	Rv3423c	alr	cycloserine
+NC_000962.3	3877464	3879240	Rv3457c	rpoA	rifampicin
+NC_000962.3	3986612	3987299	Rv3547	ddn	delamanid,pretomanid
+NC_000962.3	4038158	4041013	Rv3596c	clpC1	pyrazinamide
+NC_000962.3	4043862	4046428	Rv3601c	panD	pyrazinamide
+NC_000962.3	4138202	4140002	Rv3696c	glpK	ethambutol,moxifloxacin,streptomycin,levofloxacin,rifampicin,isoniazid
+NC_000962.3	4237683	4243147	Rv3793	embC	ethambutol
+NC_000962.3	4242947	4246517	Rv3794	embA	ethambutol
+NC_000962.3	4246314	4249810	Rv3795	embB	ethambutol
+NC_000962.3	4266953	4269124	Rv3805c	aftB	ethambutol
+NC_000962.3	4268925	4270084	Rv3806c	ubiA	ethambutol
+NC_000962.3	4326004	4330174	Rv3854c	ethA	ethionamide
+NC_000962.3	4327328	4328199	Rv3855	ethR	ethionamide
+NC_000962.3	4338171	4338961	Rv3862c	whiB6	capreomycin,amikacin,kanamycin
+NC_000962.3	4407528	4408481	Rv3919c	gid	streptomycin
\ No newline at end of file
diff --git a/workflow/rules/choose_species.smk b/workflow/rules/choose_species.smk
index 0904c19..72b371f 100644
--- a/workflow/rules/choose_species.smk
+++ b/workflow/rules/choose_species.smk
@@ -8,6 +8,7 @@ def choose_species(wildcards):
             OUT + "/mtb_typing/contamination_check/rrs_rrl_contamination/{sample}.tsv",
             OUT + "/mtb_typing/seq_exp_json/{sample}.json",
             OUT + "/mtb_typing/consensus/{sample}.fasta",
+            OUT + "/mtb_typing/annotated_deletions/{sample}.tsv",
         ]
     else:
         return OUT + "/typing_check/{sample}/no_typing_necessary.txt"
diff --git a/workflow/rules/mtb_prepare_files.smk b/workflow/rules/mtb_prepare_files.smk
index 7238ff5..0d13f0f 100644
--- a/workflow/rules/mtb_prepare_files.smk
+++ b/workflow/rules/mtb_prepare_files.smk
@@ -217,3 +217,21 @@ DB_NAME=$(basename {input.db_dir})
 cd $WORKDIR
 $EXEC build -genbank -v $DB_NAME -config $CONFIG_NAME -dataDir . 2>&1>{log} 
         """
+
+
+rule copy_deletion_vcf:
+    input:
+        vcf=INPUT + "/deletions/{sample}.vcf",
+    output:
+        vcf=OUT + "/mtb_typing/prepared_files/deletions/{sample}.vcf",
+    log:
+        OUT + "/log/copy_deletion_vcf/{sample}.log",
+    message:
+        "Copying deletion vcf for {wildcards.sample}"
+    threads: config["threads"]["bcftools"]
+    resources:
+        mem_gb=config["mem_gb"]["bcftools"],
+    shell:
+        """
+cp {input.vcf} {output.vcf} 2>&1>{log}
+        """
diff --git a/workflow/rules/mtb_typing.smk b/workflow/rules/mtb_typing.smk
index d219e13..bac3849 100644
--- a/workflow/rules/mtb_typing.smk
+++ b/workflow/rules/mtb_typing.smk
@@ -237,6 +237,59 @@ python workflow/scripts/create_tb_json.py \
         """
 
 
+rule mtb_deletions_to_table:
+    input:
+        vcf=OUT + "/mtb_typing/prepared_files/deletions/{sample}.vcf",
+    output:
+        OUT + "/mtb_typing/annotated_deletions/raw/{sample}.tsv",
+    conda:
+        "../envs/gatk_picard.yaml"
+    container:
+        "docker://broadinstitute/gatk:4.3.0.0"
+    log:
+        OUT + "/log/mtb_deletions_to_table/{sample}.log",
+    message:
+        "Convert deletion vcf to table for {wildcards.sample}"
+    threads: config["threads"]["gatk"]
+    resources:
+        mem_gb=config["mem_gb"]["gatk"],
+    shell:
+        """
+gatk VariantsToTable \
+-V {input.vcf} \
+--show-filtered \
+-F CHROM \
+-F POS \
+-F END \
+-O {output} 2>&1>{log}
+        """
+
+
+rule mtb_annotate_deletions:
+    input:
+        bed=OUT + "/mtb_typing/annotated_deletions/raw/{sample}.tsv",
+        resistance_deletions_bed=lambda wildcards: SAMPLES[wildcards.sample][
+            "resistance_deletions_bed"
+        ],
+    output:
+        tsv=OUT + "/mtb_typing/annotated_deletions/{sample}.tsv",
+    log:
+        OUT + "/log/mtb_annotate_deletions/{sample}.log",
+    message:
+        "Annotating deletions with AMR for {wildcards.sample}"
+    threads: config["threads"]["other"]
+    resources:
+        mem_gb=config["mem_gb"]["other"],
+    shell:
+        """
+python workflow/scripts/postprocess_deletion_table.py \
+--input {input.bed} \
+--bed {input.resistance_deletions_bed} \
+--output {output} \
+2>&1>{log}
+        """
+
+
 module consensus_workflow:
     config:
         config
diff --git a/workflow/scripts/postprocess_deletion_table.py b/workflow/scripts/postprocess_deletion_table.py
new file mode 100644
index 0000000..37acd1f
--- /dev/null
+++ b/workflow/scripts/postprocess_deletion_table.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+
+import pandas as pd
+
+
+def check_deletions_in_bed(deletions, bed):
+    """
+    Check if deletions are in the reference bed file
+
+    Parameters
+    ----------
+    deletions : pd.DataFrame
+        Deletions table, should contain the columns 'CHROM', 'POS', 'END' and 'interval'
+    bed : pd.DataFrame
+        Bed file, should contain the columns 'chrom', 'start', 'end', 'locus_tag', 'gene', 'drugs' and 'interval'
+    
+    Returns
+    -------
+    pd.DataFrame
+        Annotated deletions table, containing the columns 'chrom', 'start', 'end', 'locus_tag', 'gene' and 'drugs'
+    
+    Notes
+    -----
+    Compares genomic ranges stored as pd.Interval objects, with closed bounds
+
+    """
+    deletions_annotated = []
+    for index, row in deletions.iterrows():
+        for index2, row2 in bed.iterrows():
+            if row['CHROM'] == row2['chrom']:
+                if row['interval'].overlaps(row2['interval']):
+                    dictionary_overlap = {
+                        'chrom': row['CHROM'],
+                        'start': row['POS'],
+                        'end': row['END'],
+                        'locus_tag': row2['locus_tag'],
+                        'gene': row2['gene'],
+                        'drugs': row2['drugs']
+                    }
+                    deletions_annotated.append(dictionary_overlap)
+    
+    return pd.DataFrame(deletions_annotated)                
+
+
+def main(args):
+    # Read the deletion table
+    deletions = pd.read_csv(args.input, sep='\t', header=0)
+    deletions['interval'] = deletions.apply(lambda x: pd.Interval(x['POS'], x['END'], closed='both'), axis=1)
+
+    # Read the reference bed file
+    bed = pd.read_csv(args.bed, sep='\t', header=None, names=['chrom', 'start', 'end', 'locus_tag', 'gene', 'drugs'])
+    bed['interval'] = bed.apply(lambda x: pd.Interval(x['start'], x['end'], closed='both'), axis=1)
+
+    annotated_deletions = check_deletions_in_bed(deletions, bed)
+    
+    # Write the annotatetd deletion table
+    annotated_deletions.to_csv(args.output, sep='\t', index=False)
+
+if __name__ == '__main__':
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Filter deletions that are not in the reference genome')
+    parser.add_argument('--input', help='Deletion table', required=True)
+    parser.add_argument('--bed', help='Filtered deletion table', required=True)
+    parser.add_argument('--output', help='Output file', default='deletions_annotated.tsv')
+    args = parser.parse_args()
+
+    main(args)

From 83d143fcd472bb79be964fbfafb750a14cdb5ea6 Mon Sep 17 00:00:00 2001
From: boasvdp <boasvdp@gmail.com>
Date: Thu, 18 Apr 2024 09:45:00 +0200
Subject: [PATCH 2/3] fix: dont fail when no deletions are found

---
 .../scripts/postprocess_deletion_table.py     | 22 ++++++++++++-------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/workflow/scripts/postprocess_deletion_table.py b/workflow/scripts/postprocess_deletion_table.py
index 37acd1f..a550ead 100644
--- a/workflow/scripts/postprocess_deletion_table.py
+++ b/workflow/scripts/postprocess_deletion_table.py
@@ -45,16 +45,22 @@ def check_deletions_in_bed(deletions, bed):
 def main(args):
     # Read the deletion table
     deletions = pd.read_csv(args.input, sep='\t', header=0)
-    deletions['interval'] = deletions.apply(lambda x: pd.Interval(x['POS'], x['END'], closed='both'), axis=1)
 
-    # Read the reference bed file
-    bed = pd.read_csv(args.bed, sep='\t', header=None, names=['chrom', 'start', 'end', 'locus_tag', 'gene', 'drugs'])
-    bed['interval'] = bed.apply(lambda x: pd.Interval(x['start'], x['end'], closed='both'), axis=1)
+    if deletions.shape[0] == 0:
+        with open(args.output, 'w') as f:
+            f.write('chrom\tstart\tend\tlocus_tag\tgene\tdrugs\n')
 
-    annotated_deletions = check_deletions_in_bed(deletions, bed)
-    
-    # Write the annotatetd deletion table
-    annotated_deletions.to_csv(args.output, sep='\t', index=False)
+    else:
+        deletions['interval'] = deletions.apply(lambda x: pd.Interval(x['POS'], x['END'], closed='both'), axis=1)
+
+        # Read the reference bed file
+        bed = pd.read_csv(args.bed, sep='\t', header=None, names=['chrom', 'start', 'end', 'locus_tag', 'gene', 'drugs'])
+        bed['interval'] = bed.apply(lambda x: pd.Interval(x['start'], x['end'], closed='both'), axis=1)
+
+        annotated_deletions = check_deletions_in_bed(deletions, bed)
+
+        # Write the annotatetd deletion table
+        annotated_deletions.to_csv(args.output, sep='\t', index=False)
 
 if __name__ == '__main__':
     import argparse

From d22d6bb9957da83ede3dbdd12b6fc536c20d5396 Mon Sep 17 00:00:00 2001
From: boasvdp <boasvdp@gmail.com>
Date: Thu, 18 Apr 2024 09:46:42 +0200
Subject: [PATCH 3/3] style: rm dicts from samplesheet for readability

---
 config/presets.yaml          | 18 ++++++++----------
 workflow/rules/consensus.smk | 15 +++++----------
 2 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/config/presets.yaml b/config/presets.yaml
index fdde7c6..7447590 100644
--- a/config/presets.yaml
+++ b/config/presets.yaml
@@ -6,11 +6,10 @@ mycobacterium_tuberculosis:
   resistance_variants_columns: ref_reslist,alt_reslist,gene,drug,RIVM,TP,FP,confidence,variant_common_name
   resistance_variants_ab_column: drug
   resistance_deletions_bed: files/mtb/resistance_regions_tbdb.bed
-  consensus:
-    proximity_threshold: 12
-    AF_threshold: 0.8
-    tlod_threshold: 10
-    masking_depth: 10
+  consensus_proximity_threshold: 12
+  consensus_AF_threshold: 0.8
+  consensus_tlod_threshold: 10
+  consensus_masking_depth: 10
 other:
   single_copy_bed: null
   count_mutations_bed: null
@@ -18,8 +17,7 @@ other:
   resistance_variants_csv: null
   resistance_variants_columns: null
   resistance_variants_ab_column: null
-  consensus:
-    proximity_threshold: null
-    AF_threshold: null
-    tlod_threshold: null
-    masking_depth: null
+  consensus_proximity_threshold: null
+  consensus_AF_threshold: null
+  consensus_tlod_threshold: null
+  consensus_masking_depth: null
diff --git a/workflow/rules/consensus.smk b/workflow/rules/consensus.smk
index ff90b4c..fcd7055 100644
--- a/workflow/rules/consensus.smk
+++ b/workflow/rules/consensus.smk
@@ -23,8 +23,7 @@ rule mark_variants_by_proximity:
     log:
         "",
     params:
-        proximity_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus"][
-            "proximity_threshold"
+        proximity_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_proximity_threshold"
         ],
     message:
         "Marking regions with variants too close together for {wildcards.sample}"
@@ -51,8 +50,7 @@ rule subset_fixed_snps_from_vcf:
     log:
         "",
     params:
-        AF_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus"][
-            "AF_threshold"
+        AF_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_AF_threshold"
         ],
     message:
         "Subsetting fixed snps from vcf for {wildcards.sample}"
@@ -81,11 +79,9 @@ rule subset_low_confidence_variants_from_vcf:
     log:
         "",
     params:
-        AF_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus"][
-            "AF_threshold"
+        AF_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_AF_threshold"
         ],
-        tlod_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus"][
-            "tlod_threshold"
+        tlod_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_tlod_threshold"
         ],
     message:
         "Subsetting unfixed snps from vcf for {wildcards.sample}"
@@ -171,8 +167,7 @@ rule mask_fasta_on_depth_from_bam:
     message:
         "Masking low depth regions of {wildcards.sample} consensus genome"
     params:
-        masking_depth=lambda wildcards: SAMPLES[wildcards.sample]["consensus"][
-            "masking_depth"
+        masking_depth=lambda wildcards: SAMPLES[wildcards.sample]["consensus_masking_depth"
         ],
     threads: config["threads"]["bedtools"]
     resources: