Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Screen deletions #5

Merged
merged 3 commits into from
Jul 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,9 @@ localrules:
aggregate_species,
copy_sample_bam,
copy_sample_vcf,
copy_deletion_vcf,
copy_ref,
prepare_ab_table,
generate_ab_table_header,
mtb_filter_res_table_positions,
postprocess_variant_table,
mtb_make_json,
audit_version_gatk,
audit_version_biopython,
Expand Down
19 changes: 9 additions & 10 deletions config/presets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,19 @@ mycobacterium_tuberculosis:
resistance_variants_csv: /mnt/db/juno/variant-typing/mtb/reslist.csv
resistance_variants_columns: ref_reslist,alt_reslist,gene,drug,RIVM,TP,FP,confidence,variant_common_name
resistance_variants_ab_column: drug
consensus:
proximity_threshold: 12
AF_threshold: 0.8
tlod_threshold: 10
masking_depth: 10
resistance_deletions_bed: files/mtb/resistance_regions_tbdb.bed
consensus_proximity_threshold: 12
consensus_AF_threshold: 0.8
consensus_tlod_threshold: 10
consensus_masking_depth: 10
other:
single_copy_bed: null
count_mutations_bed: null
reference_genbank: null
resistance_variants_csv: null
resistance_variants_columns: null
resistance_variants_ab_column: null
consensus:
proximity_threshold: null
AF_threshold: null
tlod_threshold: null
masking_depth: null
consensus_proximity_threshold: null
consensus_AF_threshold: null
consensus_tlod_threshold: null
consensus_masking_depth: null
73 changes: 73 additions & 0 deletions files/mtb/resistance_regions_tbdb.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
NC_000962.3 1 1524 Rv0001 dnaA isoniazid
NC_000962.3 4933 7267 Rv0005 gyrB levofloxacin,moxifloxacin
NC_000962.3 7068 9818 Rv0006 gyrA levofloxacin,moxifloxacin
NC_000962.3 13133 13911 Rv0010c Rv0010c isoniazid
NC_000962.3 490545 491793 Rv0407 fgd1 clofazimine,pretomanid,delamanid
NC_000962.3 574479 576790 Rv0486 mshA ethionamide,isoniazid
NC_000962.3 619500 620865 Rv0529 ccsA capreomycin,amikacin,kanamycin
NC_000962.3 656010 657739 Rv0565c Rv0565c ethionamide
NC_000962.3 731680 732406 Rv0635 hadA isoniazid
NC_000962.3 733853 734970 Rv0639 nusG rifampicin
NC_000962.3 759342 763325 Rv0667 rpoB rifampicin
NC_000962.3 763127 767320 Rv0668 rpoC rifampicin
NC_000962.3 775586 778680 Rv0676c mmpL5 clofazimine,bedaquiline
NC_000962.3 778477 779624 Rv0677c mmpS5 clofazimine,bedaquiline
NC_000962.3 778790 779487 Rv0678 mmpR5 clofazimine,bedaquiline
NC_000962.3 781126 781934 Rv0682 rpsL streptomycin
NC_000962.3 800106 801462 Rv0701 rplC linezolid
NC_000962.3 1253074 1254783 Rv1129c Rv1129c moxifloxacin,levofloxacin,rifampicin,isoniazid
NC_000962.3 1302606 1305501 Rv1173 fbiC clofazimine,pretomanid,delamanid
NC_000962.3 1364162 1365186 Rv1221 sigE pyrazinamide
NC_000962.3 1406081 1407604 Rv1258c Rv1258c streptomycin,isoniazid,pyrazinamide
NC_000962.3 1416181 1418048 Rv1267c embR ethambutol
NC_000962.3 1460802 1461290 Rv1305 atpE bedaquiline
NC_000962.3 1471498 1473382 EBG00000313325 rrs streptomycin,kanamycin,capreomycin,amikacin
NC_000962.3 1473408 1476795 EBG00000313339 rrl capreomycin,linezolid
NC_000962.3 1673148 1674183 Rv1483 fabG1 ethionamide,isoniazid
NC_000962.3 1673189 1675011 Rv1484 inhA ethionamide,isoniazid
NC_000962.3 1833247 1834987 Rv1630 rpsA pyrazinamide
NC_000962.3 1853358 1854388 Rv1644 tsnR linezolid
NC_000962.3 1917506 1918746 Rv1694 tlyA capreomycin
NC_000962.3 2062809 2065010 Rv1819c bacA streptomycin,kanamycin,capreomycin,amikacin
NC_000962.3 2101651 2103337 Rv1854c ndh ethionamide,delamanid,isoniazid
NC_000962.3 2153889 2156842 Rv1908c katG isoniazid
NC_000962.3 2167649 2170934 Rv1918c PPE35 pyrazinamide
NC_000962.3 2221719 2223825 Rv1979c Rv1979c clofazimine,bedaquiline
NC_000962.3 2288681 2290323 Rv2043c pncA pyrazinamide
NC_000962.3 2517915 2519365 Rv2245 kasA isoniazid
NC_000962.3 2714124 2715832 Rv2416c eis amikacin,kanamycin
NC_000962.3 2725899 2726780 Rv2428 ahpC isoniazid
NC_000962.3 2746135 2747798 Rv2447c folC para-aminosalicylic_acid
NC_000962.3 2782366 2786169 Rv2477c Rv2477c ethambutol,moxifloxacin,streptomycin,levofloxacin,kanamycin,rifampicin,amikacin
NC_000962.3 2859300 2860640 Rv2535c pepQ clofazimine,bedaquiline
NC_000962.3 2986639 2987615 Rv2671 ribD para-aminosalicylic_acid
NC_000962.3 2995772 2996737 Rv2680 Rv2680 capreomycin
NC_000962.3 2996539 2998055 Rv2681 Rv2681 capreomycin
NC_000962.3 3064515 3067372 Rv2752c Rv2752c ethambutol,moxifloxacin,levofloxacin,rifampicin,isoniazid
NC_000962.3 3067193 3068161 Rv2754c thyX para-aminosalicylic_acid
NC_000962.3 3073680 3074671 Rv2764c thyA para-aminosalicylic_acid
NC_000962.3 3086620 3087935 Rv2780 ald cycloserine
NC_000962.3 3338868 3339762 Rv2983 fbiD clofazimine,pretomanid,delamanid
NC_000962.3 3448253 3449991 Rv3083 Rv3083 ethionamide
NC_000962.3 3568401 3569280 Rv3197A whiB7 streptomycin,amikacin,kanamycin
NC_000962.3 3611959 3613847 Rv3236c Rv3236c pyrazinamide
NC_000962.3 3623159 3625110 Rv3244c lpqB bedaquiline,rifampicin
NC_000962.3 3624910 3626860 Rv3245c mtrB bedaquiline,rifampicin
NC_000962.3 3626663 3627924 Rv3246c mtrA bedaquiline,rifampicin
NC_000962.3 3640207 3641538 Rv3261 fbiA clofazimine,pretomanid,delamanid
NC_000962.3 3641335 3642881 Rv3262 fbiB clofazimine,pretomanid,delamanid
NC_000962.3 3840194 3841620 Rv3423c alr cycloserine
NC_000962.3 3877464 3879240 Rv3457c rpoA rifampicin
NC_000962.3 3986612 3987299 Rv3547 ddn delamanid,pretomanid
NC_000962.3 4038158 4041013 Rv3596c clpC1 pyrazinamide
NC_000962.3 4043862 4046428 Rv3601c panD pyrazinamide
NC_000962.3 4138202 4140002 Rv3696c glpK ethambutol,moxifloxacin,streptomycin,levofloxacin,rifampicin,isoniazid
NC_000962.3 4237683 4243147 Rv3793 embC ethambutol
NC_000962.3 4242947 4246517 Rv3794 embA ethambutol
NC_000962.3 4246314 4249810 Rv3795 embB ethambutol
NC_000962.3 4266953 4269124 Rv3805c aftB ethambutol
NC_000962.3 4268925 4270084 Rv3806c ubiA ethambutol
NC_000962.3 4326004 4330174 Rv3854c ethA ethionamide
NC_000962.3 4327328 4328199 Rv3855 ethR ethionamide
NC_000962.3 4338171 4338961 Rv3862c whiB6 capreomycin,amikacin,kanamycin
NC_000962.3 4407528 4408481 Rv3919c gid streptomycin
1 change: 1 addition & 0 deletions workflow/rules/choose_species.smk
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ def choose_species(wildcards):
OUT + "/mtb_typing/contamination_check/rrs_rrl_contamination/{sample}.tsv",
OUT + "/mtb_typing/seq_exp_json/{sample}.json",
OUT + "/mtb_typing/consensus/{sample}.fasta",
OUT + "/mtb_typing/annotated_deletions/{sample}.tsv",
]
else:
return OUT + "/typing_check/{sample}/no_typing_necessary.txt"
Expand Down
15 changes: 5 additions & 10 deletions workflow/rules/consensus.smk
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ rule mark_variants_by_proximity:
log:
"",
params:
proximity_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus"][
"proximity_threshold"
proximity_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_proximity_threshold"
],
message:
"Marking regions with variants too close together for {wildcards.sample}"
Expand All @@ -51,8 +50,7 @@ rule subset_fixed_snps_from_vcf:
log:
"",
params:
AF_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus"][
"AF_threshold"
AF_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_AF_threshold"
],
message:
"Subsetting fixed snps from vcf for {wildcards.sample}"
Expand Down Expand Up @@ -81,11 +79,9 @@ rule subset_low_confidence_variants_from_vcf:
log:
"",
params:
AF_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus"][
"AF_threshold"
AF_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_AF_threshold"
],
tlod_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus"][
"tlod_threshold"
tlod_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_tlod_threshold"
],
message:
"Subsetting unfixed snps from vcf for {wildcards.sample}"
Expand Down Expand Up @@ -171,8 +167,7 @@ rule mask_fasta_on_depth_from_bam:
message:
"Masking low depth regions of {wildcards.sample} consensus genome"
params:
masking_depth=lambda wildcards: SAMPLES[wildcards.sample]["consensus"][
"masking_depth"
masking_depth=lambda wildcards: SAMPLES[wildcards.sample]["consensus_masking_depth"
],
threads: config["threads"]["bedtools"]
resources:
Expand Down
18 changes: 18 additions & 0 deletions workflow/rules/mtb_prepare_files.smk
Original file line number Diff line number Diff line change
Expand Up @@ -217,3 +217,21 @@ DB_NAME=$(basename {input.db_dir})
cd $WORKDIR
$EXEC build -genbank -v $DB_NAME -config $CONFIG_NAME -dataDir . 2>&1>{log}
"""


rule copy_deletion_vcf:
input:
vcf=INPUT + "/deletions/{sample}.vcf",
output:
vcf=OUT + "/mtb_typing/prepared_files/deletions/{sample}.vcf",
log:
OUT + "/log/copy_deletion_vcf/{sample}.log",
message:
"Copying deletion vcf for {wildcards.sample}"
threads: config["threads"]["bcftools"]
resources:
mem_gb=config["mem_gb"]["bcftools"],
shell:
"""
cp {input.vcf} {output.vcf} 2>&1>{log}
"""
53 changes: 53 additions & 0 deletions workflow/rules/mtb_typing.smk
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,59 @@ python workflow/scripts/create_tb_json.py \
"""


rule mtb_deletions_to_table:
input:
vcf=OUT + "/mtb_typing/prepared_files/deletions/{sample}.vcf",
output:
OUT + "/mtb_typing/annotated_deletions/raw/{sample}.tsv",
conda:
"../envs/gatk_picard.yaml"
container:
"docker://broadinstitute/gatk:4.3.0.0"
log:
OUT + "/log/mtb_deletions_to_table/{sample}.log",
message:
"Convert deletion vcf to table for {wildcards.sample}"
threads: config["threads"]["gatk"]
resources:
mem_gb=config["mem_gb"]["gatk"],
shell:
"""
gatk VariantsToTable \
-V {input.vcf} \
--show-filtered \
-F CHROM \
-F POS \
-F END \
-O {output} 2>&1>{log}
"""


rule mtb_annotate_deletions:
input:
bed=OUT + "/mtb_typing/annotated_deletions/raw/{sample}.tsv",
resistance_deletions_bed=lambda wildcards: SAMPLES[wildcards.sample][
"resistance_deletions_bed"
],
output:
tsv=OUT + "/mtb_typing/annotated_deletions/{sample}.tsv",
log:
OUT + "/log/mtb_annotate_deletions/{sample}.log",
message:
"Annotating deletions with AMR for {wildcards.sample}"
threads: config["threads"]["other"]
resources:
mem_gb=config["mem_gb"]["other"],
shell:
"""
python workflow/scripts/postprocess_deletion_table.py \
--input {input.bed} \
--bed {input.resistance_deletions_bed} \
--output {output} \
2>&1>{log}
"""


module consensus_workflow:
config:
config
Expand Down
74 changes: 74 additions & 0 deletions workflow/scripts/postprocess_deletion_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/usr/bin/env python3

import pandas as pd


def check_deletions_in_bed(deletions, bed):
"""
Check if deletions are in the reference bed file

Parameters
----------
deletions : pd.DataFrame
Deletions table, should contain the columns 'CHROM', 'POS', 'END' and 'interval'
bed : pd.DataFrame
Bed file, should contain the columns 'chrom', 'start', 'end', 'locus_tag', 'gene', 'drugs' and 'interval'

Returns
-------
pd.DataFrame
Annotated deletions table, containing the columns 'chrom', 'start', 'end', 'locus_tag', 'gene' and 'drugs'

Notes
-----
Compares genomic ranges stored as pd.Interval objects, with closed bounds

"""
deletions_annotated = []
for index, row in deletions.iterrows():
for index2, row2 in bed.iterrows():
if row['CHROM'] == row2['chrom']:
if row['interval'].overlaps(row2['interval']):
dictionary_overlap = {
'chrom': row['CHROM'],
'start': row['POS'],
'end': row['END'],
'locus_tag': row2['locus_tag'],
'gene': row2['gene'],
'drugs': row2['drugs']
}
deletions_annotated.append(dictionary_overlap)

return pd.DataFrame(deletions_annotated)


def main(args):
# Read the deletion table
deletions = pd.read_csv(args.input, sep='\t', header=0)

if deletions.shape[0] == 0:
with open(args.output, 'w') as f:
f.write('chrom\tstart\tend\tlocus_tag\tgene\tdrugs\n')

else:
deletions['interval'] = deletions.apply(lambda x: pd.Interval(x['POS'], x['END'], closed='both'), axis=1)

# Read the reference bed file
bed = pd.read_csv(args.bed, sep='\t', header=None, names=['chrom', 'start', 'end', 'locus_tag', 'gene', 'drugs'])
bed['interval'] = bed.apply(lambda x: pd.Interval(x['start'], x['end'], closed='both'), axis=1)

annotated_deletions = check_deletions_in_bed(deletions, bed)

# Write the annotatetd deletion table
annotated_deletions.to_csv(args.output, sep='\t', index=False)

if __name__ == '__main__':
import argparse

parser = argparse.ArgumentParser(description='Filter deletions that are not in the reference genome')
parser.add_argument('--input', help='Deletion table', required=True)
parser.add_argument('--bed', help='Filtered deletion table', required=True)
parser.add_argument('--output', help='Output file', default='deletions_annotated.tsv')
args = parser.parse_args()

main(args)
Loading