Skip to content

Commit

Permalink
style: formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
boasvdp committed Sep 12, 2024
1 parent e6dbab7 commit 513d8ab
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 11 deletions.
7 changes: 4 additions & 3 deletions workflow/rules/mtb_prepare_files.smk
Original file line number Diff line number Diff line change
Expand Up @@ -45,24 +45,25 @@ rule copy_sample_vcf:
cp {input.vcf} {output.vcf} 2>&1>{log}
"""


# This rule adds VCF entries needed for lineage typing to the normal VCF
# The VCF entries are only added if missing and contain only reference calls (0/0 Genotype)
# fast-lineage-caller only considers variants in the VCF so these need to be added explicitly
rule prepare_lineage_typing_vcf:
input:
lineage4_vcf="files/mtb/lineage4_lipworth.vcf",
vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf"
vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf",
output:
vcf=temp(OUT + "/mtb_typing/prepared_files/lineage_typing_vcf/{sample}.vcf"),
conda:
"../envs/gatk_picard.yaml"
container:
"docker://staphb/bedtools:2.30.0"
log:
OUT + "/log/prepare_lineage_typing_vcf/{sample}.log"
OUT + "/log/prepare_lineage_typing_vcf/{sample}.log",
threads: config["threads"]["bedtools"]
resources:
mem_gb=config["mem_gb"]["bedtools"]
mem_gb=config["mem_gb"]["bedtools"],
shell:
"""
cat {input.vcf} <(bedtools intersect -a {input.lineage4_vcf} -b {input.vcf} -v) | \
Expand Down
6 changes: 4 additions & 2 deletions workflow/rules/mtb_typing.smk
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ fast-lineage-caller \
2>&1>{log}
"""


rule mtb_lineage_id_custom:
input:
vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf",
Expand All @@ -40,7 +41,7 @@ rule mtb_lineage_id_custom:
resources:
mem_gb=config["mem_gb"]["fast-lineage-caller"],
params:
scheme = "files/mtb/snpCL_scheme.tsv",
scheme="files/mtb/snpCL_scheme.tsv",
shell:
"""
fast-lineage-caller \
Expand All @@ -50,13 +51,14 @@ fast-lineage-caller \
2>&1>{log}
"""


rule combine_lineage_typing:
input:
lineage_standard=OUT + "/mtb_typing/lineage_call_standard/{sample}.tsv",
lineage_custom=OUT + "/mtb_typing/lineage_call_custom/{sample}.tsv",
output:
tsv=OUT + "/mtb_typing/lineage_call/{sample}.tsv",
conda:
conda:
"../envs/scripts.yaml"
log:
OUT + "/log/combine_lineage_typing/{sample}.log",
Expand Down
34 changes: 28 additions & 6 deletions workflow/scripts/combine_lineage_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,18 @@
import re

# define lineage4 sublineages from PMID 30789126
lineage4_sublineages = ["ghana", "xtype", "haarlem", "ural", "tur", "lam", "stype", "uganda", "cameroon"]
lineage4_sublineages = [
"ghana",
"xtype",
"haarlem",
"ural",
"tur",
"lam",
"stype",
"uganda",
"cameroon",
]


def parse_str_to_dict(lineage_str):
lineages = lineage_str.split(",")
Expand All @@ -21,6 +32,7 @@ def parse_str_to_dict(lineage_str):
parsed_data[lineage_name] = (snp_counts, snp_total, snp_ratio)
return parsed_data


def get_highest_ratio(lineage_dict):
if len(lineage_dict) == 1:
return list(lineage_dict.keys())
Expand All @@ -30,24 +42,34 @@ def get_highest_ratio(lineage_dict):
# get the highest ratio
max_ratio = max([lineage_dict[lineage][2] for lineage in lineage_dict.keys()])
# get the lineage(s) with the highest ratio
max_lineages = [lineage for lineage in lineage_dict.keys() if lineage_dict[lineage][2] == max_ratio]
max_lineages = [
lineage
for lineage in lineage_dict.keys()
if lineage_dict[lineage][2] == max_ratio
]
return max_lineages


def decide_type(lineage_dict, sublineages):
output = []
max_lineages = get_highest_ratio(lineage_dict)
for lineage in max_lineages:
output.append(f"{lineage}({lineage_dict[lineage][0]}/{lineage_dict[lineage][1]})")
output.append(
f"{lineage}({lineage_dict[lineage][0]}/{lineage_dict[lineage][1]})"
)
if lineage == "lineage4":
# find the next highest after lineage4
lineage_dict_copy = lineage_dict.copy()
lineage_dict_copy.pop("lineage4")
max_lineages_without_lineage4 = get_highest_ratio(lineage_dict_copy)
for sublineage in max_lineages_without_lineage4:
if sublineage in sublineages:
output.append(f"{sublineage}({lineage_dict[sublineage][0]}/{lineage_dict[sublineage][1]})")
output.append(
f"{sublineage}({lineage_dict[sublineage][0]}/{lineage_dict[sublineage][1]})"
)
return ",".join(sorted(output))


def parse_value(value):
"""
Parse a column from fast-lineage-caller
Expand All @@ -72,6 +94,7 @@ def parse_value(value):
top_hit = decide_type(parsed_lineages, lineage4_sublineages)
return top_hit


def parse_counts(df_input):
df = df_input.copy()
df.columns = [f"{col}_counts" if col != "Isolate" else col for col in df.columns]
Expand All @@ -94,7 +117,6 @@ def main(args):

df_custom = pd.read_csv(args.custom, sep="\t")


# merge all dataframes horizontally on Isolate
merged_df = pd.merge(df_standard_parsed, df_custom, on="Isolate")

Expand All @@ -112,4 +134,4 @@ def main(args):

args = parser.parse_args()

main(args)
main(args)

0 comments on commit 513d8ab

Please sign in to comment.