style: formatting

RIVM-bioinformatics · Sep 12, 2024 · 513d8ab · 513d8ab
1 parent e6dbab7
commit 513d8ab
Show file tree

Hide file tree

Showing 3 changed files with 36 additions and 11 deletions.
diff --git a/workflow/rules/mtb_prepare_files.smk b/workflow/rules/mtb_prepare_files.smk
@@ -45,24 +45,25 @@ rule copy_sample_vcf:
 cp {input.vcf} {output.vcf} 2>&1>{log}
         """
 
+
 # This rule adds VCF entries needed for lineage typing to the normal VCF
 # The VCF entries are only added if missing and contain only reference calls (0/0 Genotype)
 # fast-lineage-caller only considers variants in the VCF so these need to be added explicitly
 rule prepare_lineage_typing_vcf:
     input:
         lineage4_vcf="files/mtb/lineage4_lipworth.vcf",
-        vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf"
+        vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf",
     output:
         vcf=temp(OUT + "/mtb_typing/prepared_files/lineage_typing_vcf/{sample}.vcf"),
     conda:
         "../envs/gatk_picard.yaml"
     container:
         "docker://staphb/bedtools:2.30.0"
     log:
-        OUT + "/log/prepare_lineage_typing_vcf/{sample}.log"
+        OUT + "/log/prepare_lineage_typing_vcf/{sample}.log",
     threads: config["threads"]["bedtools"]
     resources:
-        mem_gb=config["mem_gb"]["bedtools"]
+        mem_gb=config["mem_gb"]["bedtools"],
     shell:
         """
 cat {input.vcf} <(bedtools intersect -a {input.lineage4_vcf} -b {input.vcf} -v) | \

diff --git a/workflow/rules/mtb_typing.smk b/workflow/rules/mtb_typing.smk
@@ -23,6 +23,7 @@ fast-lineage-caller \
 2>&1>{log}
         """
 
+
 rule mtb_lineage_id_custom:
     input:
         vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf",
@@ -40,7 +41,7 @@ rule mtb_lineage_id_custom:
     resources:
         mem_gb=config["mem_gb"]["fast-lineage-caller"],
     params:
-        scheme = "files/mtb/snpCL_scheme.tsv",
+        scheme="files/mtb/snpCL_scheme.tsv",
     shell:
         """
 fast-lineage-caller \
@@ -50,13 +51,14 @@ fast-lineage-caller \
 2>&1>{log}
         """
 
+
 rule combine_lineage_typing:
     input:
         lineage_standard=OUT + "/mtb_typing/lineage_call_standard/{sample}.tsv",
         lineage_custom=OUT + "/mtb_typing/lineage_call_custom/{sample}.tsv",
     output:
         tsv=OUT + "/mtb_typing/lineage_call/{sample}.tsv",
-    conda:  
+    conda:
         "../envs/scripts.yaml"
     log:
         OUT + "/log/combine_lineage_typing/{sample}.log",

diff --git a/workflow/scripts/combine_lineage_calls.py b/workflow/scripts/combine_lineage_calls.py
@@ -5,7 +5,18 @@
 import re
 
 # define lineage4 sublineages from PMID 30789126
-lineage4_sublineages = ["ghana", "xtype", "haarlem", "ural", "tur", "lam", "stype", "uganda", "cameroon"]
+lineage4_sublineages = [
+    "ghana",
+    "xtype",
+    "haarlem",
+    "ural",
+    "tur",
+    "lam",
+    "stype",
+    "uganda",
+    "cameroon",
+]
+
 
 def parse_str_to_dict(lineage_str):
     lineages = lineage_str.split(",")
@@ -21,6 +32,7 @@ def parse_str_to_dict(lineage_str):
         parsed_data[lineage_name] = (snp_counts, snp_total, snp_ratio)
     return parsed_data
 
+
 def get_highest_ratio(lineage_dict):
     if len(lineage_dict) == 1:
         return list(lineage_dict.keys())
@@ -30,24 +42,34 @@ def get_highest_ratio(lineage_dict):
         # get the highest ratio
         max_ratio = max([lineage_dict[lineage][2] for lineage in lineage_dict.keys()])
         # get the lineage(s) with the highest ratio
-        max_lineages = [lineage for lineage in lineage_dict.keys() if lineage_dict[lineage][2] == max_ratio]
+        max_lineages = [
+            lineage
+            for lineage in lineage_dict.keys()
+            if lineage_dict[lineage][2] == max_ratio
+        ]
         return max_lineages
 
+
 def decide_type(lineage_dict, sublineages):
     output = []
     max_lineages = get_highest_ratio(lineage_dict)
     for lineage in max_lineages:
-        output.append(f"{lineage}({lineage_dict[lineage][0]}/{lineage_dict[lineage][1]})")
+        output.append(
+            f"{lineage}({lineage_dict[lineage][0]}/{lineage_dict[lineage][1]})"
+        )
         if lineage == "lineage4":
             # find the next highest after lineage4
             lineage_dict_copy = lineage_dict.copy()
             lineage_dict_copy.pop("lineage4")
             max_lineages_without_lineage4 = get_highest_ratio(lineage_dict_copy)
             for sublineage in max_lineages_without_lineage4:
                 if sublineage in sublineages:
-                    output.append(f"{sublineage}({lineage_dict[sublineage][0]}/{lineage_dict[sublineage][1]})")
+                    output.append(
+                        f"{sublineage}({lineage_dict[sublineage][0]}/{lineage_dict[sublineage][1]})"
+                    )
     return ",".join(sorted(output))
 
+
 def parse_value(value):
     """
     Parse a column from fast-lineage-caller
@@ -72,6 +94,7 @@ def parse_value(value):
         top_hit = decide_type(parsed_lineages, lineage4_sublineages)
     return top_hit
 
+
 def parse_counts(df_input):
     df = df_input.copy()
     df.columns = [f"{col}_counts" if col != "Isolate" else col for col in df.columns]
@@ -94,7 +117,6 @@ def main(args):
 
     df_custom = pd.read_csv(args.custom, sep="\t")
 
-
     # merge all dataframes horizontally on Isolate
     merged_df = pd.merge(df_standard_parsed, df_custom, on="Isolate")
 
@@ -112,4 +134,4 @@ def main(args):
 
     args = parser.parse_args()
 
-    main(args)
+    main(args)