From 2d075d7a6bc8897425f3d28fc8047ccbff8e65bb Mon Sep 17 00:00:00 2001
From: boasvdp <boasvdp@gmail.com>
Date: Fri, 26 Jul 2024 16:59:12 +0200
Subject: [PATCH 1/9] feat: add detection of snpCLs

---
 Snakefile                                 |  1 +
 files/mtb/snpCL_scheme.tsv                | 31 +++++++++++++++
 workflow/envs/scripts.yaml                |  4 ++
 workflow/rules/mtb_typing.smk             | 48 ++++++++++++++++++++++-
 workflow/scripts/combine_lineage_calls.py | 26 ++++++++++++
 5 files changed, 109 insertions(+), 1 deletion(-)
 create mode 100644 files/mtb/snpCL_scheme.tsv
 create mode 100644 workflow/envs/scripts.yaml
 create mode 100644 workflow/scripts/combine_lineage_calls.py

diff --git a/Snakefile b/Snakefile
index 8719d8e..4c6efff 100644
--- a/Snakefile
+++ b/Snakefile
@@ -26,6 +26,7 @@ localrules:
     copy_sample_vcf,
     copy_deletion_vcf,
     copy_ref,
+    combine_lineage_typing,
     mtb_filter_res_table_positions,
     mtb_make_json,
     audit_version_gatk,
diff --git a/files/mtb/snpCL_scheme.tsv b/files/mtb/snpCL_scheme.tsv
new file mode 100644
index 0000000..0a7fe4c
--- /dev/null
+++ b/files/mtb/snpCL_scheme.tsv
@@ -0,0 +1,31 @@
+#lineage	position	allele_change	tag
+snpCL_1	1157317	C/T	EUSeqMyTB_snpCL
+snpCL_2	4327088	A/C	EUSeqMyTB_snpCL
+snpCL_3	130881	C/G	EUSeqMyTB_snpCL
+snpCL_4	1485300	C/G	EUSeqMyTB_snpCL
+snpCL_5	1208477	C/G	EUSeqMyTB_snpCL
+snpCL_6	1547828	G/C	EUSeqMyTB_snpCL
+snpCL_7	2971513	C/G	EUSeqMyTB_snpCL
+snpCL_8	1039921	A/G	EUSeqMyTB_snpCL
+snpCL_10	211015	G/A	EUSeqMyTB_snpCL
+snpCL_11	1895566	C/G	EUSeqMyTB_snpCL
+snpCL_12	1914071	G/A	EUSeqMyTB_snpCL
+snpCL_13	1810244	G/A	EUSeqMyTB_snpCL
+snpCL_14	4155977	G/A	EUSeqMyTB_snpCL
+snpCL_15	2399093	G/A	EUSeqMyTB_snpCL
+snpCL_16	1008074	G/A	EUSeqMyTB_snpCL
+snpCL_17	4284172	C/T	EUSeqMyTB_snpCL
+snpCL_18	766707	A/G	EUSeqMyTB_snpCL
+snpCL_19	3169491	C/A	EUSeqMyTB_snpCL
+snpCL_20	996197	A/C	EUSeqMyTB_snpCL
+snpCL_22	1231934	T/G	EUSeqMyTB_snpCL
+snpCL_23	1398622	G/A	EUSeqMyTB_snpCL
+snpCL_25	1068731	G/C	EUSeqMyTB_snpCL
+snpCL_27	3031515	T/C	EUSeqMyTB_snpCL
+snpCL_28	1028437	C/T	EUSeqMyTB_snpCL
+snpCL_29	3640351	C/T	EUSeqMyTB_snpCL
+snpCL_30	3223901	G/A	EUSeqMyTB_snpCL
+snpCL_31	1946519	C/T	EUSeqMyTB_snpCL
+snpCL_32	3088899	G/A	EUSeqMyTB_snpCL
+snpCL_33	1921877	G/A	EUSeqMyTB_snpCL
+snpCL_34	1760095	C/G	EUSeqMyTB_snpCL
diff --git a/workflow/envs/scripts.yaml b/workflow/envs/scripts.yaml
new file mode 100644
index 0000000..8d303cc
--- /dev/null
+++ b/workflow/envs/scripts.yaml
@@ -0,0 +1,4 @@
+channels:
+- conda-forge
+dependencies:
+- pandas=2.2.*
\ No newline at end of file
diff --git a/workflow/rules/mtb_typing.smk b/workflow/rules/mtb_typing.smk
index bac3849..1528755 100644
--- a/workflow/rules/mtb_typing.smk
+++ b/workflow/rules/mtb_typing.smk
@@ -2,7 +2,7 @@ rule mtb_lineage_id:
     input:
         vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf",
     output:
-        tsv=OUT + "/mtb_typing/lineage_call/{sample}.tsv",
+        tsv=temp(OUT + "/mtb_typing/lineage_call_standard/{sample}.tsv"),
     conda:
         "../envs/fast_lineage_caller.yaml"
     container:
@@ -22,6 +22,52 @@ fast-lineage-caller \
 2>&1>{log}
         """
 
+rule mtb_lineage_id_custom:
+    input:
+        vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf",
+    output:
+        tsv=temp(OUT + "/mtb_typing/lineage_call_custom/{sample}.tsv"),
+    conda:
+        "../envs/fast_lineage_caller.yaml"
+    container:
+        "docker://ghcr.io/boasvdp/fast_lineage_caller:1.0.0"
+    log:
+        OUT + "/log/mtb_lineage_id/{sample}.log",
+    message:
+        "Typing Mtb lineage for {wildcards.sample}"
+    threads: config["threads"]["fast-lineage-caller"]
+    resources:
+        mem_gb=config["mem_gb"]["fast-lineage-caller"],
+    params:
+        scheme = "files/mtb/snpCL_scheme.tsv",
+    shell:
+        """
+fast-lineage-caller \
+--out {output.tsv} \
+--scheme {params.scheme} \
+{input.vcf} \
+2>&1>{log}
+        """
+
+rule combine_lineage_typing:
+    input:
+        lineage_standard=OUT + "/mtb_typing/lineage_call_standard/{sample}.tsv",
+        lineage_custom=OUT + "/mtb_typing/lineage_call_custom/{sample}.tsv",
+    output:
+        tsv=OUT + "/mtb_typing/lineage_call/{sample}.tsv",
+    conda:  
+        "../envs/scripts.yaml"
+    log:
+        OUT + "/log/combine_lineage_typing/{sample}.log",
+    message:
+        "Combining lineage calls for {wildcards.sample}"
+    shell:
+        """
+python workflow/scripts/combine_lineage_calls.py \
+{input.lineage_standard} {input.lineage_custom} \
+--output {output.tsv} 2>&1>{log}
+        """
+
 
 rule mtb_coll_contamination:
     input:
diff --git a/workflow/scripts/combine_lineage_calls.py b/workflow/scripts/combine_lineage_calls.py
new file mode 100644
index 0000000..50d411a
--- /dev/null
+++ b/workflow/scripts/combine_lineage_calls.py
@@ -0,0 +1,26 @@
+# merge two dataframes and assert a single row is left
+
+import argparse
+import pandas as pd
+
+parser = argparse.ArgumentParser(description="Combine lineage calls")
+
+parser.add_argument("lineage_calls", help="Path to lineage calls", nargs="+")
+parser.add_argument("--output", help="Path to output file")
+
+args = parser.parse_args()
+
+# read in all lineage calls
+dfs = []
+for lineage_call in args.lineage_calls:
+    df = pd.read_csv(lineage_call, sep="\t")
+    dfs.append(df)
+
+# merge all dataframes horizontally on Isolate
+merged_df = dfs[0]
+for df in dfs[1:]:
+    merged_df = pd.merge(merged_df, df, on="Isolate")
+
+assert merged_df.shape[0] == 1
+
+merged_df.to_csv(args.output, sep="\t", index=False)

From fbd67e63bf67a7ae8f010c29a3df97626b43ba8f Mon Sep 17 00:00:00 2001
From: boasvdp <boasvdp@gmail.com>
Date: Fri, 26 Jul 2024 16:59:26 +0200
Subject: [PATCH 2/9] deps: update juno-lib

---
 envs/juno_variant_typing.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/envs/juno_variant_typing.yaml b/envs/juno_variant_typing.yaml
index 4477df6..36b2103 100644
--- a/envs/juno_variant_typing.yaml
+++ b/envs/juno_variant_typing.yaml
@@ -12,4 +12,4 @@ dependencies:
   - pip=23.*
   - python=3.11.*
   - pip:
-    - "--editable=git+https://github.com/RIVM-bioinformatics/juno-library.git@v2.2.0#egg=juno_library"
+    - "--editable=git+https://github.com/RIVM-bioinformatics/juno-library.git@v2.2.1#egg=juno_library"

From 5d5991adaf45cc7a19a9bf9010170fae928b801f Mon Sep 17 00:00:00 2001
From: boasvdp <boasvdp@gmail.com>
Date: Fri, 26 Jul 2024 16:59:48 +0200
Subject: [PATCH 3/9] fix: generalize binding paths when using container

---
 juno_variant_typing.py | 53 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 51 insertions(+), 2 deletions(-)

diff --git a/juno_variant_typing.py b/juno_variant_typing.py
index c688a1b..8d5ab0f 100644
--- a/juno_variant_typing.py
+++ b/juno_variant_typing.py
@@ -138,11 +138,27 @@ def setup(self) -> None:
         self.set_presets()
 
         if self.snakemake_args["use_singularity"]:
+
+            paths_from_presets = []
+
+            for key, value in self.species_presets.items():
+                try:
+                    path = Path(value)
+                except TypeError:
+                    continue
+                if path.exists() & path.is_absolute():
+                    paths_from_presets.append(path.parent.resolve())
+
+            unique_bind_paths = self.get_unique_bind_paths(
+                paths_from_presets, iterations=3
+            )
+
             self.snakemake_args["singularity_args"] = " ".join(
                 [
                     self.snakemake_args["singularity_args"],
                     f"--bind {self.db_dir}:{self.db_dir}",
-                ]  # paths that singularity should be able to read from can be bound by adding to the above list
+                ]
+                + [f"--bind {str(path)}:{str(path)}" for path in unique_bind_paths]
             )
 
         # # Extra class methods for this pipeline can be invoked here
@@ -188,18 +204,51 @@ def update_sample_dict_with_metadata(self) -> None:
                 )
 
     def set_presets(self) -> None:
+        # if no custom presets were provided, look in default location
         if self.presets_path is None:
             self.presets_path = Path(__file__).parent.joinpath("config/presets.yaml")
 
+        # read all presets into dict
         with open(self.presets_path) as f:
             presets_dict = yaml.safe_load(f)
 
+        # update sample dict with presets
         for sample in self.sample_dict:
             species_name = self.sample_dict[sample]["species"]
             if species_name in presets_dict.keys():
-                for key, value in presets_dict[species_name].items():
+                # store species-specific presets in self.species_presets for potential reuse
+                self.species_presets = presets_dict[species_name]
+                for key, value in self.species_presets.items():
                     self.sample_dict[sample][key] = value
 
+    def remove_from_list(self, lst, elements):
+        for element in elements:
+            if element in lst:
+                lst.remove(element)
+        return lst
+
+    def simplify_bind_paths(self, paths):
+        unique_bind_paths = paths.copy()
+        for path1 in paths:
+            for path2 in paths:
+                if path1 == path2:
+                    continue
+                elif path1 == path2.parent:
+                    self.remove_from_list(unique_bind_paths, [path2])
+                elif path1.parent == path2.parent:
+                    self.remove_from_list(unique_bind_paths, [path1, path2])
+                    unique_bind_paths.append(path1.parent)
+                else:
+                    continue
+
+        return unique_bind_paths
+
+    def get_unique_bind_paths(self, paths, iterations=3):
+        unique_bind_paths = paths.copy()
+        for _ in range(iterations):
+            unique_bind_paths = self.simplify_bind_paths(unique_bind_paths)
+        return unique_bind_paths
+
 
 if __name__ == "__main__":
     main()

From d28718d23fbdc2eb619eccd15f5cb7f771b160f0 Mon Sep 17 00:00:00 2001
From: boasvdp <boasvdp@gmail.com>
Date: Mon, 19 Aug 2024 13:29:59 +0200
Subject: [PATCH 4/9] fix: add counts to lineage scoring

---
 workflow/rules/mtb_typing.smk             |  4 +-
 workflow/scripts/combine_lineage_calls.py | 84 +++++++++++++++++++----
 2 files changed, 72 insertions(+), 16 deletions(-)

diff --git a/workflow/rules/mtb_typing.smk b/workflow/rules/mtb_typing.smk
index 1528755..e67ecbc 100644
--- a/workflow/rules/mtb_typing.smk
+++ b/workflow/rules/mtb_typing.smk
@@ -17,6 +17,7 @@ rule mtb_lineage_id:
     shell:
         """
 fast-lineage-caller \
+--count \
 --out {output.tsv} \
 {input.vcf} \
 2>&1>{log}
@@ -64,7 +65,8 @@ rule combine_lineage_typing:
     shell:
         """
 python workflow/scripts/combine_lineage_calls.py \
-{input.lineage_standard} {input.lineage_custom} \
+--standard {input.lineage_standard} \
+--custom {input.lineage_custom} \
 --output {output.tsv} 2>&1>{log}
         """
 
diff --git a/workflow/scripts/combine_lineage_calls.py b/workflow/scripts/combine_lineage_calls.py
index 50d411a..4683f47 100644
--- a/workflow/scripts/combine_lineage_calls.py
+++ b/workflow/scripts/combine_lineage_calls.py
@@ -2,25 +2,79 @@
 
 import argparse
 import pandas as pd
+import re
 
-parser = argparse.ArgumentParser(description="Combine lineage calls")
 
-parser.add_argument("lineage_calls", help="Path to lineage calls", nargs="+")
-parser.add_argument("--output", help="Path to output file")
+def parse_value(value):
+    if not isinstance(value, str):
+        outcome = value
+    elif value == "NA":
+        outcome = "NA"
+    elif len(value.split(",")) == 1:
+        # remove string matching pattern "([0-9]+/[0-9]+)" from value
+        outcome = re.sub(r"\([0-9]+/[0-9]+\)", "", value)
+    else:
+        # make a list of identified types
+        list_of_types = value.split(",")
+        
+        max_ratio = -1
+        list_types_with_max_ratio = []
 
-args = parser.parse_args()
+        for item in list_of_types:
+            # get type name by removing count matching pattern "([0-9]+/[0-9]+)"
+            type_name = re.sub(r"\([0-9]+/[0-9]+\)", "", item)
+            # get count matching pattern "([0-9]+/[0-9]+)"
+            count_str = re.search(r"\([0-9]+/[0-9]+\)", item).group()
+            # get first and second number from count_str,
+            # indicating identified nr of SNPs and total nr of SNPs for that type
+            counts = count_str.strip("()").split("/")
+            ratio_type = int(counts[0]) / int(counts[1])
+            if ratio_type > max_ratio:
+                max_ratio = ratio_type
+                list_types_with_max_ratio = [type_name]
+            elif ratio_type == max_ratio:
+                list_types_with_max_ratio.append(type_name)
 
-# read in all lineage calls
-dfs = []
-for lineage_call in args.lineage_calls:
-    df = pd.read_csv(lineage_call, sep="\t")
-    dfs.append(df)
+        # get type with highest ratio
+        outcome = ",".join(list_types_with_max_ratio)
 
-# merge all dataframes horizontally on Isolate
-merged_df = dfs[0]
-for df in dfs[1:]:
-    merged_df = pd.merge(merged_df, df, on="Isolate")
+    return outcome
 
-assert merged_df.shape[0] == 1
+def parse_counts(df_input):
+    df = df_input.copy()
+    df.columns = [f"{col}_counts" if col != "Isolate" else col for col in df.columns]
+    for col in df.columns[1:]:
+        # remove _counts from column name
+        col_name = col.replace("_counts", "")
+        # get string value from first row, column matching col
+        df[col_name] = parse_value(df[col].iloc[0])
+    return df
 
-merged_df.to_csv(args.output, sep="\t", index=False)
+
+def main(args):
+    # read in all lineage calls
+    df_standard = pd.read_csv(args.standard, sep="\t")
+
+    df_standard_parsed = parse_counts(df_standard)
+
+    df_custom = pd.read_csv(args.custom, sep="\t")
+
+
+    # merge all dataframes horizontally on Isolate
+    merged_df = pd.merge(df_standard_parsed, df_custom, on="Isolate")
+
+    assert merged_df.shape[0] == 1
+
+    merged_df.to_csv(args.output, sep="\t", index=False)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Combine lineage calls")
+
+    parser.add_argument("--standard", help="Path to standard lineage calls")
+    parser.add_argument("--custom", help="Path to custom lineage calls")
+    parser.add_argument("--output", help="Path to output file")
+
+    args = parser.parse_args()
+
+    main(args)
\ No newline at end of file

From 56ff964ce005f21190177a2e9b30a520a3af44e6 Mon Sep 17 00:00:00 2001
From: boasvdp <boasvdp@gmail.com>
Date: Tue, 3 Sep 2024 11:09:33 +0200
Subject: [PATCH 5/9] fix: introduce vcf entries with ref call if needed

---
 files/mtb/lineage4_lipworth.vcf           | 165 ++++++++++++++++++++++
 workflow/rules/mtb_prepare_files.smk      |  25 ++++
 workflow/rules/mtb_typing.smk             |   2 +-
 workflow/scripts/combine_lineage_calls.py |  97 +++++++++----
 4 files changed, 257 insertions(+), 32 deletions(-)
 create mode 100644 files/mtb/lineage4_lipworth.vcf

diff --git a/files/mtb/lineage4_lipworth.vcf b/files/mtb/lineage4_lipworth.vcf
new file mode 100644
index 0000000..78afdc2
--- /dev/null
+++ b/files/mtb/lineage4_lipworth.vcf
@@ -0,0 +1,165 @@
+##fileformat=VCFv4.2
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample
+NC_000962.3	15117	.	C	C	.	.	.	GT	0/0
+NC_000962.3	42281	.	C	C	.	.	.	GT	0/0
+NC_000962.3	70267	.	G	G	.	.	.	GT	0/0
+NC_000962.3	123520	.	T	T	.	.	.	GT	0/0
+NC_000962.3	143207	.	T	T	.	.	.	GT	0/0
+NC_000962.3	206481	.	C	C	.	.	.	GT	0/0
+NC_000962.3	206484	.	G	G	.	.	.	GT	0/0
+NC_000962.3	217201	.	T	T	.	.	.	GT	0/0
+NC_000962.3	249522	.	T	T	.	.	.	GT	0/0
+NC_000962.3	251575	.	G	G	.	.	.	GT	0/0
+NC_000962.3	325505	.	T	T	.	.	.	GT	0/0
+NC_000962.3	342146	.	A	A	.	.	.	GT	0/0
+NC_000962.3	392261	.	T	T	.	.	.	GT	0/0
+NC_000962.3	445780	.	C	C	.	.	.	GT	0/0
+NC_000962.3	491742	.	T	T	.	.	.	GT	0/0
+NC_000962.3	492150	.	G	G	.	.	.	GT	0/0
+NC_000962.3	498531	.	A	A	.	.	.	GT	0/0
+NC_000962.3	505974	.	G	G	.	.	.	GT	0/0
+NC_000962.3	517358	.	T	T	.	.	.	GT	0/0
+NC_000962.3	546357	.	A	A	.	.	.	GT	0/0
+NC_000962.3	555991	.	A	A	.	.	.	GT	0/0
+NC_000962.3	584171	.	T	T	.	.	.	GT	0/0
+NC_000962.3	599868	.	A	A	.	.	.	GT	0/0
+NC_000962.3	648856	.	T	T	.	.	.	GT	0/0
+NC_000962.3	655986	.	T	T	.	.	.	GT	0/0
+NC_000962.3	659341	.	T	T	.	.	.	GT	0/0
+NC_000962.3	662911	.	T	T	.	.	.	GT	0/0
+NC_000962.3	670545	.	G	G	.	.	.	GT	0/0
+NC_000962.3	713310	.	T	T	.	.	.	GT	0/0
+NC_000962.3	757182	.	A	A	.	.	.	GT	0/0
+NC_000962.3	763031	.	T	T	.	.	.	GT	0/0
+NC_000962.3	776100	.	G	G	.	.	.	GT	0/0
+NC_000962.3	820752	.	C	C	.	.	.	GT	0/0
+NC_000962.3	847995	.	T	T	.	.	.	GT	0/0
+NC_000962.3	931123	.	T	T	.	.	.	GT	0/0
+NC_000962.3	932280	.	T	T	.	.	.	GT	0/0
+NC_000962.3	934230	.	C	C	.	.	.	GT	0/0
+NC_000962.3	934611	.	G	G	.	.	.	GT	0/0
+NC_000962.3	941845	.	C	C	.	.	.	GT	0/0
+NC_000962.3	960367	.	A	A	.	.	.	GT	0/0
+NC_000962.3	1024346	.	A	A	.	.	.	GT	0/0
+NC_000962.3	1054784	.	C	C	.	.	.	GT	0/0
+NC_000962.3	1080192	.	G	G	.	.	.	GT	0/0
+NC_000962.3	1098523	.	T	T	.	.	.	GT	0/0
+NC_000962.3	1104690	.	T	T	.	.	.	GT	0/0
+NC_000962.3	1107940	.	A	A	.	.	.	GT	0/0
+NC_000962.3	1144585	.	A	A	.	.	.	GT	0/0
+NC_000962.3	1148259	.	A	A	.	.	.	GT	0/0
+NC_000962.3	1211369	.	A	A	.	.	.	GT	0/0
+NC_000962.3	1230778	.	G	G	.	.	.	GT	0/0
+NC_000962.3	1248382	.	A	A	.	.	.	GT	0/0
+NC_000962.3	1248936	.	G	G	.	.	.	GT	0/0
+NC_000962.3	1250340	.	A	A	.	.	.	GT	0/0
+NC_000962.3	1254562	.	A	A	.	.	.	GT	0/0
+NC_000962.3	1281771	.	T	T	.	.	.	GT	0/0
+NC_000962.3	1351172	.	A	A	.	.	.	GT	0/0
+NC_000962.3	1367484	.	T	T	.	.	.	GT	0/0
+NC_000962.3	1390763	.	C	C	.	.	.	GT	0/0
+NC_000962.3	1479085	.	T	T	.	.	.	GT	0/0
+NC_000962.3	1490905	.	A	A	.	.	.	GT	0/0
+NC_000962.3	1540141	.	T	T	.	.	.	GT	0/0
+NC_000962.3	1544255	.	C	C	.	.	.	GT	0/0
+NC_000962.3	1546703	.	C	C	.	.	.	GT	0/0
+NC_000962.3	1608276	.	A	A	.	.	.	GT	0/0
+NC_000962.3	1618978	.	T	T	.	.	.	GT	0/0
+NC_000962.3	1688300	.	T	T	.	.	.	GT	0/0
+NC_000962.3	1716472	.	A	A	.	.	.	GT	0/0
+NC_000962.3	1839759	.	G	G	.	.	.	GT	0/0
+NC_000962.3	1849609	.	T	T	.	.	.	GT	0/0
+NC_000962.3	1859559	.	C	C	.	.	.	GT	0/0
+NC_000962.3	1931718	.	G	G	.	.	.	GT	0/0
+NC_000962.3	1971725	.	G	G	.	.	.	GT	0/0
+NC_000962.3	2010614	.	G	G	.	.	.	GT	0/0
+NC_000962.3	2050822	.	G	G	.	.	.	GT	0/0
+NC_000962.3	2094913	.	A	A	.	.	.	GT	0/0
+NC_000962.3	2108890	.	A	A	.	.	.	GT	0/0
+NC_000962.3	2122976	.	C	C	.	.	.	GT	0/0
+NC_000962.3	2154724	.	C	C	.	.	.	GT	0/0
+NC_000962.3	2158109	.	T	T	.	.	.	GT	0/0
+NC_000962.3	2161346	.	T	T	.	.	.	GT	0/0
+NC_000962.3	2167926	.	A	A	.	.	.	GT	0/0
+NC_000962.3	2199052	.	C	C	.	.	.	GT	0/0
+NC_000962.3	2209465	.	G	G	.	.	.	GT	0/0
+NC_000962.3	2229801	.	C	C	.	.	.	GT	0/0
+NC_000962.3	2260100	.	C	C	.	.	.	GT	0/0
+NC_000962.3	2328543	.	T	T	.	.	.	GT	0/0
+NC_000962.3	2331620	.	A	A	.	.	.	GT	0/0
+NC_000962.3	2331789	.	G	G	.	.	.	GT	0/0
+NC_000962.3	2339255	.	A	A	.	.	.	GT	0/0
+NC_000962.3	2369186	.	G	G	.	.	.	GT	0/0
+NC_000962.3	2388641	.	G	G	.	.	.	GT	0/0
+NC_000962.3	2413246	.	C	C	.	.	.	GT	0/0
+NC_000962.3	2421816	.	A	A	.	.	.	GT	0/0
+NC_000962.3	2425471	.	T	T	.	.	.	GT	0/0
+NC_000962.3	2448458	.	C	C	.	.	.	GT	0/0
+NC_000962.3	2470591	.	A	A	.	.	.	GT	0/0
+NC_000962.3	2619271	.	T	T	.	.	.	GT	0/0
+NC_000962.3	2723506	.	T	T	.	.	.	GT	0/0
+NC_000962.3	2740693	.	T	T	.	.	.	GT	0/0
+NC_000962.3	2791098	.	C	C	.	.	.	GT	0/0
+NC_000962.3	2807486	.	C	C	.	.	.	GT	0/0
+NC_000962.3	2825466	.	G	G	.	.	.	GT	0/0
+NC_000962.3	2841022	.	A	A	.	.	.	GT	0/0
+NC_000962.3	2847281	.	A	A	.	.	.	GT	0/0
+NC_000962.3	2886570	.	A	A	.	.	.	GT	0/0
+NC_000962.3	2925962	.	T	T	.	.	.	GT	0/0
+NC_000962.3	2988630	.	C	C	.	.	.	GT	0/0
+NC_000962.3	2994187	.	T	T	.	.	.	GT	0/0
+NC_000962.3	3010420	.	A	A	.	.	.	GT	0/0
+NC_000962.3	3027798	.	T	T	.	.	.	GT	0/0
+NC_000962.3	3031168	.	A	A	.	.	.	GT	0/0
+NC_000962.3	3079877	.	A	A	.	.	.	GT	0/0
+NC_000962.3	3104189	.	A	A	.	.	.	GT	0/0
+NC_000962.3	3112877	.	G	G	.	.	.	GT	0/0
+NC_000962.3	3174496	.	A	A	.	.	.	GT	0/0
+NC_000962.3	3180988	.	C	C	.	.	.	GT	0/0
+NC_000962.3	3189242	.	A	A	.	.	.	GT	0/0
+NC_000962.3	3266030	.	A	A	.	.	.	GT	0/0
+NC_000962.3	3314412	.	A	A	.	.	.	GT	0/0
+NC_000962.3	3326554	.	A	A	.	.	.	GT	0/0
+NC_000962.3	3420825	.	A	A	.	.	.	GT	0/0
+NC_000962.3	3454263	.	C	C	.	.	.	GT	0/0
+NC_000962.3	3466919	.	C	C	.	.	.	GT	0/0
+NC_000962.3	3467465	.	C	C	.	.	.	GT	0/0
+NC_000962.3	3480789	.	T	T	.	.	.	GT	0/0
+NC_000962.3	3510120	.	T	T	.	.	.	GT	0/0
+NC_000962.3	3530955	.	C	C	.	.	.	GT	0/0
+NC_000962.3	3638093	.	G	G	.	.	.	GT	0/0
+NC_000962.3	3670040	.	C	C	.	.	.	GT	0/0
+NC_000962.3	3681548	.	A	A	.	.	.	GT	0/0
+NC_000962.3	3690016	.	A	A	.	.	.	GT	0/0
+NC_000962.3	3693681	.	A	A	.	.	.	GT	0/0
+NC_000962.3	3830695	.	A	A	.	.	.	GT	0/0
+NC_000962.3	3845695	.	C	C	.	.	.	GT	0/0
+NC_000962.3	3851887	.	A	A	.	.	.	GT	0/0
+NC_000962.3	3851888	.	T	T	.	.	.	GT	0/0
+NC_000962.3	3871246	.	T	T	.	.	.	GT	0/0
+NC_000962.3	3893480	.	G	G	.	.	.	GT	0/0
+NC_000962.3	3895727	.	C	C	.	.	.	GT	0/0
+NC_000962.3	3909235	.	G	G	.	.	.	GT	0/0
+NC_000962.3	3984321	.	G	G	.	.	.	GT	0/0
+NC_000962.3	4005114	.	G	G	.	.	.	GT	0/0
+NC_000962.3	4008747	.	A	A	.	.	.	GT	0/0
+NC_000962.3	4028752	.	A	A	.	.	.	GT	0/0
+NC_000962.3	4056416	.	C	C	.	.	.	GT	0/0
+NC_000962.3	4089058	.	T	T	.	.	.	GT	0/0
+NC_000962.3	4095295	.	T	T	.	.	.	GT	0/0
+NC_000962.3	4107074	.	T	T	.	.	.	GT	0/0
+NC_000962.3	4112429	.	T	T	.	.	.	GT	0/0
+NC_000962.3	4145737	.	A	A	.	.	.	GT	0/0
+NC_000962.3	4156503	.	C	C	.	.	.	GT	0/0
+NC_000962.3	4179089	.	C	C	.	.	.	GT	0/0
+NC_000962.3	4215484	.	G	G	.	.	.	GT	0/0
+NC_000962.3	4217557	.	A	A	.	.	.	GT	0/0
+NC_000962.3	4251297	.	G	G	.	.	.	GT	0/0
+NC_000962.3	4287164	.	A	A	.	.	.	GT	0/0
+NC_000962.3	4313128	.	C	C	.	.	.	GT	0/0
+NC_000962.3	4329782	.	G	G	.	.	.	GT	0/0
+NC_000962.3	4372353	.	G	G	.	.	.	GT	0/0
+NC_000962.3	4383655	.	A	A	.	.	.	GT	0/0
+NC_000962.3	4384007	.	C	C	.	.	.	GT	0/0
+NC_000962.3	4407588	.	T	T	.	.	.	GT	0/0
+NC_000962.3	4408923	.	C	C	.	.	.	GT	0/0
diff --git a/workflow/rules/mtb_prepare_files.smk b/workflow/rules/mtb_prepare_files.smk
index 0d13f0f..071afbd 100644
--- a/workflow/rules/mtb_prepare_files.smk
+++ b/workflow/rules/mtb_prepare_files.smk
@@ -45,6 +45,31 @@ rule copy_sample_vcf:
 cp {input.vcf} {output.vcf} 2>&1>{log}
         """
 
+# This rule adds VCF entries needed for lineage typing to the normal VCF
+# The VCF entries are only added if missing and contain only reference calls (0/0 Genotype)
+# fast-lineage-caller only considers variants in the VCF so these need to be added explicitly
+rule prepare_lineage_typing_vcf:
+    input:
+        lineage4_vcf="files/mtb/lineage4_lipworth.vcf",
+        vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf"
+    output:
+        vcf=temp(OUT + "/mtb_typing/prepared_files/lineage_typing_vcf/{sample}.vcf"),
+    conda:
+        "../envs/gatk_picard.yaml"
+    container:
+        "docker://staphb/bedtools:2.30.0"
+    log:
+        OUT + "/log/prepare_lineage_typing_vcf/{sample}.log"
+    threads: config["threads"]["bedtools"]
+    resources:
+        mem_gb=config["mem_gb"]["bedtools"]
+    shell:
+        """
+cat {input.vcf} <(bedtools intersect -a {input.lineage4_vcf} -b {input.vcf} -v) | \
+bedtools sort 1> {output.vcf} \
+2>{log}
+        """
+
 
 rule index_sample_vcf_gatk:
     input:
diff --git a/workflow/rules/mtb_typing.smk b/workflow/rules/mtb_typing.smk
index e67ecbc..739e6fe 100644
--- a/workflow/rules/mtb_typing.smk
+++ b/workflow/rules/mtb_typing.smk
@@ -1,6 +1,6 @@
 rule mtb_lineage_id:
     input:
-        vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf",
+        vcf=OUT + "/mtb_typing/prepared_files/lineage_typing_vcf/{sample}.vcf",
     output:
         tsv=temp(OUT + "/mtb_typing/lineage_call_standard/{sample}.tsv"),
     conda:
diff --git a/workflow/scripts/combine_lineage_calls.py b/workflow/scripts/combine_lineage_calls.py
index 4683f47..e4197a3 100644
--- a/workflow/scripts/combine_lineage_calls.py
+++ b/workflow/scripts/combine_lineage_calls.py
@@ -4,50 +4,85 @@
 import pandas as pd
 import re
 
+# define lineage4 sublineages from PMID 30789126
+lineage4_sublineages = ["ghana", "xtype", "haarlem", "ural", "tur", "lam", "stype", "uganda", "cameroon"]
+
+def parse_str_to_dict(lineage_str):
+    lineages = lineage_str.split(",")
+    parsed_data = {}
+    for lineage in lineages:
+        # get name of the lineage
+        lineage_name = lineage.split("(")[0]
+        snp_string = lineage.split("(")[1].split(")")[0]
+        # get the count of identified snps and count of total snps
+        snp_counts = int(snp_string.split("/")[0])
+        snp_total = int(snp_string.split("/")[1])
+        snp_ratio = snp_counts / snp_total
+        parsed_data[lineage_name] = (snp_counts, snp_total, snp_ratio)
+    return parsed_data
+
+def get_highest_ratio(lineage_dict):
+    if len(lineage_dict) == 1:
+        return list(lineage_dict.keys())
+    elif len(lineage_dict) == 0:
+        return []
+    else:
+        # get the highest ratio
+        max_ratio = max([lineage_dict[lineage][2] for lineage in lineage_dict.keys()])
+        # get the lineage(s) with the highest ratio
+        max_lineages = [lineage for lineage in lineage_dict.keys() if lineage_dict[lineage][2] == max_ratio]
+        return max_lineages
+
+def decide_type(lineage_dict, sublineages):
+    output = []
+    max_lineages = get_highest_ratio(lineage_dict)
+    for lineage in max_lineages:
+        output.append(f"{lineage}({lineage_dict[lineage][0]}/{lineage_dict[lineage][1]})")
+        if lineage == "lineage4":
+            # find the next highest after lineage4
+            lineage_dict_copy = lineage_dict.copy()
+            lineage_dict_copy.pop("lineage4")
+            max_lineages_without_lineage4 = get_highest_ratio(lineage_dict_copy)
+            for sublineage in max_lineages_without_lineage4:
+                if sublineage in sublineages:
+                    output.append(f"{sublineage}({lineage_dict[sublineage][0]}/{lineage_dict[sublineage][1]})")
+    return ",".join(sorted(output))
 
 def parse_value(value):
+    """
+    Parse a column from fast-lineage-caller
+
+    Results have a format: typeA(a/b)[,typeB(x/y)]
+
+    Where:
+    - typeA and typeB are the identified lineage
+    - a and x are the number of identified SNPs for lineages typeA and typeB, resp.
+    - b and y are the number of lineage-specific SNPs in the schemes of typeA and typeB, resp.
+
+    The ancestral lineage4, to which the reference genome belongs, is identified by conserved positions.
+    If another type is confidently identified in addition to lineage4, this means the other type is preferred.
+    Only if lineage4 is the sole confidently identified type, it should be outputted
+    """
     if not isinstance(value, str):
-        outcome = value
+        top_hit = value
     elif value == "NA":
-        outcome = "NA"
-    elif len(value.split(",")) == 1:
-        # remove string matching pattern "([0-9]+/[0-9]+)" from value
-        outcome = re.sub(r"\([0-9]+/[0-9]+\)", "", value)
+        top_hit = "NA"
     else:
-        # make a list of identified types
-        list_of_types = value.split(",")
-        
-        max_ratio = -1
-        list_types_with_max_ratio = []
-
-        for item in list_of_types:
-            # get type name by removing count matching pattern "([0-9]+/[0-9]+)"
-            type_name = re.sub(r"\([0-9]+/[0-9]+\)", "", item)
-            # get count matching pattern "([0-9]+/[0-9]+)"
-            count_str = re.search(r"\([0-9]+/[0-9]+\)", item).group()
-            # get first and second number from count_str,
-            # indicating identified nr of SNPs and total nr of SNPs for that type
-            counts = count_str.strip("()").split("/")
-            ratio_type = int(counts[0]) / int(counts[1])
-            if ratio_type > max_ratio:
-                max_ratio = ratio_type
-                list_types_with_max_ratio = [type_name]
-            elif ratio_type == max_ratio:
-                list_types_with_max_ratio.append(type_name)
-
-        # get type with highest ratio
-        outcome = ",".join(list_types_with_max_ratio)
-
-    return outcome
+        parsed_lineages = parse_str_to_dict(value)
+        top_hit = decide_type(parsed_lineages, lineage4_sublineages)
+    return top_hit
 
 def parse_counts(df_input):
     df = df_input.copy()
     df.columns = [f"{col}_counts" if col != "Isolate" else col for col in df.columns]
+    list_warnings = []
     for col in df.columns[1:]:
         # remove _counts from column name
         col_name = col.replace("_counts", "")
         # get string value from first row, column matching col
-        df[col_name] = parse_value(df[col].iloc[0])
+        best_type = parse_value(df[col].iloc[0])
+        df[col_name] = best_type
+
     return df
 
 

From 514d8067666598cc9fce071f80e3cedc9b51197e Mon Sep 17 00:00:00 2001
From: boasvdp <boasvdp@gmail.com>
Date: Tue, 3 Sep 2024 11:09:53 +0200
Subject: [PATCH 6/9] fix: dont use contig number in consensus fasta header

---
 workflow/rules/consensus.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/consensus.smk b/workflow/rules/consensus.smk
index 07c2ca7..1163176 100644
--- a/workflow/rules/consensus.smk
+++ b/workflow/rules/consensus.smk
@@ -231,7 +231,7 @@ rule replace_fasta_header:
         """
 seqkit replace \
 -p ^ \
--r '{wildcards.sample}_contig{{nr}} ' \
+-r '{wildcards.sample} ' \
 {input.fasta} \
 1> {output.fasta} \
 2>{log}

From 523a1571d99f15edcdd688241d6a8565414c7508 Mon Sep 17 00:00:00 2001
From: boasvdp <boasvdp@gmail.com>
Date: Mon, 9 Sep 2024 09:11:34 +0200
Subject: [PATCH 7/9] fix: fix run_pipeline.sh

---
 run_pipeline.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run_pipeline.sh b/run_pipeline.sh
index 56cb9f6..531b56f 100755
--- a/run_pipeline.sh
+++ b/run_pipeline.sh
@@ -25,7 +25,7 @@ fi
 #----------------------------------------------#
 # Create/update necessary environments
 PATH_MAMBA_YAML="envs/mamba.yaml"
-PATH_MASTER_YAML="envs/template_master.yaml"
+PATH_MASTER_YAML="envs/juno_variant_typing.yaml"
 MAMBA_NAME=$(head -n 1 ${PATH_MAMBA_YAML} | cut -f2 -d ' ')
 MASTER_NAME=$(head -n 1 ${PATH_MASTER_YAML} | cut -f2 -d ' ')
 

From e6dbab75181cda89e780eb0428833526683a0755 Mon Sep 17 00:00:00 2001
From: boasvdp <boasvdp@gmail.com>
Date: Mon, 9 Sep 2024 10:19:29 +0200
Subject: [PATCH 8/9] fix: fix run_pipeline.sh

---
 run_pipeline.sh | 93 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 74 insertions(+), 19 deletions(-)

diff --git a/run_pipeline.sh b/run_pipeline.sh
index 531b56f..aa2c468 100755
--- a/run_pipeline.sh
+++ b/run_pipeline.sh
@@ -2,18 +2,29 @@
 
 set -euo pipefail
 
+set -x
+
 #----------------------------------------------#
 # User parameters
-if [ ! -z "${1}" ] || [ ! -z "${2}" ] #|| [ ! -z "${irods_input_projectID}" ]
+if [ ! -z "${1}" ] || [ ! -z "${2}" ] || [ ! -z "${irods_input_projectID}" ]
 then
    input_dir="${1}"
    output_dir="${2}"
-#    PROJECT_NAME="${irods_input_projectID}"
+   PROJECT_NAME="${irods_input_projectID}"
+   EXCLUSION_FILE=""
 else
     echo "One of the parameters is missing, make sure there is an input directory, output directory and project name(param 1, 2 or irods_input_projectID)."
     exit 1
 fi
 
+set +u
+#check if there is an exclusion file, if so change the parameter
+if [ ! -z "${irods_input_sequencing__run_id}" ] && [ -f "/data/BioGrid/NGSlab/sample_sheets/${irods_input_sequencing__run_id}.exclude" ]
+then
+  EXCLUSION_FILE="/data/BioGrid/NGSlab/sample_sheets/${irods_input_sequencing__run_id}.exclude"
+fi
+set -u
+
 if [ ! -d "${input_dir}" ] || [ ! -d "${output_dir}" ]
 then
   echo "The input directory $input_dir, output directory $output_dir or fastq dir ${input_dir}/clean_fastq does not exist"
@@ -23,29 +34,50 @@ else
 fi
 
 #----------------------------------------------#
-# Create/update necessary environments
-PATH_MAMBA_YAML="envs/mamba.yaml"
-PATH_MASTER_YAML="envs/juno_variant_typing.yaml"
-MAMBA_NAME=$(head -n 1 ${PATH_MAMBA_YAML} | cut -f2 -d ' ')
-MASTER_NAME=$(head -n 1 ${PATH_MASTER_YAML} | cut -f2 -d ' ')
+## make sure conda works
 
-echo -e "\nUpdating necessary environments to run the pipeline..."
-
-# Removing strict mode because it sometimes breaks the code for 
-# activating an environment and for testing whether some variables
-# are set or not
-set +euo pipefail 
+# >>> conda initialize >>>
+# !! Contents within this block are managed by 'conda init' !!
+__conda_setup="$('/mnt/miniconda/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
+if [ $? -eq 0 ]; then
+    eval "$__conda_setup"
+else
+    if [ -f "/mnt/miniconda/etc/profile.d/conda.sh" ]; then
+        . "/mnt/miniconda/etc/profile.d/conda.sh"
+    else
+        export PATH="/mnt/miniconda/bin:$PATH"
+    fi
+fi
+unset __conda_setup
+# <<< conda initialize <<<export -f conda
+export -f __conda_activate
+export -f __conda_reactivate
+export -f __conda_hashr
 
-conda env update -f "${PATH_MAMBA_YAML}"
-source activate "${MAMBA_NAME}"
 
-mamba env update -f "${PATH_MASTER_YAML}"
+#----------------------------------------------#
+# Create the environment
 
-source activate "${MASTER_NAME}"
+# we can use the base installation of mamba to create the environment. 
+# Swapping to a parent env is not necessary anymore.
+mamba env create -f envs/juno_variant_typing.yaml --name pipeline_env
+conda activate pipeline_env
 
 #----------------------------------------------#
 # Run the pipeline
 
+case $PROJECT_NAME in
+  adhoc)
+    SPECIES="NotProvided"
+    ;;
+  myco|myco_kncv)
+    SPECIES="Mycobacterium_tuberculosis"
+    ;;
+  *)
+    SPECIES="NotProvided"
+    ;;
+esac
+
 echo -e "\nRun pipeline..."
 
 if [ ! -z ${irods_runsheet_sys__runsheet__lsf_queue} ]; then
@@ -54,9 +86,32 @@ else
     QUEUE="bio"
 fi
 
-set -euo pipefail
+# Setting up the tmpdir for singularity as the current directory (default is /tmp but it gets full easily)
+# Containers will use it for storing tmp files when building a container
+export SINGULARITY_TMPDIR="$(pwd)"
 
-python template.py --queue "${QUEUE}" -i "${input_dir}" -o "${output_dir}"
+#without exclusion file
+if [ "${EXCLUSION_FILE}" == "" ]
+then
+    python juno_variant_typing.py \
+        --queue "${QUEUE}" \
+        -i "${input_dir}" \
+        -o "${output_dir}" \
+        -s "${SPECIES}" \
+        --prefix "/mnt/db/juno/sing_containers"
+
+        result=$?
+else
+    python juno_variant_typing.py \
+        --queue "${QUEUE}" \
+        -i "${input_dir}" \
+        -o "${output_dir}" \
+        -s "${SPECIES}" \
+        --prefix "/mnt/db/juno/sing_containers" \
+        -ex "${EXCLUSION_FILE}"
+
+        result=$?
+fi
 
 result=$?
 

From 513d8abdeaf4762718497e57cf84f487b6e3a2da Mon Sep 17 00:00:00 2001
From: boasvdp <boasvdp@gmail.com>
Date: Thu, 12 Sep 2024 10:51:22 +0200
Subject: [PATCH 9/9] style: formatting

---
 workflow/rules/mtb_prepare_files.smk      |  7 +++--
 workflow/rules/mtb_typing.smk             |  6 ++--
 workflow/scripts/combine_lineage_calls.py | 34 +++++++++++++++++++----
 3 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/workflow/rules/mtb_prepare_files.smk b/workflow/rules/mtb_prepare_files.smk
index 071afbd..18f78e4 100644
--- a/workflow/rules/mtb_prepare_files.smk
+++ b/workflow/rules/mtb_prepare_files.smk
@@ -45,13 +45,14 @@ rule copy_sample_vcf:
 cp {input.vcf} {output.vcf} 2>&1>{log}
         """
 
+
 # This rule adds VCF entries needed for lineage typing to the normal VCF
 # The VCF entries are only added if missing and contain only reference calls (0/0 Genotype)
 # fast-lineage-caller only considers variants in the VCF so these need to be added explicitly
 rule prepare_lineage_typing_vcf:
     input:
         lineage4_vcf="files/mtb/lineage4_lipworth.vcf",
-        vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf"
+        vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf",
     output:
         vcf=temp(OUT + "/mtb_typing/prepared_files/lineage_typing_vcf/{sample}.vcf"),
     conda:
@@ -59,10 +60,10 @@ rule prepare_lineage_typing_vcf:
     container:
         "docker://staphb/bedtools:2.30.0"
     log:
-        OUT + "/log/prepare_lineage_typing_vcf/{sample}.log"
+        OUT + "/log/prepare_lineage_typing_vcf/{sample}.log",
     threads: config["threads"]["bedtools"]
     resources:
-        mem_gb=config["mem_gb"]["bedtools"]
+        mem_gb=config["mem_gb"]["bedtools"],
     shell:
         """
 cat {input.vcf} <(bedtools intersect -a {input.lineage4_vcf} -b {input.vcf} -v) | \
diff --git a/workflow/rules/mtb_typing.smk b/workflow/rules/mtb_typing.smk
index 739e6fe..aa65dde 100644
--- a/workflow/rules/mtb_typing.smk
+++ b/workflow/rules/mtb_typing.smk
@@ -23,6 +23,7 @@ fast-lineage-caller \
 2>&1>{log}
         """
 
+
 rule mtb_lineage_id_custom:
     input:
         vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf",
@@ -40,7 +41,7 @@ rule mtb_lineage_id_custom:
     resources:
         mem_gb=config["mem_gb"]["fast-lineage-caller"],
     params:
-        scheme = "files/mtb/snpCL_scheme.tsv",
+        scheme="files/mtb/snpCL_scheme.tsv",
     shell:
         """
 fast-lineage-caller \
@@ -50,13 +51,14 @@ fast-lineage-caller \
 2>&1>{log}
         """
 
+
 rule combine_lineage_typing:
     input:
         lineage_standard=OUT + "/mtb_typing/lineage_call_standard/{sample}.tsv",
         lineage_custom=OUT + "/mtb_typing/lineage_call_custom/{sample}.tsv",
     output:
         tsv=OUT + "/mtb_typing/lineage_call/{sample}.tsv",
-    conda:  
+    conda:
         "../envs/scripts.yaml"
     log:
         OUT + "/log/combine_lineage_typing/{sample}.log",
diff --git a/workflow/scripts/combine_lineage_calls.py b/workflow/scripts/combine_lineage_calls.py
index e4197a3..fe02177 100644
--- a/workflow/scripts/combine_lineage_calls.py
+++ b/workflow/scripts/combine_lineage_calls.py
@@ -5,7 +5,18 @@
 import re
 
 # define lineage4 sublineages from PMID 30789126
-lineage4_sublineages = ["ghana", "xtype", "haarlem", "ural", "tur", "lam", "stype", "uganda", "cameroon"]
+lineage4_sublineages = [
+    "ghana",
+    "xtype",
+    "haarlem",
+    "ural",
+    "tur",
+    "lam",
+    "stype",
+    "uganda",
+    "cameroon",
+]
+
 
 def parse_str_to_dict(lineage_str):
     lineages = lineage_str.split(",")
@@ -21,6 +32,7 @@ def parse_str_to_dict(lineage_str):
         parsed_data[lineage_name] = (snp_counts, snp_total, snp_ratio)
     return parsed_data
 
+
 def get_highest_ratio(lineage_dict):
     if len(lineage_dict) == 1:
         return list(lineage_dict.keys())
@@ -30,14 +42,21 @@ def get_highest_ratio(lineage_dict):
         # get the highest ratio
         max_ratio = max([lineage_dict[lineage][2] for lineage in lineage_dict.keys()])
         # get the lineage(s) with the highest ratio
-        max_lineages = [lineage for lineage in lineage_dict.keys() if lineage_dict[lineage][2] == max_ratio]
+        max_lineages = [
+            lineage
+            for lineage in lineage_dict.keys()
+            if lineage_dict[lineage][2] == max_ratio
+        ]
         return max_lineages
 
+
 def decide_type(lineage_dict, sublineages):
     output = []
     max_lineages = get_highest_ratio(lineage_dict)
     for lineage in max_lineages:
-        output.append(f"{lineage}({lineage_dict[lineage][0]}/{lineage_dict[lineage][1]})")
+        output.append(
+            f"{lineage}({lineage_dict[lineage][0]}/{lineage_dict[lineage][1]})"
+        )
         if lineage == "lineage4":
             # find the next highest after lineage4
             lineage_dict_copy = lineage_dict.copy()
@@ -45,9 +64,12 @@ def decide_type(lineage_dict, sublineages):
             max_lineages_without_lineage4 = get_highest_ratio(lineage_dict_copy)
             for sublineage in max_lineages_without_lineage4:
                 if sublineage in sublineages:
-                    output.append(f"{sublineage}({lineage_dict[sublineage][0]}/{lineage_dict[sublineage][1]})")
+                    output.append(
+                        f"{sublineage}({lineage_dict[sublineage][0]}/{lineage_dict[sublineage][1]})"
+                    )
     return ",".join(sorted(output))
 
+
 def parse_value(value):
     """
     Parse a column from fast-lineage-caller
@@ -72,6 +94,7 @@ def parse_value(value):
         top_hit = decide_type(parsed_lineages, lineage4_sublineages)
     return top_hit
 
+
 def parse_counts(df_input):
     df = df_input.copy()
     df.columns = [f"{col}_counts" if col != "Isolate" else col for col in df.columns]
@@ -94,7 +117,6 @@ def main(args):
 
     df_custom = pd.read_csv(args.custom, sep="\t")
 
-
     # merge all dataframes horizontally on Isolate
     merged_df = pd.merge(df_standard_parsed, df_custom, on="Isolate")
 
@@ -112,4 +134,4 @@ def main(args):
 
     args = parser.parse_args()
 
-    main(args)
\ No newline at end of file
+    main(args)