From 2d075d7a6bc8897425f3d28fc8047ccbff8e65bb Mon Sep 17 00:00:00 2001 From: boasvdp Date: Fri, 26 Jul 2024 16:59:12 +0200 Subject: [PATCH 1/9] feat: add detection of snpCLs --- Snakefile | 1 + files/mtb/snpCL_scheme.tsv | 31 +++++++++++++++ workflow/envs/scripts.yaml | 4 ++ workflow/rules/mtb_typing.smk | 48 ++++++++++++++++++++++- workflow/scripts/combine_lineage_calls.py | 26 ++++++++++++ 5 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 files/mtb/snpCL_scheme.tsv create mode 100644 workflow/envs/scripts.yaml create mode 100644 workflow/scripts/combine_lineage_calls.py diff --git a/Snakefile b/Snakefile index 8719d8e..4c6efff 100644 --- a/Snakefile +++ b/Snakefile @@ -26,6 +26,7 @@ localrules: copy_sample_vcf, copy_deletion_vcf, copy_ref, + combine_lineage_typing, mtb_filter_res_table_positions, mtb_make_json, audit_version_gatk, diff --git a/files/mtb/snpCL_scheme.tsv b/files/mtb/snpCL_scheme.tsv new file mode 100644 index 0000000..0a7fe4c --- /dev/null +++ b/files/mtb/snpCL_scheme.tsv @@ -0,0 +1,31 @@ +#lineage position allele_change tag +snpCL_1 1157317 C/T EUSeqMyTB_snpCL +snpCL_2 4327088 A/C EUSeqMyTB_snpCL +snpCL_3 130881 C/G EUSeqMyTB_snpCL +snpCL_4 1485300 C/G EUSeqMyTB_snpCL +snpCL_5 1208477 C/G EUSeqMyTB_snpCL +snpCL_6 1547828 G/C EUSeqMyTB_snpCL +snpCL_7 2971513 C/G EUSeqMyTB_snpCL +snpCL_8 1039921 A/G EUSeqMyTB_snpCL +snpCL_10 211015 G/A EUSeqMyTB_snpCL +snpCL_11 1895566 C/G EUSeqMyTB_snpCL +snpCL_12 1914071 G/A EUSeqMyTB_snpCL +snpCL_13 1810244 G/A EUSeqMyTB_snpCL +snpCL_14 4155977 G/A EUSeqMyTB_snpCL +snpCL_15 2399093 G/A EUSeqMyTB_snpCL +snpCL_16 1008074 G/A EUSeqMyTB_snpCL +snpCL_17 4284172 C/T EUSeqMyTB_snpCL +snpCL_18 766707 A/G EUSeqMyTB_snpCL +snpCL_19 3169491 C/A EUSeqMyTB_snpCL +snpCL_20 996197 A/C EUSeqMyTB_snpCL +snpCL_22 1231934 T/G EUSeqMyTB_snpCL +snpCL_23 1398622 G/A EUSeqMyTB_snpCL +snpCL_25 1068731 G/C EUSeqMyTB_snpCL +snpCL_27 3031515 T/C EUSeqMyTB_snpCL +snpCL_28 1028437 C/T EUSeqMyTB_snpCL +snpCL_29 3640351 C/T EUSeqMyTB_snpCL +snpCL_30 3223901 G/A EUSeqMyTB_snpCL +snpCL_31 1946519 C/T EUSeqMyTB_snpCL +snpCL_32 3088899 G/A EUSeqMyTB_snpCL +snpCL_33 1921877 G/A EUSeqMyTB_snpCL +snpCL_34 1760095 C/G EUSeqMyTB_snpCL diff --git a/workflow/envs/scripts.yaml b/workflow/envs/scripts.yaml new file mode 100644 index 0000000..8d303cc --- /dev/null +++ b/workflow/envs/scripts.yaml @@ -0,0 +1,4 @@ +channels: +- conda-forge +dependencies: +- pandas=2.2.* \ No newline at end of file diff --git a/workflow/rules/mtb_typing.smk b/workflow/rules/mtb_typing.smk index bac3849..1528755 100644 --- a/workflow/rules/mtb_typing.smk +++ b/workflow/rules/mtb_typing.smk @@ -2,7 +2,7 @@ rule mtb_lineage_id: input: vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf", output: - tsv=OUT + "/mtb_typing/lineage_call/{sample}.tsv", + tsv=temp(OUT + "/mtb_typing/lineage_call_standard/{sample}.tsv"), conda: "../envs/fast_lineage_caller.yaml" container: @@ -22,6 +22,52 @@ fast-lineage-caller \ 2>&1>{log} """ +rule mtb_lineage_id_custom: + input: + vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf", + output: + tsv=temp(OUT + "/mtb_typing/lineage_call_custom/{sample}.tsv"), + conda: + "../envs/fast_lineage_caller.yaml" + container: + "docker://ghcr.io/boasvdp/fast_lineage_caller:1.0.0" + log: + OUT + "/log/mtb_lineage_id/{sample}.log", + message: + "Typing Mtb lineage for {wildcards.sample}" + threads: config["threads"]["fast-lineage-caller"] + resources: + mem_gb=config["mem_gb"]["fast-lineage-caller"], + params: + scheme = "files/mtb/snpCL_scheme.tsv", + shell: + """ +fast-lineage-caller \ +--out {output.tsv} \ +--scheme {params.scheme} \ +{input.vcf} \ +2>&1>{log} + """ + +rule combine_lineage_typing: + input: + lineage_standard=OUT + "/mtb_typing/lineage_call_standard/{sample}.tsv", + lineage_custom=OUT + "/mtb_typing/lineage_call_custom/{sample}.tsv", + output: + tsv=OUT + "/mtb_typing/lineage_call/{sample}.tsv", + conda: + "../envs/scripts.yaml" + log: + OUT + "/log/combine_lineage_typing/{sample}.log", + message: + "Combining lineage calls for {wildcards.sample}" + shell: + """ +python workflow/scripts/combine_lineage_calls.py \ +{input.lineage_standard} {input.lineage_custom} \ +--output {output.tsv} 2>&1>{log} + """ + rule mtb_coll_contamination: input: diff --git a/workflow/scripts/combine_lineage_calls.py b/workflow/scripts/combine_lineage_calls.py new file mode 100644 index 0000000..50d411a --- /dev/null +++ b/workflow/scripts/combine_lineage_calls.py @@ -0,0 +1,26 @@ +# merge two dataframes and assert a single row is left + +import argparse +import pandas as pd + +parser = argparse.ArgumentParser(description="Combine lineage calls") + +parser.add_argument("lineage_calls", help="Path to lineage calls", nargs="+") +parser.add_argument("--output", help="Path to output file") + +args = parser.parse_args() + +# read in all lineage calls +dfs = [] +for lineage_call in args.lineage_calls: + df = pd.read_csv(lineage_call, sep="\t") + dfs.append(df) + +# merge all dataframes horizontally on Isolate +merged_df = dfs[0] +for df in dfs[1:]: + merged_df = pd.merge(merged_df, df, on="Isolate") + +assert merged_df.shape[0] == 1 + +merged_df.to_csv(args.output, sep="\t", index=False) From fbd67e63bf67a7ae8f010c29a3df97626b43ba8f Mon Sep 17 00:00:00 2001 From: boasvdp Date: Fri, 26 Jul 2024 16:59:26 +0200 Subject: [PATCH 2/9] deps: update juno-lib --- envs/juno_variant_typing.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/envs/juno_variant_typing.yaml b/envs/juno_variant_typing.yaml index 4477df6..36b2103 100644 --- a/envs/juno_variant_typing.yaml +++ b/envs/juno_variant_typing.yaml @@ -12,4 +12,4 @@ dependencies: - pip=23.* - python=3.11.* - pip: - - "--editable=git+https://github.com/RIVM-bioinformatics/juno-library.git@v2.2.0#egg=juno_library" + - "--editable=git+https://github.com/RIVM-bioinformatics/juno-library.git@v2.2.1#egg=juno_library" From 5d5991adaf45cc7a19a9bf9010170fae928b801f Mon Sep 17 00:00:00 2001 From: boasvdp Date: Fri, 26 Jul 2024 16:59:48 +0200 Subject: [PATCH 3/9] fix: generalize binding paths when using container --- juno_variant_typing.py | 53 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/juno_variant_typing.py b/juno_variant_typing.py index c688a1b..8d5ab0f 100644 --- a/juno_variant_typing.py +++ b/juno_variant_typing.py @@ -138,11 +138,27 @@ def setup(self) -> None: self.set_presets() if self.snakemake_args["use_singularity"]: + + paths_from_presets = [] + + for key, value in self.species_presets.items(): + try: + path = Path(value) + except TypeError: + continue + if path.exists() & path.is_absolute(): + paths_from_presets.append(path.parent.resolve()) + + unique_bind_paths = self.get_unique_bind_paths( + paths_from_presets, iterations=3 + ) + self.snakemake_args["singularity_args"] = " ".join( [ self.snakemake_args["singularity_args"], f"--bind {self.db_dir}:{self.db_dir}", - ] # paths that singularity should be able to read from can be bound by adding to the above list + ] + + [f"--bind {str(path)}:{str(path)}" for path in unique_bind_paths] ) # # Extra class methods for this pipeline can be invoked here @@ -188,18 +204,51 @@ def update_sample_dict_with_metadata(self) -> None: ) def set_presets(self) -> None: + # if no custom presets were provided, look in default location if self.presets_path is None: self.presets_path = Path(__file__).parent.joinpath("config/presets.yaml") + # read all presets into dict with open(self.presets_path) as f: presets_dict = yaml.safe_load(f) + # update sample dict with presets for sample in self.sample_dict: species_name = self.sample_dict[sample]["species"] if species_name in presets_dict.keys(): - for key, value in presets_dict[species_name].items(): + # store species-specific presets in self.species_presets for potential reuse + self.species_presets = presets_dict[species_name] + for key, value in self.species_presets.items(): self.sample_dict[sample][key] = value + def remove_from_list(self, lst, elements): + for element in elements: + if element in lst: + lst.remove(element) + return lst + + def simplify_bind_paths(self, paths): + unique_bind_paths = paths.copy() + for path1 in paths: + for path2 in paths: + if path1 == path2: + continue + elif path1 == path2.parent: + self.remove_from_list(unique_bind_paths, [path2]) + elif path1.parent == path2.parent: + self.remove_from_list(unique_bind_paths, [path1, path2]) + unique_bind_paths.append(path1.parent) + else: + continue + + return unique_bind_paths + + def get_unique_bind_paths(self, paths, iterations=3): + unique_bind_paths = paths.copy() + for _ in range(iterations): + unique_bind_paths = self.simplify_bind_paths(unique_bind_paths) + return unique_bind_paths + if __name__ == "__main__": main() From d28718d23fbdc2eb619eccd15f5cb7f771b160f0 Mon Sep 17 00:00:00 2001 From: boasvdp Date: Mon, 19 Aug 2024 13:29:59 +0200 Subject: [PATCH 4/9] fix: add counts to lineage scoring --- workflow/rules/mtb_typing.smk | 4 +- workflow/scripts/combine_lineage_calls.py | 84 +++++++++++++++++++---- 2 files changed, 72 insertions(+), 16 deletions(-) diff --git a/workflow/rules/mtb_typing.smk b/workflow/rules/mtb_typing.smk index 1528755..e67ecbc 100644 --- a/workflow/rules/mtb_typing.smk +++ b/workflow/rules/mtb_typing.smk @@ -17,6 +17,7 @@ rule mtb_lineage_id: shell: """ fast-lineage-caller \ +--count \ --out {output.tsv} \ {input.vcf} \ 2>&1>{log} @@ -64,7 +65,8 @@ rule combine_lineage_typing: shell: """ python workflow/scripts/combine_lineage_calls.py \ -{input.lineage_standard} {input.lineage_custom} \ +--standard {input.lineage_standard} \ +--custom {input.lineage_custom} \ --output {output.tsv} 2>&1>{log} """ diff --git a/workflow/scripts/combine_lineage_calls.py b/workflow/scripts/combine_lineage_calls.py index 50d411a..4683f47 100644 --- a/workflow/scripts/combine_lineage_calls.py +++ b/workflow/scripts/combine_lineage_calls.py @@ -2,25 +2,79 @@ import argparse import pandas as pd +import re -parser = argparse.ArgumentParser(description="Combine lineage calls") -parser.add_argument("lineage_calls", help="Path to lineage calls", nargs="+") -parser.add_argument("--output", help="Path to output file") +def parse_value(value): + if not isinstance(value, str): + outcome = value + elif value == "NA": + outcome = "NA" + elif len(value.split(",")) == 1: + # remove string matching pattern "([0-9]+/[0-9]+)" from value + outcome = re.sub(r"\([0-9]+/[0-9]+\)", "", value) + else: + # make a list of identified types + list_of_types = value.split(",") + + max_ratio = -1 + list_types_with_max_ratio = [] -args = parser.parse_args() + for item in list_of_types: + # get type name by removing count matching pattern "([0-9]+/[0-9]+)" + type_name = re.sub(r"\([0-9]+/[0-9]+\)", "", item) + # get count matching pattern "([0-9]+/[0-9]+)" + count_str = re.search(r"\([0-9]+/[0-9]+\)", item).group() + # get first and second number from count_str, + # indicating identified nr of SNPs and total nr of SNPs for that type + counts = count_str.strip("()").split("/") + ratio_type = int(counts[0]) / int(counts[1]) + if ratio_type > max_ratio: + max_ratio = ratio_type + list_types_with_max_ratio = [type_name] + elif ratio_type == max_ratio: + list_types_with_max_ratio.append(type_name) -# read in all lineage calls -dfs = [] -for lineage_call in args.lineage_calls: - df = pd.read_csv(lineage_call, sep="\t") - dfs.append(df) + # get type with highest ratio + outcome = ",".join(list_types_with_max_ratio) -# merge all dataframes horizontally on Isolate -merged_df = dfs[0] -for df in dfs[1:]: - merged_df = pd.merge(merged_df, df, on="Isolate") + return outcome -assert merged_df.shape[0] == 1 +def parse_counts(df_input): + df = df_input.copy() + df.columns = [f"{col}_counts" if col != "Isolate" else col for col in df.columns] + for col in df.columns[1:]: + # remove _counts from column name + col_name = col.replace("_counts", "") + # get string value from first row, column matching col + df[col_name] = parse_value(df[col].iloc[0]) + return df -merged_df.to_csv(args.output, sep="\t", index=False) + +def main(args): + # read in all lineage calls + df_standard = pd.read_csv(args.standard, sep="\t") + + df_standard_parsed = parse_counts(df_standard) + + df_custom = pd.read_csv(args.custom, sep="\t") + + + # merge all dataframes horizontally on Isolate + merged_df = pd.merge(df_standard_parsed, df_custom, on="Isolate") + + assert merged_df.shape[0] == 1 + + merged_df.to_csv(args.output, sep="\t", index=False) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Combine lineage calls") + + parser.add_argument("--standard", help="Path to standard lineage calls") + parser.add_argument("--custom", help="Path to custom lineage calls") + parser.add_argument("--output", help="Path to output file") + + args = parser.parse_args() + + main(args) \ No newline at end of file From 56ff964ce005f21190177a2e9b30a520a3af44e6 Mon Sep 17 00:00:00 2001 From: boasvdp Date: Tue, 3 Sep 2024 11:09:33 +0200 Subject: [PATCH 5/9] fix: introduce vcf entries with ref call if needed --- files/mtb/lineage4_lipworth.vcf | 165 ++++++++++++++++++++++ workflow/rules/mtb_prepare_files.smk | 25 ++++ workflow/rules/mtb_typing.smk | 2 +- workflow/scripts/combine_lineage_calls.py | 97 +++++++++---- 4 files changed, 257 insertions(+), 32 deletions(-) create mode 100644 files/mtb/lineage4_lipworth.vcf diff --git a/files/mtb/lineage4_lipworth.vcf b/files/mtb/lineage4_lipworth.vcf new file mode 100644 index 0000000..78afdc2 --- /dev/null +++ b/files/mtb/lineage4_lipworth.vcf @@ -0,0 +1,165 @@ +##fileformat=VCFv4.2 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample +NC_000962.3 15117 . C C . . . GT 0/0 +NC_000962.3 42281 . C C . . . GT 0/0 +NC_000962.3 70267 . G G . . . GT 0/0 +NC_000962.3 123520 . T T . . . GT 0/0 +NC_000962.3 143207 . T T . . . GT 0/0 +NC_000962.3 206481 . C C . . . GT 0/0 +NC_000962.3 206484 . G G . . . GT 0/0 +NC_000962.3 217201 . T T . . . GT 0/0 +NC_000962.3 249522 . T T . . . GT 0/0 +NC_000962.3 251575 . G G . . . GT 0/0 +NC_000962.3 325505 . T T . . . GT 0/0 +NC_000962.3 342146 . A A . . . GT 0/0 +NC_000962.3 392261 . T T . . . GT 0/0 +NC_000962.3 445780 . C C . . . GT 0/0 +NC_000962.3 491742 . T T . . . GT 0/0 +NC_000962.3 492150 . G G . . . GT 0/0 +NC_000962.3 498531 . A A . . . GT 0/0 +NC_000962.3 505974 . G G . . . GT 0/0 +NC_000962.3 517358 . T T . . . GT 0/0 +NC_000962.3 546357 . A A . . . GT 0/0 +NC_000962.3 555991 . A A . . . GT 0/0 +NC_000962.3 584171 . T T . . . GT 0/0 +NC_000962.3 599868 . A A . . . GT 0/0 +NC_000962.3 648856 . T T . . . GT 0/0 +NC_000962.3 655986 . T T . . . GT 0/0 +NC_000962.3 659341 . T T . . . GT 0/0 +NC_000962.3 662911 . T T . . . GT 0/0 +NC_000962.3 670545 . G G . . . GT 0/0 +NC_000962.3 713310 . T T . . . GT 0/0 +NC_000962.3 757182 . A A . . . GT 0/0 +NC_000962.3 763031 . T T . . . GT 0/0 +NC_000962.3 776100 . G G . . . GT 0/0 +NC_000962.3 820752 . C C . . . GT 0/0 +NC_000962.3 847995 . T T . . . GT 0/0 +NC_000962.3 931123 . T T . . . GT 0/0 +NC_000962.3 932280 . T T . . . GT 0/0 +NC_000962.3 934230 . C C . . . GT 0/0 +NC_000962.3 934611 . G G . . . GT 0/0 +NC_000962.3 941845 . C C . . . GT 0/0 +NC_000962.3 960367 . A A . . . GT 0/0 +NC_000962.3 1024346 . A A . . . GT 0/0 +NC_000962.3 1054784 . C C . . . GT 0/0 +NC_000962.3 1080192 . G G . . . GT 0/0 +NC_000962.3 1098523 . T T . . . GT 0/0 +NC_000962.3 1104690 . T T . . . GT 0/0 +NC_000962.3 1107940 . A A . . . GT 0/0 +NC_000962.3 1144585 . A A . . . GT 0/0 +NC_000962.3 1148259 . A A . . . GT 0/0 +NC_000962.3 1211369 . A A . . . GT 0/0 +NC_000962.3 1230778 . G G . . . GT 0/0 +NC_000962.3 1248382 . A A . . . GT 0/0 +NC_000962.3 1248936 . G G . . . GT 0/0 +NC_000962.3 1250340 . A A . . . GT 0/0 +NC_000962.3 1254562 . A A . . . GT 0/0 +NC_000962.3 1281771 . T T . . . GT 0/0 +NC_000962.3 1351172 . A A . . . GT 0/0 +NC_000962.3 1367484 . T T . . . GT 0/0 +NC_000962.3 1390763 . C C . . . GT 0/0 +NC_000962.3 1479085 . T T . . . GT 0/0 +NC_000962.3 1490905 . A A . . . GT 0/0 +NC_000962.3 1540141 . T T . . . GT 0/0 +NC_000962.3 1544255 . C C . . . GT 0/0 +NC_000962.3 1546703 . C C . . . GT 0/0 +NC_000962.3 1608276 . A A . . . GT 0/0 +NC_000962.3 1618978 . T T . . . GT 0/0 +NC_000962.3 1688300 . T T . . . GT 0/0 +NC_000962.3 1716472 . A A . . . GT 0/0 +NC_000962.3 1839759 . G G . . . GT 0/0 +NC_000962.3 1849609 . T T . . . GT 0/0 +NC_000962.3 1859559 . C C . . . GT 0/0 +NC_000962.3 1931718 . G G . . . GT 0/0 +NC_000962.3 1971725 . G G . . . GT 0/0 +NC_000962.3 2010614 . G G . . . GT 0/0 +NC_000962.3 2050822 . G G . . . GT 0/0 +NC_000962.3 2094913 . A A . . . GT 0/0 +NC_000962.3 2108890 . A A . . . GT 0/0 +NC_000962.3 2122976 . C C . . . GT 0/0 +NC_000962.3 2154724 . C C . . . GT 0/0 +NC_000962.3 2158109 . T T . . . GT 0/0 +NC_000962.3 2161346 . T T . . . GT 0/0 +NC_000962.3 2167926 . A A . . . GT 0/0 +NC_000962.3 2199052 . C C . . . GT 0/0 +NC_000962.3 2209465 . G G . . . GT 0/0 +NC_000962.3 2229801 . C C . . . GT 0/0 +NC_000962.3 2260100 . C C . . . GT 0/0 +NC_000962.3 2328543 . T T . . . GT 0/0 +NC_000962.3 2331620 . A A . . . GT 0/0 +NC_000962.3 2331789 . G G . . . GT 0/0 +NC_000962.3 2339255 . A A . . . GT 0/0 +NC_000962.3 2369186 . G G . . . GT 0/0 +NC_000962.3 2388641 . G G . . . GT 0/0 +NC_000962.3 2413246 . C C . . . GT 0/0 +NC_000962.3 2421816 . A A . . . GT 0/0 +NC_000962.3 2425471 . T T . . . GT 0/0 +NC_000962.3 2448458 . C C . . . GT 0/0 +NC_000962.3 2470591 . A A . . . GT 0/0 +NC_000962.3 2619271 . T T . . . GT 0/0 +NC_000962.3 2723506 . T T . . . GT 0/0 +NC_000962.3 2740693 . T T . . . GT 0/0 +NC_000962.3 2791098 . C C . . . GT 0/0 +NC_000962.3 2807486 . C C . . . GT 0/0 +NC_000962.3 2825466 . G G . . . GT 0/0 +NC_000962.3 2841022 . A A . . . GT 0/0 +NC_000962.3 2847281 . A A . . . GT 0/0 +NC_000962.3 2886570 . A A . . . GT 0/0 +NC_000962.3 2925962 . T T . . . GT 0/0 +NC_000962.3 2988630 . C C . . . GT 0/0 +NC_000962.3 2994187 . T T . . . GT 0/0 +NC_000962.3 3010420 . A A . . . GT 0/0 +NC_000962.3 3027798 . T T . . . GT 0/0 +NC_000962.3 3031168 . A A . . . GT 0/0 +NC_000962.3 3079877 . A A . . . GT 0/0 +NC_000962.3 3104189 . A A . . . GT 0/0 +NC_000962.3 3112877 . G G . . . GT 0/0 +NC_000962.3 3174496 . A A . . . GT 0/0 +NC_000962.3 3180988 . C C . . . GT 0/0 +NC_000962.3 3189242 . A A . . . GT 0/0 +NC_000962.3 3266030 . A A . . . GT 0/0 +NC_000962.3 3314412 . A A . . . GT 0/0 +NC_000962.3 3326554 . A A . . . GT 0/0 +NC_000962.3 3420825 . A A . . . GT 0/0 +NC_000962.3 3454263 . C C . . . GT 0/0 +NC_000962.3 3466919 . C C . . . GT 0/0 +NC_000962.3 3467465 . C C . . . GT 0/0 +NC_000962.3 3480789 . T T . . . GT 0/0 +NC_000962.3 3510120 . T T . . . GT 0/0 +NC_000962.3 3530955 . C C . . . GT 0/0 +NC_000962.3 3638093 . G G . . . GT 0/0 +NC_000962.3 3670040 . C C . . . GT 0/0 +NC_000962.3 3681548 . A A . . . GT 0/0 +NC_000962.3 3690016 . A A . . . GT 0/0 +NC_000962.3 3693681 . A A . . . GT 0/0 +NC_000962.3 3830695 . A A . . . GT 0/0 +NC_000962.3 3845695 . C C . . . GT 0/0 +NC_000962.3 3851887 . A A . . . GT 0/0 +NC_000962.3 3851888 . T T . . . GT 0/0 +NC_000962.3 3871246 . T T . . . GT 0/0 +NC_000962.3 3893480 . G G . . . GT 0/0 +NC_000962.3 3895727 . C C . . . GT 0/0 +NC_000962.3 3909235 . G G . . . GT 0/0 +NC_000962.3 3984321 . G G . . . GT 0/0 +NC_000962.3 4005114 . G G . . . GT 0/0 +NC_000962.3 4008747 . A A . . . GT 0/0 +NC_000962.3 4028752 . A A . . . GT 0/0 +NC_000962.3 4056416 . C C . . . GT 0/0 +NC_000962.3 4089058 . T T . . . GT 0/0 +NC_000962.3 4095295 . T T . . . GT 0/0 +NC_000962.3 4107074 . T T . . . GT 0/0 +NC_000962.3 4112429 . T T . . . GT 0/0 +NC_000962.3 4145737 . A A . . . GT 0/0 +NC_000962.3 4156503 . C C . . . GT 0/0 +NC_000962.3 4179089 . C C . . . GT 0/0 +NC_000962.3 4215484 . G G . . . GT 0/0 +NC_000962.3 4217557 . A A . . . GT 0/0 +NC_000962.3 4251297 . G G . . . GT 0/0 +NC_000962.3 4287164 . A A . . . GT 0/0 +NC_000962.3 4313128 . C C . . . GT 0/0 +NC_000962.3 4329782 . G G . . . GT 0/0 +NC_000962.3 4372353 . G G . . . GT 0/0 +NC_000962.3 4383655 . A A . . . GT 0/0 +NC_000962.3 4384007 . C C . . . GT 0/0 +NC_000962.3 4407588 . T T . . . GT 0/0 +NC_000962.3 4408923 . C C . . . GT 0/0 diff --git a/workflow/rules/mtb_prepare_files.smk b/workflow/rules/mtb_prepare_files.smk index 0d13f0f..071afbd 100644 --- a/workflow/rules/mtb_prepare_files.smk +++ b/workflow/rules/mtb_prepare_files.smk @@ -45,6 +45,31 @@ rule copy_sample_vcf: cp {input.vcf} {output.vcf} 2>&1>{log} """ +# This rule adds VCF entries needed for lineage typing to the normal VCF +# The VCF entries are only added if missing and contain only reference calls (0/0 Genotype) +# fast-lineage-caller only considers variants in the VCF so these need to be added explicitly +rule prepare_lineage_typing_vcf: + input: + lineage4_vcf="files/mtb/lineage4_lipworth.vcf", + vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf" + output: + vcf=temp(OUT + "/mtb_typing/prepared_files/lineage_typing_vcf/{sample}.vcf"), + conda: + "../envs/gatk_picard.yaml" + container: + "docker://staphb/bedtools:2.30.0" + log: + OUT + "/log/prepare_lineage_typing_vcf/{sample}.log" + threads: config["threads"]["bedtools"] + resources: + mem_gb=config["mem_gb"]["bedtools"] + shell: + """ +cat {input.vcf} <(bedtools intersect -a {input.lineage4_vcf} -b {input.vcf} -v) | \ +bedtools sort 1> {output.vcf} \ +2>{log} + """ + rule index_sample_vcf_gatk: input: diff --git a/workflow/rules/mtb_typing.smk b/workflow/rules/mtb_typing.smk index e67ecbc..739e6fe 100644 --- a/workflow/rules/mtb_typing.smk +++ b/workflow/rules/mtb_typing.smk @@ -1,6 +1,6 @@ rule mtb_lineage_id: input: - vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf", + vcf=OUT + "/mtb_typing/prepared_files/lineage_typing_vcf/{sample}.vcf", output: tsv=temp(OUT + "/mtb_typing/lineage_call_standard/{sample}.tsv"), conda: diff --git a/workflow/scripts/combine_lineage_calls.py b/workflow/scripts/combine_lineage_calls.py index 4683f47..e4197a3 100644 --- a/workflow/scripts/combine_lineage_calls.py +++ b/workflow/scripts/combine_lineage_calls.py @@ -4,50 +4,85 @@ import pandas as pd import re +# define lineage4 sublineages from PMID 30789126 +lineage4_sublineages = ["ghana", "xtype", "haarlem", "ural", "tur", "lam", "stype", "uganda", "cameroon"] + +def parse_str_to_dict(lineage_str): + lineages = lineage_str.split(",") + parsed_data = {} + for lineage in lineages: + # get name of the lineage + lineage_name = lineage.split("(")[0] + snp_string = lineage.split("(")[1].split(")")[0] + # get the count of identified snps and count of total snps + snp_counts = int(snp_string.split("/")[0]) + snp_total = int(snp_string.split("/")[1]) + snp_ratio = snp_counts / snp_total + parsed_data[lineage_name] = (snp_counts, snp_total, snp_ratio) + return parsed_data + +def get_highest_ratio(lineage_dict): + if len(lineage_dict) == 1: + return list(lineage_dict.keys()) + elif len(lineage_dict) == 0: + return [] + else: + # get the highest ratio + max_ratio = max([lineage_dict[lineage][2] for lineage in lineage_dict.keys()]) + # get the lineage(s) with the highest ratio + max_lineages = [lineage for lineage in lineage_dict.keys() if lineage_dict[lineage][2] == max_ratio] + return max_lineages + +def decide_type(lineage_dict, sublineages): + output = [] + max_lineages = get_highest_ratio(lineage_dict) + for lineage in max_lineages: + output.append(f"{lineage}({lineage_dict[lineage][0]}/{lineage_dict[lineage][1]})") + if lineage == "lineage4": + # find the next highest after lineage4 + lineage_dict_copy = lineage_dict.copy() + lineage_dict_copy.pop("lineage4") + max_lineages_without_lineage4 = get_highest_ratio(lineage_dict_copy) + for sublineage in max_lineages_without_lineage4: + if sublineage in sublineages: + output.append(f"{sublineage}({lineage_dict[sublineage][0]}/{lineage_dict[sublineage][1]})") + return ",".join(sorted(output)) def parse_value(value): + """ + Parse a column from fast-lineage-caller + + Results have a format: typeA(a/b)[,typeB(x/y)] + + Where: + - typeA and typeB are the identified lineage + - a and x are the number of identified SNPs for lineages typeA and typeB, resp. + - b and y are the number of lineage-specific SNPs in the schemes of typeA and typeB, resp. + + The ancestral lineage4, to which the reference genome belongs, is identified by conserved positions. + If another type is confidently identified in addition to lineage4, this means the other type is preferred. + Only if lineage4 is the sole confidently identified type, it should be outputted + """ if not isinstance(value, str): - outcome = value + top_hit = value elif value == "NA": - outcome = "NA" - elif len(value.split(",")) == 1: - # remove string matching pattern "([0-9]+/[0-9]+)" from value - outcome = re.sub(r"\([0-9]+/[0-9]+\)", "", value) + top_hit = "NA" else: - # make a list of identified types - list_of_types = value.split(",") - - max_ratio = -1 - list_types_with_max_ratio = [] - - for item in list_of_types: - # get type name by removing count matching pattern "([0-9]+/[0-9]+)" - type_name = re.sub(r"\([0-9]+/[0-9]+\)", "", item) - # get count matching pattern "([0-9]+/[0-9]+)" - count_str = re.search(r"\([0-9]+/[0-9]+\)", item).group() - # get first and second number from count_str, - # indicating identified nr of SNPs and total nr of SNPs for that type - counts = count_str.strip("()").split("/") - ratio_type = int(counts[0]) / int(counts[1]) - if ratio_type > max_ratio: - max_ratio = ratio_type - list_types_with_max_ratio = [type_name] - elif ratio_type == max_ratio: - list_types_with_max_ratio.append(type_name) - - # get type with highest ratio - outcome = ",".join(list_types_with_max_ratio) - - return outcome + parsed_lineages = parse_str_to_dict(value) + top_hit = decide_type(parsed_lineages, lineage4_sublineages) + return top_hit def parse_counts(df_input): df = df_input.copy() df.columns = [f"{col}_counts" if col != "Isolate" else col for col in df.columns] + list_warnings = [] for col in df.columns[1:]: # remove _counts from column name col_name = col.replace("_counts", "") # get string value from first row, column matching col - df[col_name] = parse_value(df[col].iloc[0]) + best_type = parse_value(df[col].iloc[0]) + df[col_name] = best_type + return df From 514d8067666598cc9fce071f80e3cedc9b51197e Mon Sep 17 00:00:00 2001 From: boasvdp Date: Tue, 3 Sep 2024 11:09:53 +0200 Subject: [PATCH 6/9] fix: dont use contig number in consensus fasta header --- workflow/rules/consensus.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/consensus.smk b/workflow/rules/consensus.smk index 07c2ca7..1163176 100644 --- a/workflow/rules/consensus.smk +++ b/workflow/rules/consensus.smk @@ -231,7 +231,7 @@ rule replace_fasta_header: """ seqkit replace \ -p ^ \ --r '{wildcards.sample}_contig{{nr}} ' \ +-r '{wildcards.sample} ' \ {input.fasta} \ 1> {output.fasta} \ 2>{log} From 523a1571d99f15edcdd688241d6a8565414c7508 Mon Sep 17 00:00:00 2001 From: boasvdp Date: Mon, 9 Sep 2024 09:11:34 +0200 Subject: [PATCH 7/9] fix: fix run_pipeline.sh --- run_pipeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_pipeline.sh b/run_pipeline.sh index 56cb9f6..531b56f 100755 --- a/run_pipeline.sh +++ b/run_pipeline.sh @@ -25,7 +25,7 @@ fi #----------------------------------------------# # Create/update necessary environments PATH_MAMBA_YAML="envs/mamba.yaml" -PATH_MASTER_YAML="envs/template_master.yaml" +PATH_MASTER_YAML="envs/juno_variant_typing.yaml" MAMBA_NAME=$(head -n 1 ${PATH_MAMBA_YAML} | cut -f2 -d ' ') MASTER_NAME=$(head -n 1 ${PATH_MASTER_YAML} | cut -f2 -d ' ') From e6dbab75181cda89e780eb0428833526683a0755 Mon Sep 17 00:00:00 2001 From: boasvdp Date: Mon, 9 Sep 2024 10:19:29 +0200 Subject: [PATCH 8/9] fix: fix run_pipeline.sh --- run_pipeline.sh | 93 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 74 insertions(+), 19 deletions(-) diff --git a/run_pipeline.sh b/run_pipeline.sh index 531b56f..aa2c468 100755 --- a/run_pipeline.sh +++ b/run_pipeline.sh @@ -2,18 +2,29 @@ set -euo pipefail +set -x + #----------------------------------------------# # User parameters -if [ ! -z "${1}" ] || [ ! -z "${2}" ] #|| [ ! -z "${irods_input_projectID}" ] +if [ ! -z "${1}" ] || [ ! -z "${2}" ] || [ ! -z "${irods_input_projectID}" ] then input_dir="${1}" output_dir="${2}" -# PROJECT_NAME="${irods_input_projectID}" + PROJECT_NAME="${irods_input_projectID}" + EXCLUSION_FILE="" else echo "One of the parameters is missing, make sure there is an input directory, output directory and project name(param 1, 2 or irods_input_projectID)." exit 1 fi +set +u +#check if there is an exclusion file, if so change the parameter +if [ ! -z "${irods_input_sequencing__run_id}" ] && [ -f "/data/BioGrid/NGSlab/sample_sheets/${irods_input_sequencing__run_id}.exclude" ] +then + EXCLUSION_FILE="/data/BioGrid/NGSlab/sample_sheets/${irods_input_sequencing__run_id}.exclude" +fi +set -u + if [ ! -d "${input_dir}" ] || [ ! -d "${output_dir}" ] then echo "The input directory $input_dir, output directory $output_dir or fastq dir ${input_dir}/clean_fastq does not exist" @@ -23,29 +34,50 @@ else fi #----------------------------------------------# -# Create/update necessary environments -PATH_MAMBA_YAML="envs/mamba.yaml" -PATH_MASTER_YAML="envs/juno_variant_typing.yaml" -MAMBA_NAME=$(head -n 1 ${PATH_MAMBA_YAML} | cut -f2 -d ' ') -MASTER_NAME=$(head -n 1 ${PATH_MASTER_YAML} | cut -f2 -d ' ') +## make sure conda works -echo -e "\nUpdating necessary environments to run the pipeline..." - -# Removing strict mode because it sometimes breaks the code for -# activating an environment and for testing whether some variables -# are set or not -set +euo pipefail +# >>> conda initialize >>> +# !! Contents within this block are managed by 'conda init' !! +__conda_setup="$('/mnt/miniconda/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" +if [ $? -eq 0 ]; then + eval "$__conda_setup" +else + if [ -f "/mnt/miniconda/etc/profile.d/conda.sh" ]; then + . "/mnt/miniconda/etc/profile.d/conda.sh" + else + export PATH="/mnt/miniconda/bin:$PATH" + fi +fi +unset __conda_setup +# <<< conda initialize << Date: Thu, 12 Sep 2024 10:51:22 +0200 Subject: [PATCH 9/9] style: formatting --- workflow/rules/mtb_prepare_files.smk | 7 +++-- workflow/rules/mtb_typing.smk | 6 ++-- workflow/scripts/combine_lineage_calls.py | 34 +++++++++++++++++++---- 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/workflow/rules/mtb_prepare_files.smk b/workflow/rules/mtb_prepare_files.smk index 071afbd..18f78e4 100644 --- a/workflow/rules/mtb_prepare_files.smk +++ b/workflow/rules/mtb_prepare_files.smk @@ -45,13 +45,14 @@ rule copy_sample_vcf: cp {input.vcf} {output.vcf} 2>&1>{log} """ + # This rule adds VCF entries needed for lineage typing to the normal VCF # The VCF entries are only added if missing and contain only reference calls (0/0 Genotype) # fast-lineage-caller only considers variants in the VCF so these need to be added explicitly rule prepare_lineage_typing_vcf: input: lineage4_vcf="files/mtb/lineage4_lipworth.vcf", - vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf" + vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf", output: vcf=temp(OUT + "/mtb_typing/prepared_files/lineage_typing_vcf/{sample}.vcf"), conda: @@ -59,10 +60,10 @@ rule prepare_lineage_typing_vcf: container: "docker://staphb/bedtools:2.30.0" log: - OUT + "/log/prepare_lineage_typing_vcf/{sample}.log" + OUT + "/log/prepare_lineage_typing_vcf/{sample}.log", threads: config["threads"]["bedtools"] resources: - mem_gb=config["mem_gb"]["bedtools"] + mem_gb=config["mem_gb"]["bedtools"], shell: """ cat {input.vcf} <(bedtools intersect -a {input.lineage4_vcf} -b {input.vcf} -v) | \ diff --git a/workflow/rules/mtb_typing.smk b/workflow/rules/mtb_typing.smk index 739e6fe..aa65dde 100644 --- a/workflow/rules/mtb_typing.smk +++ b/workflow/rules/mtb_typing.smk @@ -23,6 +23,7 @@ fast-lineage-caller \ 2>&1>{log} """ + rule mtb_lineage_id_custom: input: vcf=OUT + "/mtb_typing/prepared_files/{sample}.vcf", @@ -40,7 +41,7 @@ rule mtb_lineage_id_custom: resources: mem_gb=config["mem_gb"]["fast-lineage-caller"], params: - scheme = "files/mtb/snpCL_scheme.tsv", + scheme="files/mtb/snpCL_scheme.tsv", shell: """ fast-lineage-caller \ @@ -50,13 +51,14 @@ fast-lineage-caller \ 2>&1>{log} """ + rule combine_lineage_typing: input: lineage_standard=OUT + "/mtb_typing/lineage_call_standard/{sample}.tsv", lineage_custom=OUT + "/mtb_typing/lineage_call_custom/{sample}.tsv", output: tsv=OUT + "/mtb_typing/lineage_call/{sample}.tsv", - conda: + conda: "../envs/scripts.yaml" log: OUT + "/log/combine_lineage_typing/{sample}.log", diff --git a/workflow/scripts/combine_lineage_calls.py b/workflow/scripts/combine_lineage_calls.py index e4197a3..fe02177 100644 --- a/workflow/scripts/combine_lineage_calls.py +++ b/workflow/scripts/combine_lineage_calls.py @@ -5,7 +5,18 @@ import re # define lineage4 sublineages from PMID 30789126 -lineage4_sublineages = ["ghana", "xtype", "haarlem", "ural", "tur", "lam", "stype", "uganda", "cameroon"] +lineage4_sublineages = [ + "ghana", + "xtype", + "haarlem", + "ural", + "tur", + "lam", + "stype", + "uganda", + "cameroon", +] + def parse_str_to_dict(lineage_str): lineages = lineage_str.split(",") @@ -21,6 +32,7 @@ def parse_str_to_dict(lineage_str): parsed_data[lineage_name] = (snp_counts, snp_total, snp_ratio) return parsed_data + def get_highest_ratio(lineage_dict): if len(lineage_dict) == 1: return list(lineage_dict.keys()) @@ -30,14 +42,21 @@ def get_highest_ratio(lineage_dict): # get the highest ratio max_ratio = max([lineage_dict[lineage][2] for lineage in lineage_dict.keys()]) # get the lineage(s) with the highest ratio - max_lineages = [lineage for lineage in lineage_dict.keys() if lineage_dict[lineage][2] == max_ratio] + max_lineages = [ + lineage + for lineage in lineage_dict.keys() + if lineage_dict[lineage][2] == max_ratio + ] return max_lineages + def decide_type(lineage_dict, sublineages): output = [] max_lineages = get_highest_ratio(lineage_dict) for lineage in max_lineages: - output.append(f"{lineage}({lineage_dict[lineage][0]}/{lineage_dict[lineage][1]})") + output.append( + f"{lineage}({lineage_dict[lineage][0]}/{lineage_dict[lineage][1]})" + ) if lineage == "lineage4": # find the next highest after lineage4 lineage_dict_copy = lineage_dict.copy() @@ -45,9 +64,12 @@ def decide_type(lineage_dict, sublineages): max_lineages_without_lineage4 = get_highest_ratio(lineage_dict_copy) for sublineage in max_lineages_without_lineage4: if sublineage in sublineages: - output.append(f"{sublineage}({lineage_dict[sublineage][0]}/{lineage_dict[sublineage][1]})") + output.append( + f"{sublineage}({lineage_dict[sublineage][0]}/{lineage_dict[sublineage][1]})" + ) return ",".join(sorted(output)) + def parse_value(value): """ Parse a column from fast-lineage-caller @@ -72,6 +94,7 @@ def parse_value(value): top_hit = decide_type(parsed_lineages, lineage4_sublineages) return top_hit + def parse_counts(df_input): df = df_input.copy() df.columns = [f"{col}_counts" if col != "Isolate" else col for col in df.columns] @@ -94,7 +117,6 @@ def main(args): df_custom = pd.read_csv(args.custom, sep="\t") - # merge all dataframes horizontally on Isolate merged_df = pd.merge(df_standard_parsed, df_custom, on="Isolate") @@ -112,4 +134,4 @@ def main(args): args = parser.parse_args() - main(args) \ No newline at end of file + main(args)