From 4b7b9e29a9bd0cbd89e2e629c21d37a631ad8bd3 Mon Sep 17 00:00:00 2001 From: js2264 Date: Wed, 14 Feb 2024 20:32:12 +0100 Subject: [PATCH 1/5] feat: print out PCR% by logger --- hicstuff/pipeline.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hicstuff/pipeline.py b/hicstuff/pipeline.py index 7a996e3..2309dd1 100644 --- a/hicstuff/pipeline.py +++ b/hicstuff/pipeline.py @@ -312,6 +312,9 @@ def filter_pcr_dup(pairs_idx_file, filtered_file): "%d%% PCR duplicates have been filtered out (%d / %d pairs) " % (100 * round(filter_count / reads_count, 3), filter_count, reads_count) ) + logger.info( + "%d pairs remaining after removing PCR duplicates", reads_count - filter_count + ) def pairs2cool(pairs_file, cool_file, bins_file, exclude): From e78e7ea912f73c01f099324280b969a9208dd33c Mon Sep 17 00:00:00 2001 From: js2264 Date: Wed, 14 Feb 2024 20:33:09 +0100 Subject: [PATCH 2/5] feat: print out which pairs file (and number of pairs) is used when parsing into cool file --- hicstuff/pipeline.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/hicstuff/pipeline.py b/hicstuff/pipeline.py index 2309dd1..245d6fd 100644 --- a/hicstuff/pipeline.py +++ b/hicstuff/pipeline.py @@ -1070,6 +1070,19 @@ def _out_file(fname): # Build matrix from pairs. if mat_fmt == "cool": + # Log which pairs file is being used and how many pairs are listed + pairs_count = 0 + with open(use_pairs, "r") as file: + for line in file: + if line.startswith('#'): + continue + else: + pairs_count += 1 + logger.info( + "Generating matrix from pairs file %s (%d pairs in the file) ", + use_pairs, pairs_count + ) + # Name matrix file in .cool mat = os.path.splitext(mat)[0] + ".cool" From 680e56a248ab75f12883e95a223f48a9abad6044 Mon Sep 17 00:00:00 2001 From: js2264 Date: Thu, 15 Feb 2024 09:45:04 +0100 Subject: [PATCH 3/5] fix: count reads in fastq without zcat --- hicstuff/io.py | 22 +++++----------------- hicstuff/pipeline.py | 3 ++- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/hicstuff/io.py b/hicstuff/io.py index 9266c9e..e60110a 100644 --- a/hicstuff/io.py +++ b/hicstuff/io.py @@ -1435,7 +1435,7 @@ def check_is_fasta(in_file): def check_fastq_entries(in_file): """ - Check how many reads are in the input fastq. Requires zcat. + Check how many reads are in the input fastq. Parameters ---------- @@ -1450,24 +1450,12 @@ def check_fastq_entries(in_file): with open(in_file, 'rb') as f: is_gzip = f.read(2) == b'\x1f\x8b' - if is_gzip: - n_lines = sp.run( - "zcat < {f} | wc -l".format(f = in_file), - stdout=sp.PIPE, - stderr=sp.PIPE, - shell = True, - encoding = 'utf-8' - ).stdout[:-2].split(" ")[0] + with gzip.open(in_file, "rt") as input_fastq: + n_lines = sum(1 for line in input_fastq) else: - n_lines = sp.run( - "wc -l {f}".format(f = in_file), - stdout=sp.PIPE, - stderr=sp.PIPE, - shell = True, - encoding = 'utf-8' - ).stdout[:-2].split(" ")[0] - + with open(in_file, "r") as input_fastq: + n_lines = sum(1 for line in input_fastq) n_reads = int(n_lines)/4 return n_reads diff --git a/hicstuff/pipeline.py b/hicstuff/pipeline.py index 245d6fd..81988c3 100644 --- a/hicstuff/pipeline.py +++ b/hicstuff/pipeline.py @@ -886,6 +886,7 @@ def _out_file(fname): pairs_idx = input1 # Perform genome alignment + nreads_input1 = 0 if start_stage == 0: # Check number of reads in both fastqs @@ -895,7 +896,7 @@ def _out_file(fname): if (nreads_input1 != nreads_input2): logger.error("Fastq files do not have the same number of reads.") else: - logger.info("{n} reads found in each fastq file.".format(n = nreads_input1)) + logger.info("{n} reads found in each fastq file.".format(n = int(nreads_input1))) # Define mapping choice (default normal): if mapping == "normal": From 294f8f3d95378b32fecfcc873eacc83224ade2ce Mon Sep 17 00:00:00 2001 From: js2264 Date: Thu, 15 Feb 2024 09:45:38 +0100 Subject: [PATCH 4/5] feat: add stats module --- hicstuff/pipeline.py | 29 ++++++++ hicstuff/stats.py | 154 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 183 insertions(+) create mode 100644 hicstuff/stats.py diff --git a/hicstuff/pipeline.py b/hicstuff/pipeline.py index 81988c3..e022e4e 100644 --- a/hicstuff/pipeline.py +++ b/hicstuff/pipeline.py @@ -23,6 +23,7 @@ import hicstuff.io as hio import hicstuff.distance_law as hcdl import hicstuff.cutsite as hcc +import hicstuff.stats as hcs import matplotlib import pathlib from hicstuff.version import __version__ @@ -1010,6 +1011,26 @@ def _out_file(fname): ) os.rename(pairs_idx + ".sorted", pairs_idx) + # Count total pairs + tot_pairs = 0 + with open(pairs_idx, "r") as file: + for line in file: + if line.startswith('#'): + continue + else: + tot_pairs += 1 + if nreads_input1 != 0: + logger.info( + "{0} pairs successfully mapped ({1}%)".format( + tot_pairs, round(100 * tot_pairs / (nreads_input1), 2) + ) + ) + else: + logger.info( + "{0} pairs successfully mapped".format(tot_pairs) + ) + + # Filter pairs if requested if filter_events: uncut_thr, loop_thr = hcf.get_thresholds( pairs_idx, plot_events=plot, fig_path=dist_plot, prefix=prefix @@ -1116,6 +1137,14 @@ def _out_file(fname): tmp_dir=tmp_dir, ) + # Get stats on the pipeline + try: + logger.info("Fetching mapping and pairing stats") + hcs.get_pipeline_stats(prefix, out_dir, log_file) + except IndexError: + logger.warning("IndexError. Stats not compiled.") + pass + # Move final pairs file to main dir. p = pathlib.Path(use_pairs).absolute() pairsf = p.parents[1] / p.name diff --git a/hicstuff/stats.py b/hicstuff/stats.py new file mode 100644 index 0000000..3cce2e9 --- /dev/null +++ b/hicstuff/stats.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import re +from os.path import join +import json + +def get_pipeline_stats(prefix, out_dir, log_file): + """Get stats after pipeline execution. + + Parameters + ---------- + prefix : str + The prefix used to create output files by the pipeline. + out_dir : str + The prefix used to create output files by the pipeline. + log_file : str + Path to hicstuff log file. + + Returns + ------- + txt file: + A single text file containing stats about hicstuff pipeline execution: + - Number of sequenced pairs from fastqs + - Number (% of total reads) of unmapped reads + - Number (% of total reads) of mapped reads + - Number (% of total reads) of pairs + - Number (% of total pairs) of filtered pairs + - Number (% of total pairs) of loops + - Number (% of total pairs) of uncuts + - Number (% of total pairs) of abnormal (-- and ++) + - Number (% of total pairs) of deduplicated pairs [Number (% of total pairs) of PCR duplicates] + - From all pairs used in contact matrix: + - Number (% of matrix) of cis pairs + - Number (% of matrix) of trans pairs + - Trans ratio + """ + + with open(log_file) as file: + log_lines = [line.rstrip() for line in file] + + # 1. Number of sequenced pairs from fastqs + fastq_pairs = [s for s in log_lines if re.search("reads found in eajch fastq file.", s)][0] + fastq_pairs = re.sub(".*INFO :: ", "", fastq_pairs) + fastq_pairs = re.sub(" reads found in each fastq file.*", "", fastq_pairs) + fastq_pairs = int(float(fastq_pairs)) + + # 2. Number (% of total) of (un)mapped reads + tot_mapped = [s for s in log_lines if re.search("mapped with Q ", s)][0] + tot_mapped = re.sub(".*Q >= 30 \(", "", tot_mapped) + tot_mapped = re.sub("/.*", "", tot_mapped) + tot_mapped = int(float(tot_mapped)) + tot_unmapped = fastq_pairs*2 - tot_mapped + pct_mapped = round(tot_mapped/(fastq_pairs*2)*100, 2) + pct_unmapped = round(tot_unmapped/(fastq_pairs*2)*100, 2) + + # 3. Number (% of total) of pairs + tot_pairs = [s for s in log_lines if re.search("pairs successfully mapped", s)][0] + tot_pairs = re.sub(".*INFO :: ", "", tot_pairs) + tot_pairs = re.sub(" pairs successfully.*", "", tot_pairs) + tot_pairs = int(float(tot_pairs)) + pct_pairs = round(tot_pairs/fastq_pairs*100, 2) + + # 4. Number (% of total) of filtered pairs + filtered_pairs = [s for s in log_lines if re.search("pairs discarded:", s)] + if (len(filtered_pairs) > 0): + filtered_pairs = filtered_pairs[0] + loops_pairs = re.sub(".*Loops: ", "", filtered_pairs) + loops_pairs = re.sub(", Uncuts:.*", "", loops_pairs) + loops_pairs = int(float(loops_pairs)) + uncuts_pairs = re.sub(".*Uncuts: ", "", filtered_pairs) + uncuts_pairs = re.sub(", Weirds:.*", "", uncuts_pairs) + uncuts_pairs = int(float(uncuts_pairs)) + abnormal_pairs = re.sub(".*Weirds: ", "", filtered_pairs) + abnormal_pairs = int(float(abnormal_pairs)) + filtered_pairs = re.sub(".*INFO :: ", "", filtered_pairs) + filtered_pairs = re.sub(" pairs discarded.*", "", filtered_pairs) + filtered_pairs = int(float(filtered_pairs)) + else: + loops_pairs=0 + uncuts_pairs=0 + abnormal_pairs=0 + filtered_pairs=0 + pct_filtered = round(filtered_pairs/tot_pairs*100, 2) + pct_loops_pairs = round(loops_pairs/tot_pairs*100, 2) + pct_uncuts_pairs = round(uncuts_pairs/tot_pairs*100, 2) + pct_abnormal_pairs = round(abnormal_pairs/tot_pairs*100, 2) + + # 5. Number (% of total) of PCR dups pairs + PCR_pairs = [s for s in log_lines if re.search("PCR duplicates have been filtered", s)] + if (len(PCR_pairs) > 0): + PCR_pairs = PCR_pairs[0] + PCR_pairs = re.sub(".*have been filtered out \(", "", PCR_pairs) + PCR_pairs = re.sub(" / .*", "", PCR_pairs) + PCR_pairs = int(float(PCR_pairs)) + else: + PCR_pairs = 0 + pct_PCR = round(PCR_pairs/tot_pairs*100, 2) + + # 6. Number (%) of final pairs + kept_pairs=tot_pairs-filtered_pairs-PCR_pairs + pct_kept = round(kept_pairs/tot_pairs*100, 2) + + # Open the log file and read its contents + stats_file_path = join(out_dir, prefix + ".stats.txt") + with open(stats_file_path, 'w') as stats_file: + stats_file.write("Sample: {prefix}\n".format(prefix = prefix)) + stats_file.write("Total read pairs: {reads}\n".format(reads = "{:,}".format(fastq_pairs))) + stats_file.write("Mapped reads: {tot_mapped} ({pct_mapped}% of all reads)\n".format( + tot_mapped = "{:,}".format(tot_mapped), + pct_mapped = pct_mapped + ) + ) + stats_file.write("Unmapped reads: {tot_unmapped} ({pct_unmapped}% of all reads)\n".format( + tot_unmapped = "{:,}".format(tot_unmapped), pct_unmapped = pct_unmapped + )) + stats_file.write("Recovered contacts: {tot_pairs} ({pct_pairs}% of all read pairs)\n".format( + tot_pairs = "{:,}".format(tot_pairs), pct_pairs = pct_pairs + )) + stats_file.write("Final contacts: {kept_pairs} ({pct_kept}% of all contacts)\n".format( + kept_pairs = "{:,}".format(kept_pairs), pct_kept = pct_kept + )) + stats_file.write(" Filtered out: {filtered_pairs} ({pct_filtered}% of all contacts)\n".format( + filtered_pairs = "{:,}".format(filtered_pairs), pct_filtered = pct_filtered + )) + stats_file.write(" Loops: {loops_pairs} ({pct_loops_pairs}% of all contacts)\n".format( + loops_pairs = "{:,}".format(loops_pairs), pct_loops_pairs = pct_loops_pairs + )) + stats_file.write(" Uncuts: {uncuts_pairs} ({pct_uncuts_pairs}% of all contacts)\n".format( + uncuts_pairs = "{:,}".format(uncuts_pairs), pct_uncuts_pairs = pct_uncuts_pairs + )) + stats_file.write(" Weirds: {abnormal_pairs} ({pct_abnormal_pairs}% of all contacts)\n".format( + abnormal_pairs = "{:,}".format(abnormal_pairs), pct_abnormal_pairs = pct_abnormal_pairs + )) + stats_file.write(" PCR duplicates: {PCR_pairs} ({pct_PCR}% of all contacts)\n".format( + PCR_pairs = "{:,}".format(PCR_pairs), pct_PCR = pct_PCR + )) + + stats = { + 'Total read pairs': fastq_pairs, + 'Mapped reads': tot_mapped, + 'Unmapped reads': tot_unmapped, + 'Recovered contacts': tot_pairs, + 'Final contacts': kept_pairs, + 'Filtered out': filtered_pairs, + 'Loops': loops_pairs, + 'Uncuts': uncuts_pairs, + 'Weirds': abnormal_pairs, + 'PCR duplicates': PCR_pairs + } + stats_json_path = join(out_dir, prefix + ".stats.json") + with open(stats_json_path, 'w') as json_file: + json.dump(stats, json_file, indent=4) + From 297665ee5c6c779587f630f8501e4ddd35b53525 Mon Sep 17 00:00:00 2001 From: js2264 Date: Thu, 15 Feb 2024 10:23:14 +0100 Subject: [PATCH 5/5] bump to v3.2.2 --- Dockerfile | 2 +- doc/conf.py | 2 +- hicstuff/version.py | 2 +- setup.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index d8882fd..73673e0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM mambaorg/micromamba:latest -LABEL Name=hicstuff Version=3.2.1 +LABEL Name=hicstuff Version=3.2.2 COPY --chown=$MAMBA_USER:$MAMBA_USER . ./ diff --git a/doc/conf.py b/doc/conf.py index ce44702..82ba5aa 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -26,7 +26,7 @@ # The short X.Y version version = "3.2" # The full version, including alpha/beta/rc tags -release = "3.2.1" +release = "3.2.2" # -- General configuration --------------------------------------------------- diff --git a/hicstuff/version.py b/hicstuff/version.py index b50da94..29e4a94 100644 --- a/hicstuff/version.py +++ b/hicstuff/version.py @@ -1 +1 @@ -__version__ = '3.2.1' +__version__ = '3.2.2' diff --git a/setup.py b/setup.py index 0f245ea..4ec3651 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ MAJOR = 3 MINOR = 2 -MAINTENANCE = 1 +MAINTENANCE = 2 VERSION = "{}.{}.{}".format(MAJOR, MINOR, MAINTENANCE) LICENSE = "BSD-3-Clause"