Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

3.2.2 #80

Merged
merged 5 commits into from
Feb 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
FROM mambaorg/micromamba:latest

LABEL Name=hicstuff Version=3.2.1
LABEL Name=hicstuff Version=3.2.2

COPY --chown=$MAMBA_USER:$MAMBA_USER . ./

Expand Down
2 changes: 1 addition & 1 deletion doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# The short X.Y version
version = "3.2"
# The full version, including alpha/beta/rc tags
release = "3.2.1"
release = "3.2.2"


# -- General configuration ---------------------------------------------------
Expand Down
22 changes: 5 additions & 17 deletions hicstuff/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -1435,7 +1435,7 @@ def check_is_fasta(in_file):

def check_fastq_entries(in_file):
"""
Check how many reads are in the input fastq. Requires zcat.
Check how many reads are in the input fastq.

Parameters
----------
Expand All @@ -1450,24 +1450,12 @@ def check_fastq_entries(in_file):

with open(in_file, 'rb') as f:
is_gzip = f.read(2) == b'\x1f\x8b'

if is_gzip:
n_lines = sp.run(
"zcat < {f} | wc -l".format(f = in_file),
stdout=sp.PIPE,
stderr=sp.PIPE,
shell = True,
encoding = 'utf-8'
).stdout[:-2].split(" ")[0]
with gzip.open(in_file, "rt") as input_fastq:
n_lines = sum(1 for line in input_fastq)
else:
n_lines = sp.run(
"wc -l {f}".format(f = in_file),
stdout=sp.PIPE,
stderr=sp.PIPE,
shell = True,
encoding = 'utf-8'
).stdout[:-2].split(" ")[0]

with open(in_file, "r") as input_fastq:
n_lines = sum(1 for line in input_fastq)
n_reads = int(n_lines)/4
return n_reads

Expand Down
48 changes: 47 additions & 1 deletion hicstuff/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import hicstuff.io as hio
import hicstuff.distance_law as hcdl
import hicstuff.cutsite as hcc
import hicstuff.stats as hcs
import matplotlib
import pathlib
from hicstuff.version import __version__
Expand Down Expand Up @@ -312,6 +313,9 @@ def filter_pcr_dup(pairs_idx_file, filtered_file):
"%d%% PCR duplicates have been filtered out (%d / %d pairs) "
% (100 * round(filter_count / reads_count, 3), filter_count, reads_count)
)
logger.info(
"%d pairs remaining after removing PCR duplicates", reads_count - filter_count
)


def pairs2cool(pairs_file, cool_file, bins_file, exclude):
Expand Down Expand Up @@ -883,6 +887,7 @@ def _out_file(fname):
pairs_idx = input1

# Perform genome alignment
nreads_input1 = 0
if start_stage == 0:

# Check number of reads in both fastqs
Expand All @@ -892,7 +897,7 @@ def _out_file(fname):
if (nreads_input1 != nreads_input2):
logger.error("Fastq files do not have the same number of reads.")
else:
logger.info("{n} reads found in each fastq file.".format(n = nreads_input1))
logger.info("{n} reads found in each fastq file.".format(n = int(nreads_input1)))

# Define mapping choice (default normal):
if mapping == "normal":
Expand Down Expand Up @@ -1006,6 +1011,26 @@ def _out_file(fname):
)
os.rename(pairs_idx + ".sorted", pairs_idx)

# Count total pairs
tot_pairs = 0
with open(pairs_idx, "r") as file:
for line in file:
if line.startswith('#'):
continue
else:
tot_pairs += 1
if nreads_input1 != 0:
logger.info(
"{0} pairs successfully mapped ({1}%)".format(
tot_pairs, round(100 * tot_pairs / (nreads_input1), 2)
)
)
else:
logger.info(
"{0} pairs successfully mapped".format(tot_pairs)
)

# Filter pairs if requested
if filter_events:
uncut_thr, loop_thr = hcf.get_thresholds(
pairs_idx, plot_events=plot, fig_path=dist_plot, prefix=prefix
Expand Down Expand Up @@ -1067,6 +1092,19 @@ def _out_file(fname):
# Build matrix from pairs.
if mat_fmt == "cool":

# Log which pairs file is being used and how many pairs are listed
pairs_count = 0
with open(use_pairs, "r") as file:
for line in file:
if line.startswith('#'):
continue
else:
pairs_count += 1
logger.info(
"Generating matrix from pairs file %s (%d pairs in the file) ",
use_pairs, pairs_count
)

# Name matrix file in .cool
mat = os.path.splitext(mat)[0] + ".cool"

Expand Down Expand Up @@ -1099,6 +1137,14 @@ def _out_file(fname):
tmp_dir=tmp_dir,
)

# Get stats on the pipeline
try:
logger.info("Fetching mapping and pairing stats")
hcs.get_pipeline_stats(prefix, out_dir, log_file)
except IndexError:
logger.warning("IndexError. Stats not compiled.")
pass

# Move final pairs file to main dir.
p = pathlib.Path(use_pairs).absolute()
pairsf = p.parents[1] / p.name
Expand Down
154 changes: 154 additions & 0 deletions hicstuff/stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
from os.path import join
import json

def get_pipeline_stats(prefix, out_dir, log_file):
    """Compile mapping and pairing statistics after a pipeline run.

    Parses the hicstuff log file for the numbers reported at each stage of
    the pipeline and writes them to ``<out_dir>/<prefix>.stats.txt`` (human
    readable) and ``<out_dir>/<prefix>.stats.json`` (machine readable).

    Parameters
    ----------
    prefix : str
        The prefix used to create output files by the pipeline.
    out_dir : str
        Output directory where the stats files are written.
    log_file : str
        Path to hicstuff log file.

    Returns
    -------
    None
        Side effect only: writes a text file and a JSON file containing
        stats about hicstuff pipeline execution:
        - Number of sequenced pairs from fastqs
        - Number (% of total reads) of unmapped reads
        - Number (% of total reads) of mapped reads
        - Number (% of total reads) of pairs
        - Number (% of total pairs) of filtered pairs
        - Number (% of total pairs) of loops
        - Number (% of total pairs) of uncuts
        - Number (% of total pairs) of abnormal (-- and ++)
        - Number (% of total pairs) of PCR duplicates

    Raises
    ------
    IndexError
        If a mandatory log line (fastq read count, mapped read count or
        pair count) is absent from the log, e.g. when the pipeline was
        started from a later stage. The caller is expected to catch this.
    """

    with open(log_file) as file:
        log_lines = [line.rstrip() for line in file]

    # 1. Number of sequenced pairs from fastqs.
    # The searched text must match the message logged by the pipeline
    # ("{n} reads found in each fastq file.") exactly, otherwise the [0]
    # below raises IndexError and no stats are compiled.
    fastq_pairs = [s for s in log_lines if re.search("reads found in each fastq file.", s)][0]
    fastq_pairs = re.sub(".*INFO :: ", "", fastq_pairs)
    fastq_pairs = re.sub(" reads found in each fastq file.*", "", fastq_pairs)
    fastq_pairs = int(float(fastq_pairs))

    # 2. Number (% of total) of (un)mapped reads.
    # Reads are counted individually here, hence the `fastq_pairs * 2`.
    tot_mapped = [s for s in log_lines if re.search("mapped with Q ", s)][0]
    tot_mapped = re.sub(r".*Q >= 30 \(", "", tot_mapped)
    tot_mapped = re.sub("/.*", "", tot_mapped)
    tot_mapped = int(float(tot_mapped))
    tot_unmapped = fastq_pairs * 2 - tot_mapped
    pct_mapped = round(tot_mapped / (fastq_pairs * 2) * 100, 2)
    pct_unmapped = round(tot_unmapped / (fastq_pairs * 2) * 100, 2)

    # 3. Number (% of total) of pairs.
    tot_pairs = [s for s in log_lines if re.search("pairs successfully mapped", s)][0]
    tot_pairs = re.sub(".*INFO :: ", "", tot_pairs)
    tot_pairs = re.sub(" pairs successfully.*", "", tot_pairs)
    tot_pairs = int(float(tot_pairs))
    pct_pairs = round(tot_pairs / fastq_pairs * 100, 2)

    # 4. Number (% of total) of filtered pairs.
    # Event filtering is optional, so the log line may be absent; in that
    # case all filtering counters default to 0.
    filtered_pairs = [s for s in log_lines if re.search("pairs discarded:", s)]
    if len(filtered_pairs) > 0:
        filtered_pairs = filtered_pairs[0]
        loops_pairs = re.sub(".*Loops: ", "", filtered_pairs)
        loops_pairs = re.sub(", Uncuts:.*", "", loops_pairs)
        loops_pairs = int(float(loops_pairs))
        uncuts_pairs = re.sub(".*Uncuts: ", "", filtered_pairs)
        uncuts_pairs = re.sub(", Weirds:.*", "", uncuts_pairs)
        uncuts_pairs = int(float(uncuts_pairs))
        abnormal_pairs = re.sub(".*Weirds: ", "", filtered_pairs)
        abnormal_pairs = int(float(abnormal_pairs))
        filtered_pairs = re.sub(".*INFO :: ", "", filtered_pairs)
        filtered_pairs = re.sub(" pairs discarded.*", "", filtered_pairs)
        filtered_pairs = int(float(filtered_pairs))
    else:
        loops_pairs = 0
        uncuts_pairs = 0
        abnormal_pairs = 0
        filtered_pairs = 0
    pct_filtered = round(filtered_pairs / tot_pairs * 100, 2)
    pct_loops_pairs = round(loops_pairs / tot_pairs * 100, 2)
    pct_uncuts_pairs = round(uncuts_pairs / tot_pairs * 100, 2)
    pct_abnormal_pairs = round(abnormal_pairs / tot_pairs * 100, 2)

    # 5. Number (% of total) of PCR duplicate pairs.
    # PCR duplicate filtering is optional as well; default to 0 if absent.
    PCR_pairs = [s for s in log_lines if re.search("PCR duplicates have been filtered", s)]
    if len(PCR_pairs) > 0:
        PCR_pairs = PCR_pairs[0]
        PCR_pairs = re.sub(r".*have been filtered out \(", "", PCR_pairs)
        PCR_pairs = re.sub(" / .*", "", PCR_pairs)
        PCR_pairs = int(float(PCR_pairs))
    else:
        PCR_pairs = 0
    pct_PCR = round(PCR_pairs / tot_pairs * 100, 2)

    # 6. Number (%) of final pairs kept in the matrix.
    kept_pairs = tot_pairs - filtered_pairs - PCR_pairs
    pct_kept = round(kept_pairs / tot_pairs * 100, 2)

    # Write the human-readable stats file.
    stats_file_path = join(out_dir, prefix + ".stats.txt")
    with open(stats_file_path, 'w') as stats_file:
        stats_file.write("Sample: {prefix}\n".format(prefix = prefix))
        stats_file.write("Total read pairs: {reads}\n".format(reads = "{:,}".format(fastq_pairs)))
        stats_file.write("Mapped reads: {tot_mapped} ({pct_mapped}% of all reads)\n".format(
            tot_mapped = "{:,}".format(tot_mapped),
            pct_mapped = pct_mapped
            )
        )
        stats_file.write("Unmapped reads: {tot_unmapped} ({pct_unmapped}% of all reads)\n".format(
            tot_unmapped = "{:,}".format(tot_unmapped), pct_unmapped = pct_unmapped
        ))
        stats_file.write("Recovered contacts: {tot_pairs} ({pct_pairs}% of all read pairs)\n".format(
            tot_pairs = "{:,}".format(tot_pairs), pct_pairs = pct_pairs
        ))
        stats_file.write("Final contacts: {kept_pairs} ({pct_kept}% of all contacts)\n".format(
            kept_pairs = "{:,}".format(kept_pairs), pct_kept = pct_kept
        ))
        stats_file.write("  Filtered out: {filtered_pairs} ({pct_filtered}% of all contacts)\n".format(
            filtered_pairs = "{:,}".format(filtered_pairs), pct_filtered = pct_filtered
        ))
        stats_file.write("    Loops: {loops_pairs} ({pct_loops_pairs}% of all contacts)\n".format(
            loops_pairs = "{:,}".format(loops_pairs), pct_loops_pairs = pct_loops_pairs
        ))
        stats_file.write("    Uncuts: {uncuts_pairs} ({pct_uncuts_pairs}% of all contacts)\n".format(
            uncuts_pairs = "{:,}".format(uncuts_pairs), pct_uncuts_pairs = pct_uncuts_pairs
        ))
        stats_file.write("    Weirds: {abnormal_pairs} ({pct_abnormal_pairs}% of all contacts)\n".format(
            abnormal_pairs = "{:,}".format(abnormal_pairs), pct_abnormal_pairs = pct_abnormal_pairs
        ))
        stats_file.write("  PCR duplicates: {PCR_pairs} ({pct_PCR}% of all contacts)\n".format(
            PCR_pairs = "{:,}".format(PCR_pairs), pct_PCR = pct_PCR
        ))

    # Write the machine-readable JSON stats file with the same numbers.
    stats = {
        'Total read pairs': fastq_pairs,
        'Mapped reads': tot_mapped,
        'Unmapped reads': tot_unmapped,
        'Recovered contacts': tot_pairs,
        'Final contacts': kept_pairs,
        'Filtered out': filtered_pairs,
        'Loops': loops_pairs,
        'Uncuts': uncuts_pairs,
        'Weirds': abnormal_pairs,
        'PCR duplicates': PCR_pairs
    }
    stats_json_path = join(out_dir, prefix + ".stats.json")
    with open(stats_json_path, 'w') as json_file:
        json.dump(stats, json_file, indent=4)

2 changes: 1 addition & 1 deletion hicstuff/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '3.2.1'
__version__ = '3.2.2'
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

MAJOR = 3
MINOR = 2
MAINTENANCE = 1
MAINTENANCE = 2
VERSION = "{}.{}.{}".format(MAJOR, MINOR, MAINTENANCE)

LICENSE = "BSD-3-Clause"
Expand Down
Loading