Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

3.2.2 #80

Merged
merged 5 commits into from
Feb 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
FROM mambaorg/micromamba:latest

LABEL Name=hicstuff Version=3.2.1
LABEL Name=hicstuff Version=3.2.2

COPY --chown=$MAMBA_USER:$MAMBA_USER . ./

Expand Down
2 changes: 1 addition & 1 deletion doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# The short X.Y version
version = "3.2"
# The full version, including alpha/beta/rc tags
release = "3.2.1"
release = "3.2.2"


# -- General configuration ---------------------------------------------------
Expand Down
22 changes: 5 additions & 17 deletions hicstuff/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -1435,7 +1435,7 @@ def check_is_fasta(in_file):

def check_fastq_entries(in_file):
"""
Check how many reads are in the input fastq. Requires zcat.
Check how many reads are in the input fastq.

Parameters
----------
Expand All @@ -1450,24 +1450,12 @@ def check_fastq_entries(in_file):

with open(in_file, 'rb') as f:
is_gzip = f.read(2) == b'\x1f\x8b'

if is_gzip:
n_lines = sp.run(
"zcat < {f} | wc -l".format(f = in_file),
stdout=sp.PIPE,
stderr=sp.PIPE,
shell = True,
encoding = 'utf-8'
).stdout[:-2].split(" ")[0]
with gzip.open(in_file, "rt") as input_fastq:
n_lines = sum(1 for line in input_fastq)
else:
n_lines = sp.run(
"wc -l {f}".format(f = in_file),
stdout=sp.PIPE,
stderr=sp.PIPE,
shell = True,
encoding = 'utf-8'
).stdout[:-2].split(" ")[0]

with open(in_file, "r") as input_fastq:
n_lines = sum(1 for line in input_fastq)
n_reads = int(n_lines)/4
return n_reads

Expand Down
48 changes: 47 additions & 1 deletion hicstuff/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import hicstuff.io as hio
import hicstuff.distance_law as hcdl
import hicstuff.cutsite as hcc
import hicstuff.stats as hcs
import matplotlib
import pathlib
from hicstuff.version import __version__
Expand Down Expand Up @@ -312,6 +313,9 @@ def filter_pcr_dup(pairs_idx_file, filtered_file):
"%d%% PCR duplicates have been filtered out (%d / %d pairs) "
% (100 * round(filter_count / reads_count, 3), filter_count, reads_count)
)
logger.info(
"%d pairs remaining after removing PCR duplicates", reads_count - filter_count
)


def pairs2cool(pairs_file, cool_file, bins_file, exclude):
Expand Down Expand Up @@ -883,6 +887,7 @@ def _out_file(fname):
pairs_idx = input1

# Perform genome alignment
nreads_input1 = 0
if start_stage == 0:

# Check number of reads in both fastqs
Expand All @@ -892,7 +897,7 @@ def _out_file(fname):
if (nreads_input1 != nreads_input2):
logger.error("Fastq files do not have the same number of reads.")
else:
logger.info("{n} reads found in each fastq file.".format(n = nreads_input1))
logger.info("{n} reads found in each fastq file.".format(n = int(nreads_input1)))

# Define mapping choice (default normal):
if mapping == "normal":
Expand Down Expand Up @@ -1006,6 +1011,26 @@ def _out_file(fname):
)
os.rename(pairs_idx + ".sorted", pairs_idx)

# Count total pairs
tot_pairs = 0
with open(pairs_idx, "r") as file:
for line in file:
if line.startswith('#'):
continue
else:
tot_pairs += 1
if nreads_input1 != 0:
logger.info(
"{0} pairs successfully mapped ({1}%)".format(
tot_pairs, round(100 * tot_pairs / (nreads_input1), 2)
)
)
else:
logger.info(
"{0} pairs successfully mapped".format(tot_pairs)
)

# Filter pairs if requested
if filter_events:
uncut_thr, loop_thr = hcf.get_thresholds(
pairs_idx, plot_events=plot, fig_path=dist_plot, prefix=prefix
Expand Down Expand Up @@ -1067,6 +1092,19 @@ def _out_file(fname):
# Build matrix from pairs.
if mat_fmt == "cool":

# Log which pairs file is being used and how many pairs are listed
pairs_count = 0
with open(use_pairs, "r") as file:
for line in file:
if line.startswith('#'):
continue
else:
pairs_count += 1
logger.info(
"Generating matrix from pairs file %s (%d pairs in the file) ",
use_pairs, pairs_count
)

# Name matrix file in .cool
mat = os.path.splitext(mat)[0] + ".cool"

Expand Down Expand Up @@ -1099,6 +1137,14 @@ def _out_file(fname):
tmp_dir=tmp_dir,
)

# Get stats on the pipeline
try:
logger.info("Fetching mapping and pairing stats")
hcs.get_pipeline_stats(prefix, out_dir, log_file)
except IndexError:
logger.warning("IndexError. Stats not compiled.")
pass

# Move final pairs file to main dir.
p = pathlib.Path(use_pairs).absolute()
pairsf = p.parents[1] / p.name
Expand Down
154 changes: 154 additions & 0 deletions hicstuff/stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
from os.path import join
import json

def get_pipeline_stats(prefix, out_dir, log_file):
    """Compile mapping and pairing statistics after a pipeline run.

    Parses the hicstuff log file for the numbers reported at each stage of
    the pipeline and writes them to ``<out_dir>/<prefix>.stats.txt`` (human
    readable) and ``<out_dir>/<prefix>.stats.json`` (machine readable).

    Parameters
    ----------
    prefix : str
        The prefix used to create output files by the pipeline.
    out_dir : str
        Output directory where the stats files are written.
    log_file : str
        Path to hicstuff log file.

    Returns
    -------
    None
        Side effect only: writes a text file and a JSON file containing
        stats about hicstuff pipeline execution:
        - Number of sequenced pairs from fastqs
        - Number (% of total reads) of unmapped reads
        - Number (% of total reads) of mapped reads
        - Number (% of total reads) of pairs
        - Number (% of total pairs) of filtered pairs
        - Number (% of total pairs) of loops
        - Number (% of total pairs) of uncuts
        - Number (% of total pairs) of abnormal (-- and ++)
        - Number (% of total pairs) of PCR duplicates

    Raises
    ------
    IndexError
        If a mandatory log line (fastq read count, mapped read count or
        pair count) is absent from the log, e.g. when the pipeline was
        started from a later stage. The caller is expected to catch this.
    """

    with open(log_file) as file:
        log_lines = [line.rstrip() for line in file]

    # 1. Number of sequenced pairs from fastqs.
    # The searched text must match the message logged by the pipeline
    # ("{n} reads found in each fastq file.") exactly, otherwise the [0]
    # below raises IndexError and no stats are compiled.
    fastq_pairs = [s for s in log_lines if re.search("reads found in each fastq file.", s)][0]
    fastq_pairs = re.sub(".*INFO :: ", "", fastq_pairs)
    fastq_pairs = re.sub(" reads found in each fastq file.*", "", fastq_pairs)
    fastq_pairs = int(float(fastq_pairs))

    # 2. Number (% of total) of (un)mapped reads.
    # Reads are counted individually here, hence the `fastq_pairs * 2`.
    tot_mapped = [s for s in log_lines if re.search("mapped with Q ", s)][0]
    tot_mapped = re.sub(r".*Q >= 30 \(", "", tot_mapped)
    tot_mapped = re.sub("/.*", "", tot_mapped)
    tot_mapped = int(float(tot_mapped))
    tot_unmapped = fastq_pairs * 2 - tot_mapped
    pct_mapped = round(tot_mapped / (fastq_pairs * 2) * 100, 2)
    pct_unmapped = round(tot_unmapped / (fastq_pairs * 2) * 100, 2)

    # 3. Number (% of total) of pairs.
    tot_pairs = [s for s in log_lines if re.search("pairs successfully mapped", s)][0]
    tot_pairs = re.sub(".*INFO :: ", "", tot_pairs)
    tot_pairs = re.sub(" pairs successfully.*", "", tot_pairs)
    tot_pairs = int(float(tot_pairs))
    pct_pairs = round(tot_pairs / fastq_pairs * 100, 2)

    # 4. Number (% of total) of filtered pairs.
    # Event filtering is optional, so the log line may be absent; in that
    # case all filtering counters default to 0.
    filtered_pairs = [s for s in log_lines if re.search("pairs discarded:", s)]
    if len(filtered_pairs) > 0:
        filtered_pairs = filtered_pairs[0]
        loops_pairs = re.sub(".*Loops: ", "", filtered_pairs)
        loops_pairs = re.sub(", Uncuts:.*", "", loops_pairs)
        loops_pairs = int(float(loops_pairs))
        uncuts_pairs = re.sub(".*Uncuts: ", "", filtered_pairs)
        uncuts_pairs = re.sub(", Weirds:.*", "", uncuts_pairs)
        uncuts_pairs = int(float(uncuts_pairs))
        abnormal_pairs = re.sub(".*Weirds: ", "", filtered_pairs)
        abnormal_pairs = int(float(abnormal_pairs))
        filtered_pairs = re.sub(".*INFO :: ", "", filtered_pairs)
        filtered_pairs = re.sub(" pairs discarded.*", "", filtered_pairs)
        filtered_pairs = int(float(filtered_pairs))
    else:
        loops_pairs = 0
        uncuts_pairs = 0
        abnormal_pairs = 0
        filtered_pairs = 0
    pct_filtered = round(filtered_pairs / tot_pairs * 100, 2)
    pct_loops_pairs = round(loops_pairs / tot_pairs * 100, 2)
    pct_uncuts_pairs = round(uncuts_pairs / tot_pairs * 100, 2)
    pct_abnormal_pairs = round(abnormal_pairs / tot_pairs * 100, 2)

    # 5. Number (% of total) of PCR duplicate pairs.
    # PCR duplicate filtering is optional as well; default to 0 if absent.
    PCR_pairs = [s for s in log_lines if re.search("PCR duplicates have been filtered", s)]
    if len(PCR_pairs) > 0:
        PCR_pairs = PCR_pairs[0]
        PCR_pairs = re.sub(r".*have been filtered out \(", "", PCR_pairs)
        PCR_pairs = re.sub(" / .*", "", PCR_pairs)
        PCR_pairs = int(float(PCR_pairs))
    else:
        PCR_pairs = 0
    pct_PCR = round(PCR_pairs / tot_pairs * 100, 2)

    # 6. Number (%) of final pairs kept in the matrix.
    kept_pairs = tot_pairs - filtered_pairs - PCR_pairs
    pct_kept = round(kept_pairs / tot_pairs * 100, 2)

    # Write the human-readable stats file.
    stats_file_path = join(out_dir, prefix + ".stats.txt")
    with open(stats_file_path, 'w') as stats_file:
        stats_file.write("Sample: {prefix}\n".format(prefix = prefix))
        stats_file.write("Total read pairs: {reads}\n".format(reads = "{:,}".format(fastq_pairs)))
        stats_file.write("Mapped reads: {tot_mapped} ({pct_mapped}% of all reads)\n".format(
            tot_mapped = "{:,}".format(tot_mapped),
            pct_mapped = pct_mapped
            )
        )
        stats_file.write("Unmapped reads: {tot_unmapped} ({pct_unmapped}% of all reads)\n".format(
            tot_unmapped = "{:,}".format(tot_unmapped), pct_unmapped = pct_unmapped
        ))
        stats_file.write("Recovered contacts: {tot_pairs} ({pct_pairs}% of all read pairs)\n".format(
            tot_pairs = "{:,}".format(tot_pairs), pct_pairs = pct_pairs
        ))
        stats_file.write("Final contacts: {kept_pairs} ({pct_kept}% of all contacts)\n".format(
            kept_pairs = "{:,}".format(kept_pairs), pct_kept = pct_kept
        ))
        stats_file.write("  Filtered out: {filtered_pairs} ({pct_filtered}% of all contacts)\n".format(
            filtered_pairs = "{:,}".format(filtered_pairs), pct_filtered = pct_filtered
        ))
        stats_file.write("    Loops: {loops_pairs} ({pct_loops_pairs}% of all contacts)\n".format(
            loops_pairs = "{:,}".format(loops_pairs), pct_loops_pairs = pct_loops_pairs
        ))
        stats_file.write("    Uncuts: {uncuts_pairs} ({pct_uncuts_pairs}% of all contacts)\n".format(
            uncuts_pairs = "{:,}".format(uncuts_pairs), pct_uncuts_pairs = pct_uncuts_pairs
        ))
        stats_file.write("    Weirds: {abnormal_pairs} ({pct_abnormal_pairs}% of all contacts)\n".format(
            abnormal_pairs = "{:,}".format(abnormal_pairs), pct_abnormal_pairs = pct_abnormal_pairs
        ))
        stats_file.write("  PCR duplicates: {PCR_pairs} ({pct_PCR}% of all contacts)\n".format(
            PCR_pairs = "{:,}".format(PCR_pairs), pct_PCR = pct_PCR
        ))

    # Write the machine-readable JSON stats file with the same numbers.
    stats = {
        'Total read pairs': fastq_pairs,
        'Mapped reads': tot_mapped,
        'Unmapped reads': tot_unmapped,
        'Recovered contacts': tot_pairs,
        'Final contacts': kept_pairs,
        'Filtered out': filtered_pairs,
        'Loops': loops_pairs,
        'Uncuts': uncuts_pairs,
        'Weirds': abnormal_pairs,
        'PCR duplicates': PCR_pairs
    }
    stats_json_path = join(out_dir, prefix + ".stats.json")
    with open(stats_json_path, 'w') as json_file:
        json.dump(stats, json_file, indent=4)

2 changes: 1 addition & 1 deletion hicstuff/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '3.2.1'
__version__ = '3.2.2'
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

MAJOR = 3
MINOR = 2
MAINTENANCE = 1
MAINTENANCE = 2
VERSION = "{}.{}.{}".format(MAJOR, MINOR, MAINTENANCE)

LICENSE = "BSD-3-Clause"
Expand Down
Loading