-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Use watchPath to enable streaming basecalling Closes CW-1728 See merge request epi2melabs/workflows/wf-basecalling!14
- Loading branch information
Showing
10 changed files
with
435 additions
and
68 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,22 +19,34 @@ docker-run: | |
# which options to apply as part of the rules block below | ||
# NOTE There is a slightly cleaner way to define this matrix to include | ||
# the variables, but it is broken when using long strings! See CW-756 | ||
tags: | ||
- grid | ||
- shell | ||
parallel: | ||
matrix: | ||
- MATRIX_NAME: [ | ||
"dorado", | ||
] | ||
"dorado", "watch_path", "no_reference" | ||
] | ||
rules: | ||
- when: never | ||
# NOTE As we're overriding the rules block for the included docker-run | ||
# we must redefine this CI_COMMIT_BRANCH rule to prevent docker-run | ||
# being incorrectly scheduled for "detached merge request pipelines" etc. | ||
#- if: ($CI_COMMIT_BRANCH == null || $CI_COMMIT_BRANCH == "dev-template") | ||
# when: never | ||
#- if: $MATRIX_NAME == "dorado" | ||
# variables: | ||
# NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/demo_data.tar.gz && tar -xzvf demo_data.tar.gz && cat demo_data/VERSION && rm demo_data.tar.gz" | ||
# NF_WORKFLOW_OPTS: "--input demo_data/input --ref demo_data/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg '/home/epi2melabs/[email protected]'" | ||
# we must redefine this CI_COMMIT_BRANCH rule to prevent docker-run | ||
# being incorrectly scheduled for "detached merge request pipelines" etc. | ||
- if: ($CI_COMMIT_BRANCH == null || $CI_COMMIT_BRANCH == "dev-template") | ||
when: never | ||
- if: $MATRIX_NAME == "dorado" | ||
variables: | ||
NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/demo_data.tar.gz && tar -xzvf demo_data.tar.gz && cat demo_data/VERSION && rm demo_data.tar.gz" | ||
NF_WORKFLOW_OPTS: "--input demo_data/input --ref demo_data/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg [email protected] --basecaller_chunk_size 1" | ||
NF_IGNORE_PROCESSES: "stopCondition" | ||
- if: $MATRIX_NAME == "watch_path" | ||
variables: | ||
NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/demo_data.tar.gz && tar -xzvf demo_data.tar.gz && cat demo_data/VERSION && rm demo_data.tar.gz" | ||
NF_WORKFLOW_OPTS: "--input demo_data/input --ref demo_data/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta --basecaller_cfg [email protected] --watch_path --read_limit 3000 --basecaller_chunk_size 1" | ||
- if: $MATRIX_NAME == "no_reference" | ||
variables: | ||
NF_BEFORE_SCRIPT: "wget -qO demo_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-basecalling/demo_data.tar.gz && tar -xzvf demo_data.tar.gz && cat demo_data/VERSION && rm demo_data.tar.gz" | ||
NF_WORKFLOW_OPTS: "--input demo_data/input --basecaller_cfg [email protected] --basecaller_chunk_size 1" | ||
NF_IGNORE_PROCESSES: "cram_cache,stopCondition" | ||
|
||
aws-run: | ||
rules: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
#!/usr/bin/env python | ||
"""Combine two JSONS, sum values by matching json keys.""" | ||
|
||
import argparse | ||
import json | ||
import os | ||
|
||
|
||
def add_dicts(d1, d2):
    """Recursively merge two JSON-style dicts, summing numeric values.

    Keys present in only one dict are kept as-is; matching numeric
    values are summed, matching strings keep the value from ``d1``,
    and nested dicts are merged recursively.
    """
    def sum_a(v1, v2):
        # Key missing from d2: keep d1's value unchanged.
        if v2 is None:
            return v1
        try:
            combined = v1 + v2
            # BUG FIX: the original checked isinstance(..., int) only, so
            # float values fell through and were silently replaced by None.
            if isinstance(combined, (int, float)):
                return combined
            elif isinstance(combined, str):
                # Strings are not summed; the value from d1 wins.
                return v1
        except TypeError:
            # v1 + v2 undefined (e.g. two dicts): merge recursively.
            return add_dicts(v1, v2)
    result = d2.copy()
    result.update({k: sum_a(v, d2.get(k)) for k, v in d1.items()})
    return result
|
||
|
||
def main(args):
    """Run the entry point."""
    # An empty state file means no counts have been accumulated yet.
    state = {}
    if os.stat(args.state).st_size != 0:
        with open(args.state) as handle:
            state = json.load(handle)
    with open(args.new_file) as handle:
        incoming = json.load(handle)
    # Merge the new counts into the running state and persist the result.
    with open(args.output, "w") as handle:
        json.dump(add_dicts(state, incoming), handle)
|
||
|
||
def argparser():
    """Create argument parser."""
    parser = argparse.ArgumentParser(
        "add_jsons",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        add_help=False)
    # Three positional args: incoming JSON, accumulated state JSON,
    # and the path the combined output is written to.
    for positional in ("new_file", "state", "output"):
        parser.add_argument(positional)
    return parser
|
||
|
||
if __name__ == "__main__":
    # Only execute when run as a script, not when imported.
    main(argparser().parse_args())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
#!/usr/bin/env python | ||
"""Histogram-json.""" | ||
|
||
import argparse | ||
import json | ||
import sys | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
|
||
def histogram_counts(data, dmin=0, bin_width=100):
    """Return histogram bin edges and counts as plain lists."""
    # Extend the upper edge past max(data) so the largest value is binned.
    edges = np.arange(dmin, max(data) + bin_width, bin_width)
    counts, _ = np.histogram(data, bins=edges)
    # Note that there can be small differences with/without batch_size=1.
    # https://numpy.org/doc/stable/reference/generated/numpy.histogram.html
    # numpy bins are half-open [a, b) except the final bin, which includes
    # both edges; with batch_size=1 a value sits in that closed last bin,
    # while with more sequences an edge value can land in the next interval.
    return edges.tolist(), counts.tolist()
|
||
|
||
def get_stats(seq_summary):
    """Build the stats JSON: read count plus length/quality histograms."""
    n_reads = len(seq_summary)
    stats_json = {"total_reads": n_reads}
    if n_reads == 0:
        # Nothing to histogram; emit empty maps so consumers see the keys.
        sys.stderr.write("WARNING: summary file was empty.\n")
        stats_json["len"] = {}
        stats_json["qual"] = {}
        return stats_json
    # Read-length histogram in 50 bp bins.
    len_bins, len_counts = histogram_counts(
        seq_summary['read_length'], dmin=0, bin_width=50)
    stats_json["len"] = dict(zip(len_bins, len_counts))
    # Mean-quality histogram in 0.2 score bins.
    qual_bins, qual_counts = histogram_counts(
        seq_summary['mean_quality'], dmin=0, bin_width=0.2)
    stats_json["qual"] = dict(zip(qual_bins, qual_counts))
    return stats_json
|
||
|
||
def main(args):
    """Run the entry point."""
    summary = pd.read_csv(
        args.input, sep="\t",
        usecols=['read_length', 'mean_quality'],
        dtype={'read_length': int, 'mean_quality': float})
    # Key the stats by sample id so per-sample JSONs can later be merged.
    with open(args.output, 'w') as handle:
        json.dump({args.sample_id: get_stats(summary)}, handle)
|
||
|
||
def argparser():
    """Argument parser for entrypoint."""
    parser = argparse.ArgumentParser()
    # Required positional inputs, in call order.
    for name, desc in (
            ("input", "Read summary file."),
            ("output", "Output summary JSON.")):
        parser.add_argument(name, help=desc)
    parser.add_argument("--sample_id", help="Sample name.")
    return parser
|
||
|
||
if __name__ == "__main__":
    # Script entry point: parse CLI args, then run.
    cli_args = argparser().parse_args()
    main(cli_args)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
#!/usr/bin/env python | ||
"""Create tables for the report.""" | ||
from ezcharts.plots.distribution import histplot | ||
import pandas as pd | ||
|
||
# PLOTS | ||
|
||
# The SeqSummary from ezcharts.components.fastcat cannot be used. | ||
# It groups data into bins, but from the real time analysis output | ||
# the input data is already grouped into bins. | ||
# Use weights of histplot for y axis. | ||
|
||
|
||
def read_quality_plot(seq_summary, min_qual=4, max_qual=30, title='Read quality'):
    """Create read quality summary plot."""
    # The stats JSON already holds binned counts (quality -> count), so the
    # counts are passed as histogram weights instead of re-binning raw data.
    frame = pd.DataFrame.from_dict(seq_summary['qual'].items())
    frame.columns = ['mean_quality', 'counts']
    frame['mean_quality'] = frame['mean_quality'].astype('float')
    plot = histplot(
        data=frame['mean_quality'],
        bins=len(frame),
        weights=list(frame['counts']))
    plot.title = dict(text=title)
    plot.xAxis.name = 'Quality score'
    plot.xAxis.min = min_qual
    plot.xAxis.max = max_qual
    plot.yAxis.name = 'Number of reads'
    return plot
|
||
|
||
def read_length_plot(seq_summary, title='Read length'):
    """Create a read length plot."""
    frame = pd.DataFrame.from_dict(seq_summary['len'].items())
    frame.columns = ['read_length', 'counts']
    # Convert base pairs to kilobases for the x axis.
    frame['read_length'] = frame['read_length'].astype('uint64') / 1000
    plot = histplot(
        data=frame['read_length'],
        bins=len(frame),
        weights=list(frame['counts']))
    plot.title = dict(text=title)
    plot.xAxis.name = 'Read length / kb'
    plot.yAxis.name = 'Number of reads'
    return plot
Oops, something went wrong.