Skip to content

Commit

Permalink
reassembly read splitting fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
German Uritskiy committed Jun 11, 2018
1 parent 91a4e81 commit 669ddcf
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 31 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@ metaWRAP v=0.9 (In development)
delete bins that have a size 0 after dereplication
fixed shebang in summarize_checkm.py
allowed pplacer to use multiple threads, drastically speeding up the module
module no longer clears output DIR before starting. safer this way
Binning
added metabat1 binning option
fixed checkm tmp directory handling when using the --run-checkm option
fixed perl5 library handling when running maxbin2
added diagnostic info for perl libraries
added use of MaxBin v2.2.5 - this fixes some issues with newer gcc
checkm now uses pplacer multi-threading
Blobology
added figure output folders for easier viewing
Kraken
Expand All @@ -27,10 +29,12 @@ metaWRAP v=0.9 (In development)
Assembly
use metaSPAdes v3.12.0
Reassemble_bins
split up the read alignment and read splitting steps
read alignemnt and splitting is now done simultaneously, and no longer requires a lot of RAM
signifficantly sped up the read-splitting function and prevented freezes
added and fixed --parallel option bug
use SPAdes v3.12.0
checkm now uses multi-threading for pplacer
the user can now change the strict/permissive read mapping SNP cut off parameters


metaWRAP v=0.8 (March 2018)
Expand Down
6 changes: 3 additions & 3 deletions bin/metawrap-modules/bin_refinement.sh
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ else
p_threads=$threads
fi

comm "There is $mem RAM and $threads available, and each pplacer thread uses ~4GB, so I will allocate $p_threads threads for pplacer"
comm "There is $mem RAM and $threads threads available, and each pplacer thread uses ~4GB, so I will use $p_threads threads for pplacer"

########################################################################################################
######################## BEGIN REFINEMENT PIPELINE! ########################
Expand All @@ -145,8 +145,8 @@ if [ ! -d $out ]; then
mkdir $out
if [ ! -d $out ]; then error "cannot make $out"; fi
else
echo "Warning: $out already exists. Cleaning..."
rm -r ${out}/*
warning "Warning: $out already exists. It is HIGHLY recommended that you clear this directory to prevent any conflicts... Attempting to clean"
#rm -r ${out}/*
fi

n_binnings=0
Expand Down
18 changes: 15 additions & 3 deletions bin/metawrap-modules/binning.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ help_message () {
echo " -a STR metagenomic assembly file"
echo " -o STR output directory"
echo " -t INT number of threads (default=1)"
echo " -m INT amount of RAM available (default=4)"
echo ""
echo " --metabat2 bin contigs with metaBAT2"
echo " --metabat1 bin contigs with the original metaBAT"
Expand All @@ -39,8 +40,18 @@ warning () { ${SOFT}/print_comment.py "$1" "*"; }
announcement () { ${SOFT}/print_comment.py "$1" "#"; }
run_checkm () {
comm "Running CheckM on ${1} bins"

# determine --pplacer_threads count. It is either the max thread count or RAM/4, whichever is higher
ram_max=$(($mem / 4))
if (( $ram_max < $threads )); then
p_threads=$ram_max
else
p_threads=$threads
fi
comm "There is $mem RAM and $threads threads available, and each pplacer thread uses ~4GB, so I will use $p_threads threads for pplacer"

mkdir ${1}.tmp
checkm lineage_wf -x fa ${1} ${1}.checkm -t $threads --tmpdir ${1}.tmp
checkm lineage_wf -x fa ${1} ${1}.checkm -t $threads --tmpdir ${1}.tmp --pplacer_threads $p_threads
if [[ ! -s ${1}.checkm/storage/bin_stats_ext.tsv ]]; then error "Something went wrong with running CheckM. Exiting..."; fi
rm -r ${1}.tmp
${SOFT}/summarize_checkm.py ${1}.checkm/storage/bin_stats_ext.tsv ${1##*/}\
Expand All @@ -61,19 +72,20 @@ config_file=$(which config-metawrap)
source $config_file

# default params
threads=1; out=false; ASSEMBLY=false
threads=1; mem=4; out=false; ASSEMBLY=false
# long options defaults
metabat1=false; metabat2=false; maxbin2=false; concoct=false; checkm=false

# load in params
OPTS=`getopt -o ht:o:a: --long help,metabat1,metabat2,maxbin2,concoct,run-checkm -- "$@"`
OPTS=`getopt -o ht:m:o:a: --long help,metabat1,metabat2,maxbin2,concoct,run-checkm -- "$@"`
# make sure the params are entered correctly
if [ $? -ne 0 ]; then help_message; exit 1; fi

# loop through input params
while true; do
case "$1" in
-t) threads=$2; shift 2;;
-m) mem=$2; shift 2;;
-o) out=$2; shift 2;;
-a) ASSEMBLY=$2; shift 2;;
-h | --help) help_message; exit 1; shift 1;;
Expand Down
35 changes: 23 additions & 12 deletions bin/metawrap-modules/reassemble_bins.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,18 @@ help_message () {
echo ""
echo "Options:"
echo ""
echo " -b STR folder with metagenomic bins"
echo " -o STR output directory"
echo " -1 STR forward reads to use for reassembly"
echo " -2 STR reverse reads to use for reassembly"
echo ""
echo " -t INT number of threads (default=1)"
echo " -m INT memory in GB (default=4)"
echo " -b STR folder with metagenomic bins"
echo " -c INT minimum desired bin completion %"
echo " -x INT maximum desired bin contamination %"
echo ""
echo " --strict-cut-off maximum allowed SNPs for strict read mapping (default=2)"
echo " --permissive-cut-off maximum allowed SNPs for permissive read mapping (default=5)"
echo " --skip-checkm dont run CheckM to assess bins"
echo " --parallel run spades reassembly in parallel, but only using 1 thread per bin"
echo "";}
Expand Down Expand Up @@ -67,10 +70,11 @@ source $config_file
threads=1; mem=4; comp=70; cont=10;
bins=None; f_reads=None; r_reads=None; out=None
# long options defaults
strict_max=2; permissive_max=5
run_checkm=true
run_parallel=false
# load in params
OPTS=`getopt -o ht:m:o:x:c:b:1:2: --long help,parallel,skip-checkm -- "$@"`
OPTS=`getopt -o ht:m:o:x:c:b:1:2: --long help,parallel,skip-checkm,strict-cut-off,permissive-cut-off -- "$@"`
# make sure the params are entered correctly
if [ $? -ne 0 ]; then help_message; exit 1; fi

Expand All @@ -86,14 +90,15 @@ while true; do
-1) f_reads=$2; shift 2;;
-2) r_reads=$2; shift 2;;
-h | --help) help_message; exit 0; shift 1;;
--strict-cut-off) strict_max=$2; shift 2;;
--permissive-cut-off) permissive_max=$2; shift 2;;
--skip-checkm) run_checkm=false; shift 1;;
--parallel) run_parallel=true; shift 1;;
--) help_message; exit 1; shift; break ;;
*) break;;
esac
done


########################################################################################################
######################## MAKING SURE EVERYTHING IS SET UP ########################
########################################################################################################
Expand All @@ -117,7 +122,7 @@ comm "setting up output folder and copything over bins..."
if [ ! -d $out ]; then
mkdir $out;
else
echo "Warning: $out already exists."
warning "Warning: $out already exists!"
fi

if [ -d ${out}/original_bins ]; then rm -r ${out}/original_bins; fi
Expand All @@ -140,12 +145,10 @@ if [ -d ${out}/reads_for_reassembly ]; then rm -r ${out}/reads_for_reassembly; f
mkdir ${out}/reads_for_reassembly

comm "Aligning all reads back to entire assembly and splitting reads into individual fastq files based on their bin membership"
bwa mem -t $threads ${out}/binned_assembly/assembly.fa $f_reads $r_reads > ${out}/binned_assembly/alignment.sam

cat ${out}/binned_assembly/alignment.sam | ${SOFT}/filter_reads_for_bin_reassembly.py ${out}/original_bins ${out}/reads_for_reassembly

bwa mem -t $threads ${out}/binned_assembly/assembly.fa $f_reads $r_reads \
| ${SOFT}/filter_reads_for_bin_reassembly.py ${out}/original_bins ${out}/reads_for_reassembly $strict_max $permissive_max
if [[ $? -ne 0 ]]; then error "Something went wrong with pulling out reads for reassembly..."; fi
#rm ${out}/binned_assembly/alignment.sam


########################################################################################################
######################## REASSEMBLING BINS WITH SPADES ########################
Expand Down Expand Up @@ -224,7 +227,6 @@ if [[ $(ls ${out}/reassembled_bins/ | wc -l) -lt 1 ]]; then
error "None of the bins were successfully reassembled. ${out}/reassembled_bins/ is empty."
else
comm "Looks like the reassemblies went well. Now to see if they made the bins better or worse..."
mv ${out}/reassembly ${out}/reassembly_files
fi


Expand All @@ -235,6 +237,15 @@ if [ "$run_checkm" = true ]; then
########################################################################################################
announcement "RUN CHECKM ON REASSEMBLED BINS"

# determine --pplacer_threads count. It is either the max thread count or RAM/4, whichever is higher
ram_max=$(($mem / 4))
if (( $ram_max < $threads )); then
p_threads=$ram_max
else
p_threads=$threads
fi
comm "There is $mem RAM and $threads threads available, and each pplacer thread uses ~4GB, so I will use $p_threads threads for pplacer"

# copy over original bins
for base in $( ls ${out}/original_bins/ | grep "\.fa$" ); do
i=${out}/original_bins/$base
Expand All @@ -243,7 +254,7 @@ if [ "$run_checkm" = true ]; then

comm "Running CheckM on best bins (reassembled and original)"
mkdir ${out}/tmp
checkm lineage_wf -x fa ${out}/reassembled_bins ${out}/reassembled_bins.checkm -t $threads --tmpdir ${out}/tmp
checkm lineage_wf -x fa ${out}/reassembled_bins ${out}/reassembled_bins.checkm -t $threads --tmpdir ${out}/tmp --pplacer_threads $p_threads
if [[ ! -s ${out}/reassembled_bins.checkm/storage/bin_stats_ext.tsv ]]; then error "Something went wrong with running CheckM. Exiting..."; fi
${SOFT}/summarize_checkm.py ${out}/reassembled_bins.checkm/storage/bin_stats_ext.tsv | (read -r; printf "%s\n" "$REPLY"; sort) > ${out}/reassembled_bins.stats
if [[ $? -ne 0 ]]; then error "Cannot make checkm summary file. Exiting."; fi
Expand Down Expand Up @@ -285,7 +296,7 @@ if [ "$run_checkm" = true ]; then

comm "Re-running CheckM on the best reasembled bins."
mkdir ${out}/tmp
checkm lineage_wf -x fa ${out}/reassembled_bins ${out}/reassembled_bins.checkm -t $threads --tmpdir ${out}/tmp
checkm lineage_wf -x fa ${out}/reassembled_bins ${out}/reassembled_bins.checkm -t $threads --tmpdir ${out}/tmp --pplacer_threads $p_threads
if [[ ! -s ${out}/reassembled_bins.checkm/storage/bin_stats_ext.tsv ]]; then error "Something went wrong with running CheckM. Exiting..."; fi
rm -r ${out}/tmp
comm "Finalizing CheckM stats..."
Expand Down
Binary file added bin/metawrap-scripts/.print_comment.py.swp
Binary file not shown.
47 changes: 35 additions & 12 deletions bin/metawrap-scripts/filter_reads_for_bin_reassembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,15 @@
#usage:
# bwa mem -a assembly.fa reads_1.fastq reads_2.fastq | ./filter_reads_for_bin_reassembly.py original_bin_folder reads_1.fastq reads_2.fastq output_dir
import sys, os
cache = 100
strict_snp_cutoff = int(sys.argv[3])
permissive_snp_cutoff = int(sys.argv[4])

complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'a':'t', 't':'a', 'c':'g', 'g':'c', 'N':'N', 'n':'n'}
def rev_comp(seq):
rev_comp=""
for n in seq:
rev_comp+=complement[n]
return rev_comp[::-1]

# load bin contigs
print "loading contig to bin mappings..."
Expand All @@ -20,17 +28,17 @@
print "Parsing sam file and writing reads to appropriate files depending what bin they alligned to..."
files={}
opened_bins={}
read_type="F"
for line in sys.stdin:
if line[0]=="@": continue
if read_type=="F":
read_type="R"
cut = line.strip().split("\t")
binary_flag = bin(int(cut[1]))

if binary_flag[-7]=="1":
F_line=line
continue
else:
read_type="F"
elif binary_flag[-8]=="1":
R_line=line

# get fields for forward and reverse reads
F_cut = F_line.strip().split("\t")
R_cut = R_line.strip().split("\t")
Expand Down Expand Up @@ -73,16 +81,29 @@
if field.startswith("NM:i:"):
cumulative_mismatches += int(field.split(":")[-1])
break


# determine alignment type from bitwise FLAG
F_binary_flag = bin(int(F_cut[1]))
R_binary_flag = bin(int(R_cut[1]))


# if the reads are reversed, fix them
if F_binary_flag[-5]=='1':
F_cut[9] = rev_comp(F_cut[9])
F_cut[10] = F_cut[10][::-1]
if R_binary_flag[-5]=='1':
R_cut[9] = rev_comp(R_cut[9])
R_cut[10] = R_cut[10][::-1]

# strict assembly
if cumulative_mismatches<2:
if cumulative_mismatches<strict_snp_cutoff:
files[sys.argv[2]+"/"+bin_name+".strict_1.fastq"].write('@' + F_cut[0] + "/1" + "\n" + F_cut[9] + "\n+\n" + F_cut[10] + "\n")
files[sys.argv[2]+"/"+bin_name+".strict_2.fastq"].write('@' + R_cut[0] + "/1" + "\n" + R_cut[9] + "\n+\n" + R_cut[10] + "\n")
files[sys.argv[2]+"/"+bin_name+".strict_2.fastq"].write('@' + R_cut[0] + "/2" + "\n" + R_cut[9] + "\n+\n" + R_cut[10] + "\n")

# permissive assembly
if cumulative_mismatches<4:
if cumulative_mismatches<permissive_snp_cutoff:
files[sys.argv[2]+"/"+bin_name+".permissive_1.fastq"].write('@' + F_cut[0] + "/1" + "\n" + F_cut[9] + "\n+\n" + F_cut[10] + "\n")
files[sys.argv[2]+"/"+bin_name+".permissive_2.fastq"].write('@' + F_cut[0] + "/1" + "\n" + F_cut[9] + "\n+\n" + F_cut[10] + "\n")
files[sys.argv[2]+"/"+bin_name+".permissive_2.fastq"].write('@' + R_cut[0] + "/2" + "\n" + R_cut[9] + "\n+\n" + R_cut[10] + "\n")


print "closing files"
Expand All @@ -91,3 +112,5 @@


print "Finished splitting reads!"


0 comments on commit 669ddcf

Please sign in to comment.