Skip to content

Commit

Permalink
Merge branch 'master' into development
Browse files Browse the repository at this point in the history
  • Loading branch information
hansenp committed Jan 11, 2016
2 parents d2cd5b5 + 8de2025 commit d3dcb34
Show file tree
Hide file tree
Showing 2,017 changed files with 837,479 additions and 0 deletions.
Binary file added bin/flexcat
Binary file not shown.
Binary file added bin/nexcat
Binary file not shown.
10 changes: 10 additions & 0 deletions data/adapters.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>adapter1:3':
AGATCGGAAGAGCACACGTCTGGATCCACGACGCTCTTCC
>adapter2:3':
GATCGGAAGAGCACACGTCTGGATCCACGACGCTCTTCC
>adapter3:3':
ATCGGAAGAGCACACGTCTGGATCCACGACGCTCTTCC
>adapter4:3':
TCGGAAGAGCACACGTCTGGATCCACGACGCTCTTCC
>adapter5:3':
CGGAAGAGCACACGTCTGGATCCACGACGCTCTTCC
6 changes: 6 additions & 0 deletions data/adapters_best.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>adapter1.2:3':
AGATCGGAAGAGCACACGTCTGGATCCACGACGCTCTTCC
>adapter1.2:3':
AGATCGGAAGAGCACA
>adapter1.2:5':
TGTGCTCTTCCGATCT
2 changes: 2 additions & 0 deletions data/barcodes.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>matched_barcode
CTGA
65 changes: 65 additions & 0 deletions scripts/bam_indexer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/env python

import sys
import os.path
import subprocess
import argparse

#print 'Number of arguments:', len(sys.argv), 'arguments.'
#print 'Argument List:', str(sys.argv)

parser = argparse.ArgumentParser(description="Sort and Index BAM files")
parser.add_argument('--output', type=str)
parser.add_argument('input_file')

results, leftovers = parser.parse_known_args()

print results.input_file
#print results.output

if results.output is not None:
outFilenamePrefix, file_extension = os.path.splitext(results.output)
outFilenamePrefix = os.path.dirname(results.input_file)+ "/" +outFilenamePrefix;
else:
outFilenamePrefix, file_extension = os.path.splitext(results.input_file)
outFilenamePrefix += "_sorted"
#print "output file: " + outFilenamePrefix + ".bam"

inputFile = results.input_file

inFilenamePrefix, inFileExtension = os.path.splitext(results.input_file)
if inFileExtension == ".sam":
args = ("samtools", "view", "-Sb", results.input_file)
print "converting to BAM..."
#print args
f = open(inFilenamePrefix + ".bam", "w")
popen = subprocess.Popen(args, stdout=f)
popen.wait()
f.close()
if popen.returncode != 0:
print "error"
sys.exit()
print inFilenamePrefix + ".bam created"
inputFile = inFilenamePrefix + ".bam"

args = ("samtools", "sort", results.input_file, outFilenamePrefix)
print "sorting..."
#print args
popen = subprocess.Popen(args, stdout=subprocess.PIPE)
popen.wait()
output = popen.stdout.read()
print output
if popen.returncode != 0:
print "error"
sys.exit()
print outFilenamePrefix + ".bam created"
print "indexing..."
args = ("samtools", "index", outFilenamePrefix+".bam")
popen = subprocess.Popen(args, stdout=subprocess.PIPE)
popen.wait()
output = popen.stdout.read()
print output
if popen.returncode != 0:
print "error"
sys.exit()
print outFilenamePrefix+".bai created"
59 changes: 59 additions & 0 deletions scripts/graph_duplication_rate.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
args <- commandArgs(trailingOnly = TRUE)
print(args)

printFile <- function(filename, output_filename)
{
len <- 50;

duplication_rate = read.table(filename, header=T, sep="\t")

plot_colors = c("black","green","red")
line_types = c("solid","solid","solid");

#cairo_pdf(output_filename,width=9,height=7.5)
png(output_filename, width=800, height=600)

par(mar=c(5, 4, 4, 6) + 0.1)
max_y = max(duplication_rate)
plot(duplication_rate$rate[c(1:len)], duplication_rate$non_unique[c(1:len)]+duplication_rate$unique[c(1:len)], log="y",type="l", lty=line_types[1], col=plot_colors[1], axes=FALSE, ann=FALSE)
axis(1, at=2*(1:len/2), labels=2*(1:len/2))

aty <- axTicks(2,log=TRUE)
labels <- sapply(log(aty,10),function(i)
as.expression(bquote(10^.(i))))
axis(2, at=axTicks(2,log=TRUE), labels=format(aty, scientific=TRUE))

lines(duplication_rate$rate[c(1:len)],duplication_rate$unique[c(1:len)],type="l", lty=line_types[2], col=plot_colors[2])
title(xlab= "Duplication Rate")
title(ylab= "Number of Reads")
box()


pcr_artifacts_percentage <- duplication_rate$non_unique / (duplication_rate$unique + duplication_rate$non_unique)
pcr_artifacts_absolute <- duplication_rate$non_unique


max_y_difference = max(pcr_artifacts_percentage)
par(new=TRUE)
plot(duplication_rate$rate[c(1:len)],pcr_artifacts_percentage[c(1:len)], type="l", lty=line_types[3], col=plot_colors[3],ann=F, axes=F)
mtext("Percentage PCR artifacts",side=4,col="black",line=4)
usr<-par("usr")

axis(4, at=axTicks(4), labels=format(axTicks(4), scientific=FALSE),las=1)

title(main="Reads per Duplication Rate", col.main="black", font.main=4)

print(usr)

text((usr[1]+usr[2])/1.6,usr[4]*0.99, paste("mapped reads: ",as.character(sum(duplication_rate$unique)+sum(duplication_rate$non_unique)),"\n",
"unique reads: ",as.character(sum(duplication_rate$unique)),"\n",
"non-unique reads: ",as.character(sum(duplication_rate$non_unique)),"\n",
"PCR artifacts rate: ",as.character(signif(sum(duplication_rate$non_unique)/(sum(duplication_rate$non_unique)+sum(duplication_rate$unique))),3)), adj=c(1,1))

legend("topright", c("total duplicates","unique duplicates", "PCR artifacts rate"), cex=0.8, col=plot_colors, lty=line_types)
dev.off()

}

printFile(args[1], args[2])

228 changes: 228 additions & 0 deletions scripts/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
#!/usr/bin/env python
# Author: Benjamin Menkuec
# Copyright 2015 Benjamin Menkuec
# License: LGPL

import sys
import os.path
import subprocess
import argparse
import platform
import multiprocessing

# this function calls a subroutine that will call samtools to
# sort the bam file and then build an index of it
def indexBamFile(filename, script_path):
args = ("python", script_path + "/bam_indexer.py", filename)
print args
print "Creating indexed bam file..."
popen = subprocess.Popen(args, stdout=subprocess.PIPE)
popen.wait()
output = popen.stdout.read()
if results.verbose == True:
print output
if popen.returncode != 0:
print "error"
sys.exit()
return;

script_path = os.path.dirname(os.path.realpath(__file__))
dataDir = os.getcwd() + "/"

parser = argparse.ArgumentParser(description="Preprocess fastq files and do mapping")
parser.add_argument('--adapters', type=str, default = "/../data/adapters.fa")
parser.add_argument('--barcodes', type=str, default = "/../data/barcodes.fa")
parser.add_argument('--flexcat_er', type=str, default = "0.2")
parser.add_argument('--flexcat_ol', type=str, default = "4")
parser.add_argument('--flexcat_fm', type=str, default = "19")
parser.add_argument('--flexcat_ml', type=str, default = "19")
parser.add_argument('--flexcat_oh', type=str, default = "0")
parser.add_argument('--flexcat_times', type=str, default = "1")

parser.add_argument('--exo', action='store_true')
parser.add_argument('--clean', action='store_true')
parser.add_argument('--overwrite', action='store_true')
parser.add_argument('--verbose', action='store_true')
parser.add_argument('--bowtie_location', nargs='?', default = "")
parser.add_argument('--num_threads', nargs='?', default =
str(multiprocessing.cpu_count()))
parser.add_argument('input_file')
parser.add_argument('--output_dir', type=str)
parser.add_argument('--filter_chromosomes', type=str, default="(.*)[H|U|M|_]+(.*)")
parser.add_argument('--random_split', action='store_true')
parser.add_argument('genome', type=str)

results, leftovers = parser.parse_known_args()

flexcatAdapterFilename = (os.path.dirname(os.path.realpath(__file__)) +
results.adapters)
flexcatBarcodeFilename = (os.path.dirname(os.path.realpath(__file__)) +
results.barcodes)

print "Reads: " + results.input_file
print "Genome: " + results.genome

genomeFilename = results.genome
bowtieLocation = results.bowtie_location
if len(bowtieLocation) > 0:
bowtieLocation = bowtieLocation + "/"

if(platform.system() == "Windows" and results.bowtie_location == ""):
print "Bowtie location is required under windows"
sys.exit()

inputFile = os.path.abspath(results.input_file)

temp, inFileExtension = inputFile.split(os.extsep, 1)
# The output file of flexcat can not be a zipped fastq,
# because bowtie can not handle it.
# Therefore, remove .gz ending if its a fastq.gz file
inFileExtension, temp = os.path.splitext(inFileExtension)
inFileExtension = "." + inFileExtension
inFilenamePrefixWithoutPath = os.path.basename(inputFile)
inFilenamePrefixWithoutPath, temp = inFilenamePrefixWithoutPath.split(os.extsep, 1)

# set the output filename of flexcat
if results.output_dir is not None:
outputDir = os.path.abspath(results.output_dir)
head, tail = os.path.split(results.output_dir)
if len(tail) > 0:
inFilenamePrefixWithoutPath = tail;
outputDir = outputDir + "/"
else:
outputDir = inFilenamePrefixWithoutPath
flexcatOutputFilename = (outputDir + "/" + inFilenamePrefixWithoutPath +
inFileExtension)

# if the exo option is set, there wont be any fixed barcode matching.
# therefore the output file of flexcat does not have a postfix
if results.exo:
bowtieInputFilename = (outputDir + "/" + inFilenamePrefixWithoutPath +
inFileExtension)
else:
bowtieInputFilename = (outputDir + "/" + inFilenamePrefixWithoutPath +
"_matched_barcode" + inFileExtension)
bowtieOutputFilename = outputDir + "/" + inFilenamePrefixWithoutPath + ".sam"

# platform independant way of calling flexcat
flexcat_path = script_path+"/../bin/flexcat"
if(platform.system() == "Windows"):
flexcat_path += ".exe"

if results.exo:
args = (flexcat_path, results.input_file, "-tt", "-t", "-ss", "-st", "-app",
"-tnum", results.num_threads, "-times", results.flexcat_times, "-er",
results.flexcat_er, "-ol", results.flexcat_ol, "-oh", results.flexcat_oh,
"-fm", results.flexcat_fm, "-ml", results.flexcat_ml, "-a",
flexcatAdapterFilename,"-o", flexcatOutputFilename)
else:
args = (flexcat_path, results.input_file, "-tl", "5", "-tt", "-t", "-ss", "-st",
"-app", "-tnum", results.num_threads, "-times", results.flexcat_times, "-er",
results.flexcat_er, "-ol", results.flexcat_ol, "-oh", results.flexcat_oh,
"-fm", results.flexcat_fm, "-ml", results.flexcat_ml, "-b",
flexcatBarcodeFilename, "-a", flexcatAdapterFilename,"-o", flexcatOutputFilename)
if not os.path.exists(outputDir):
os.makedirs(outputDir)
if (os.path.isfile(bowtieInputFilename) == False or results.overwrite == True):
print "Filtering pre-mapping barcodes and trimming adapters..."
if results.verbose == True:
popen = subprocess.Popen(args + tuple(leftovers))
else:
popen = subprocess.Popen(args + tuple(leftovers), stdout=subprocess.PIPE)
popen.wait()
if popen.returncode != 0:
print "error"
sys.exit()
if results.verbose == True:
print flexcatOutputFilename + " created"

head, tail = os.path.split(genomeFilename)
genomeIndex, file_extension = os.path.splitext(tail)

# check if bowtie index already exists
# if yes, skip building the index
genomeIndexFile = os.path.dirname(genomeFilename) + "/" + genomeIndex;
if (os.path.isfile(genomeIndexFile + ".1.ebwt") == False):
if(platform.system() == "Linux" or platform.system() == "Linux2"):
args = (bowtieLocation + "bowtie-build", "-o", "1", genomeFilename,
genomeIndexFile)
else:
args = ("python", bowtieLocation + "bowtie-build", "-o", "1",
genomeFilename, genomeIndexFile)
popen = subprocess.Popen(args, stdout=subprocess.PIPE)
popen.wait()
output = popen.stdout.read()
if results.verbose == True:
print output
if popen.returncode != 0:
print "error"
sys.exit()

# call bowtie if mapped file does not exist or overwrite is true
if (os.path.isfile(bowtieOutputFilename) == False or results.overwrite == True):
args = (bowtieLocation + "bowtie", "-S", "-p", results.num_threads,
"--chunkmbs", "512", "-k", "1", "-m", "1", "-v", "2", "--strata", "--best",
genomeIndexFile, bowtieInputFilename, bowtieOutputFilename)
if(platform.system() != "Linux" and platform.system() != "Linux2"):
args = ("python",) + args
print "Mapping reads..."
popen = subprocess.Popen(args, stdout=subprocess.PIPE)
popen.wait()
output = popen.stdout.read()
if results.verbose == True:
print output
if popen.returncode != 0:
print "error"
sys.exit()

# nexus-pre
nexusOutputFilename = (outputDir + "/" + inFilenamePrefixWithoutPath +
"_filtered.bam")
if results.random_split == True:
nexusOutputFilenameSplit1 = (outputDir + "/" + inFilenamePrefixWithoutPath +
"_filtered_split1.bam")
nexusOutputFilenameSplit2 = (outputDir + "/" + inFilenamePrefixWithoutPath +
"_filtered_split2.bam")

# platform independant way of calling nexus-pre
nexcat_path = script_path+"/../bin/nexcat"
if(platform.system() == "Windows"):
nexcat_path += ".exe"

# call nexcat if overwrite is true or output files dont exist
if (os.path.isfile(nexusOutputFilename) == False or
(results.random_split == True and
(os.path.isfile(nexusOutputFilenameSplit1) == False or
os.path.isfile(nexusOutputFilenameSplit2) == False)) or
results.overwrite == True):
args = (nexcat_path, bowtieOutputFilename, "-fc", results.filter_chromosomes)
if results.random_split == True:
args += ("-rs",)
print "Filtering post-mapping barcodes..."
popen = subprocess.Popen(args, stdout=subprocess.PIPE)
popen.wait()
output = popen.stdout.read()
if results.verbose == True:
print output
if popen.returncode != 0:
print "error"
sys.exit()

# special option for binding characteristic analysis
indexBamFile(nexusOutputFilename, script_path)
if results.random_split == True:
indexBamFile(nexusOutputFilenameSplit2, script_path)
indexBamFile(nexusOutputFilenameSplit1, script_path)

# cleanup
if results.clean:
print "deleting intermediate files..."
os.remove(bowtieOutputFilename)
os.remove(nexusOutputFilename)
if results.exo:
os.remove(flexcatOutputFilename)
else:
os.remove(outputDir + "/" + inFilenamePrefixWithoutPath +
"_matched_barcode" + inFileExtension)
os.remove(outputDir + "/" + inFilenamePrefixWithoutPath +
"_unidentified" + inFileExtension)
Loading

0 comments on commit d3dcb34

Please sign in to comment.