Merge branch 'master' into development

charite · Jan 11, 2016 · d3dcb34 · d3dcb34
2 parents d2cd5b5 + 8de2025
commit d3dcb34
Show file tree

Hide file tree

Showing 2,017 changed files with 837,479 additions and 0 deletions.
diff --git a/bin/flexcat b/bin/flexcat
diff --git a/bin/nexcat b/bin/nexcat
diff --git a/data/adapters.fa b/data/adapters.fa
@@ -0,0 +1,10 @@
+>adapter1:3':
+AGATCGGAAGAGCACACGTCTGGATCCACGACGCTCTTCC
+>adapter2:3':
+GATCGGAAGAGCACACGTCTGGATCCACGACGCTCTTCC
+>adapter3:3':
+ATCGGAAGAGCACACGTCTGGATCCACGACGCTCTTCC
+>adapter4:3':
+TCGGAAGAGCACACGTCTGGATCCACGACGCTCTTCC
+>adapter5:3':
+CGGAAGAGCACACGTCTGGATCCACGACGCTCTTCC
diff --git a/data/adapters_best.fa b/data/adapters_best.fa
@@ -0,0 +1,6 @@
+>adapter1.2:3':
+AGATCGGAAGAGCACACGTCTGGATCCACGACGCTCTTCC
+>adapter1.2:3':
+AGATCGGAAGAGCACA
+>adapter1.2:5':
+TGTGCTCTTCCGATCT
diff --git a/data/barcodes.fa b/data/barcodes.fa
@@ -0,0 +1,2 @@
+>matched_barcode
+CTGA
diff --git a/scripts/bam_indexer.py b/scripts/bam_indexer.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+
+import sys
+import os.path
+import subprocess
+import argparse
+
+#print 'Number of arguments:', len(sys.argv), 'arguments.'
+#print 'Argument List:', str(sys.argv)
+
+parser = argparse.ArgumentParser(description="Sort and Index BAM files")
+parser.add_argument('--output', type=str)
+parser.add_argument('input_file')
+
+results, leftovers = parser.parse_known_args()
+
+print results.input_file
+#print results.output
+
+if results.output is not None:
+ outFilenamePrefix, file_extension = os.path.splitext(results.output)
+ outFilenamePrefix = os.path.dirname(results.input_file)+ "/" +outFilenamePrefix;
+else:
+ outFilenamePrefix, file_extension = os.path.splitext(results.input_file)
+ outFilenamePrefix += "_sorted"
+#print "output file: " + outFilenamePrefix + ".bam"
+
+inputFile = results.input_file
+
+inFilenamePrefix, inFileExtension = os.path.splitext(results.input_file)
+if inFileExtension == ".sam":
+ args = ("samtools", "view", "-Sb", results.input_file)
+ print "converting to BAM..."
+ #print args
+ f = open(inFilenamePrefix + ".bam", "w")
+ popen = subprocess.Popen(args, stdout=f)
+ popen.wait()
+ f.close()
+ if popen.returncode != 0:
+  print "error"
+  sys.exit()
+ print inFilenamePrefix + ".bam created"
+ inputFile = inFilenamePrefix + ".bam"
+
+args = ("samtools", "sort", results.input_file, outFilenamePrefix)
+print "sorting..."
+#print args
+popen = subprocess.Popen(args, stdout=subprocess.PIPE)
+popen.wait()
+output = popen.stdout.read()
+print output
+if popen.returncode != 0:
+ print "error"
+ sys.exit()
+print outFilenamePrefix + ".bam created"
+print "indexing..."
+args = ("samtools", "index", outFilenamePrefix+".bam")
+popen = subprocess.Popen(args, stdout=subprocess.PIPE)
+popen.wait()
+output = popen.stdout.read()
+print output
+if popen.returncode != 0:
+ print "error"
+ sys.exit()
+print outFilenamePrefix+".bai created"
diff --git a/scripts/graph_duplication_rate.r b/scripts/graph_duplication_rate.r
@@ -0,0 +1,59 @@
+args <- commandArgs(trailingOnly = TRUE)
+print(args)
+
+printFile <- function(filename, output_filename)
+{
+  len <- 50;
+
+  duplication_rate = read.table(filename, header=T, sep="\t") 
+
+  plot_colors = c("black","green","red")
+  line_types = c("solid","solid","solid");
+
+  #cairo_pdf(output_filename,width=9,height=7.5)
+png(output_filename, width=800, height=600)
+
+  par(mar=c(5, 4, 4, 6) + 0.1)
+  max_y = max(duplication_rate)
+  plot(duplication_rate$rate[c(1:len)], duplication_rate$non_unique[c(1:len)]+duplication_rate$unique[c(1:len)], log="y",type="l", lty=line_types[1], col=plot_colors[1], axes=FALSE, ann=FALSE)
+  axis(1, at=2*(1:len/2), labels=2*(1:len/2))
+
+  aty <- axTicks(2,log=TRUE)
+  labels <- sapply(log(aty,10),function(i)
+    as.expression(bquote(10^.(i))))
+  axis(2, at=axTicks(2,log=TRUE), labels=format(aty, scientific=TRUE))
+
+  lines(duplication_rate$rate[c(1:len)],duplication_rate$unique[c(1:len)],type="l", lty=line_types[2], col=plot_colors[2])
+  title(xlab= "Duplication Rate")
+  title(ylab= "Number of Reads")
+  box()
+
+
+  pcr_artifacts_percentage <-  duplication_rate$non_unique / (duplication_rate$unique + duplication_rate$non_unique)
+  pcr_artifacts_absolute <-  duplication_rate$non_unique
+
+
+  max_y_difference = max(pcr_artifacts_percentage)
+  par(new=TRUE)
+  plot(duplication_rate$rate[c(1:len)],pcr_artifacts_percentage[c(1:len)], type="l", lty=line_types[3], col=plot_colors[3],ann=F, axes=F)
+  mtext("Percentage PCR artifacts",side=4,col="black",line=4) 
+  usr<-par("usr")
+
+  axis(4, at=axTicks(4), labels=format(axTicks(4), scientific=FALSE),las=1)
+
+  title(main="Reads per Duplication Rate", col.main="black", font.main=4)
+
+  print(usr)
+
+  text((usr[1]+usr[2])/1.6,usr[4]*0.99, paste("mapped reads: ",as.character(sum(duplication_rate$unique)+sum(duplication_rate$non_unique)),"\n",
+                     "unique reads: ",as.character(sum(duplication_rate$unique)),"\n",
+                     "non-unique reads: ",as.character(sum(duplication_rate$non_unique)),"\n",
+                     "PCR artifacts rate: ",as.character(signif(sum(duplication_rate$non_unique)/(sum(duplication_rate$non_unique)+sum(duplication_rate$unique))),3)), adj=c(1,1))
+
+  legend("topright", c("total duplicates","unique duplicates", "PCR artifacts rate"), cex=0.8, col=plot_colors, lty=line_types)
+  dev.off()
+
+}
+
+printFile(args[1], args[2])
+
diff --git a/scripts/preprocess.py b/scripts/preprocess.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python
+# Author: Benjamin Menkuec
+# Copyright 2015 Benjamin Menkuec
+# License: LGPL
+
+import sys
+import os.path
+import subprocess
+import argparse
+import platform
+import multiprocessing
+
+# this function calls a subroutine that will call samtools to 
+# sort the bam file and then build an index of it
+def indexBamFile(filename, script_path):
+	args = ("python", script_path + "/bam_indexer.py", filename)
+	print args
+	print "Creating indexed bam file..."
+	popen = subprocess.Popen(args, stdout=subprocess.PIPE)
+	popen.wait()
+	output = popen.stdout.read()
+	if results.verbose == True:
+		print output
+	if popen.returncode != 0:
+		print "error"
+		sys.exit()
+	return;
+
+script_path = os.path.dirname(os.path.realpath(__file__))
+dataDir = os.getcwd() + "/"
+
+parser = argparse.ArgumentParser(description="Preprocess fastq files and do mapping")
+parser.add_argument('--adapters', type=str, default = "/../data/adapters.fa")
+parser.add_argument('--barcodes', type=str, default = "/../data/barcodes.fa")
+parser.add_argument('--flexcat_er', type=str, default = "0.2")
+parser.add_argument('--flexcat_ol', type=str, default = "4")
+parser.add_argument('--flexcat_fm', type=str, default = "19")
+parser.add_argument('--flexcat_ml', type=str, default = "19")
+parser.add_argument('--flexcat_oh', type=str, default = "0")
+parser.add_argument('--flexcat_times', type=str, default = "1")
+
+parser.add_argument('--exo', action='store_true')
+parser.add_argument('--clean', action='store_true')
+parser.add_argument('--overwrite', action='store_true')
+parser.add_argument('--verbose', action='store_true')
+parser.add_argument('--bowtie_location', nargs='?', default = "")
+parser.add_argument('--num_threads', nargs='?', default = 
+	str(multiprocessing.cpu_count()))
+parser.add_argument('input_file')
+parser.add_argument('--output_dir', type=str)
+parser.add_argument('--filter_chromosomes', type=str, default="(.*)[H|U|M|_]+(.*)")
+parser.add_argument('--random_split', action='store_true')
+parser.add_argument('genome', type=str)
+
+results, leftovers = parser.parse_known_args()
+
+flexcatAdapterFilename = (os.path.dirname(os.path.realpath(__file__)) + 
+	results.adapters)
+flexcatBarcodeFilename = (os.path.dirname(os.path.realpath(__file__)) + 
+	results.barcodes)
+
+print "Reads: " + results.input_file
+print "Genome: " + results.genome
+
+genomeFilename = results.genome
+bowtieLocation = results.bowtie_location
+if len(bowtieLocation) > 0:
+    bowtieLocation = bowtieLocation + "/"
+
+if(platform.system() == "Windows" and results.bowtie_location == ""):
+ print "Bowtie location is required under windows"
+ sys.exit()
+
+inputFile = os.path.abspath(results.input_file)
+
+temp, inFileExtension = inputFile.split(os.extsep, 1)
+# The output file of flexcat can not be a zipped fastq,
+# because bowtie can not handle it.
+# Therefore, remove .gz ending if its a fastq.gz file
+inFileExtension, temp = os.path.splitext(inFileExtension) 
+inFileExtension = "." + inFileExtension
+inFilenamePrefixWithoutPath = os.path.basename(inputFile)
+inFilenamePrefixWithoutPath, temp = inFilenamePrefixWithoutPath.split(os.extsep, 1)
+
+# set the output filename of flexcat
+if results.output_dir is not None:
+    outputDir = os.path.abspath(results.output_dir) 
+    head, tail = os.path.split(results.output_dir)
+    if len(tail) > 0: 
+        inFilenamePrefixWithoutPath = tail;
+    outputDir = outputDir + "/"
+else:
+    outputDir = inFilenamePrefixWithoutPath
+flexcatOutputFilename = (outputDir + "/" + inFilenamePrefixWithoutPath + 
+	inFileExtension)
+
+# if the exo option is set, there wont be any fixed barcode matching.
+# therefore the output file of flexcat does not have a postfix
+if results.exo:
+ bowtieInputFilename = (outputDir + "/" + inFilenamePrefixWithoutPath + 
+	inFileExtension)
+else:
+ bowtieInputFilename = (outputDir + "/" + inFilenamePrefixWithoutPath + 
+	"_matched_barcode" + inFileExtension)
+bowtieOutputFilename = outputDir + "/" + inFilenamePrefixWithoutPath + ".sam"
+
+# platform independant way of calling flexcat
+flexcat_path = script_path+"/../bin/flexcat"
+if(platform.system() == "Windows"):
+	flexcat_path += ".exe"
+
+if results.exo:
+ args = (flexcat_path, results.input_file, "-tt", "-t", "-ss", "-st", "-app", 
+	"-tnum", results.num_threads, "-times", results.flexcat_times, "-er", 
+	results.flexcat_er, "-ol", results.flexcat_ol, "-oh", results.flexcat_oh, 
+	"-fm", results.flexcat_fm, "-ml", results.flexcat_ml, "-a", 
+	flexcatAdapterFilename,"-o", flexcatOutputFilename)
+else:
+ args = (flexcat_path, results.input_file, "-tl", "5", "-tt", "-t", "-ss", "-st", 
+	"-app", "-tnum", results.num_threads, "-times", results.flexcat_times, "-er", 
+	results.flexcat_er, "-ol", results.flexcat_ol, "-oh", results.flexcat_oh, 
+	"-fm", results.flexcat_fm, "-ml", results.flexcat_ml, "-b", 
+	flexcatBarcodeFilename, "-a", flexcatAdapterFilename,"-o", flexcatOutputFilename)
+if not os.path.exists(outputDir):
+ os.makedirs(outputDir)
+if (os.path.isfile(bowtieInputFilename) == False or results.overwrite == True):
+    print "Filtering pre-mapping barcodes and trimming adapters..."
+    if results.verbose == True:
+        popen = subprocess.Popen(args + tuple(leftovers))
+    else:
+        popen = subprocess.Popen(args + tuple(leftovers), stdout=subprocess.PIPE)
+    popen.wait()
+    if popen.returncode != 0:
+     print "error"
+     sys.exit()
+    if results.verbose == True:
+        print flexcatOutputFilename + " created"
+
+head, tail = os.path.split(genomeFilename)
+genomeIndex, file_extension = os.path.splitext(tail)
+
+# check if bowtie index already exists
+# if yes, skip building the index
+genomeIndexFile = os.path.dirname(genomeFilename) + "/" + genomeIndex;
+if (os.path.isfile(genomeIndexFile + ".1.ebwt") == False):
+ if(platform.system() == "Linux" or platform.system() == "Linux2"):
+  args = (bowtieLocation + "bowtie-build", "-o", "1", genomeFilename, 
+	genomeIndexFile)
+ else:
+  args = ("python",  bowtieLocation + "bowtie-build", "-o", "1", 
+	genomeFilename, genomeIndexFile)
+ popen = subprocess.Popen(args, stdout=subprocess.PIPE)
+ popen.wait()
+ output = popen.stdout.read()
+ if results.verbose == True:
+    print output
+ if popen.returncode != 0:
+  print "error"
+  sys.exit() 
+
+# call bowtie if mapped file does not exist or overwrite is true
+if (os.path.isfile(bowtieOutputFilename) == False or results.overwrite == True):
+    args = (bowtieLocation + "bowtie", "-S", "-p", results.num_threads, 
+		"--chunkmbs", "512", "-k", "1", "-m", "1", "-v", "2", "--strata", "--best", 
+		genomeIndexFile, bowtieInputFilename, bowtieOutputFilename)
+    if(platform.system() != "Linux" and platform.system() != "Linux2"):
+     args = ("python",) + args
+    print "Mapping reads..."
+    popen = subprocess.Popen(args, stdout=subprocess.PIPE)
+    popen.wait()
+    output = popen.stdout.read()
+    if results.verbose == True:
+        print output
+    if popen.returncode != 0:
+     print "error"
+     sys.exit()
+
+# nexus-pre
+nexusOutputFilename = (outputDir + "/" + inFilenamePrefixWithoutPath + 
+	"_filtered.bam")
+if results.random_split == True:
+	nexusOutputFilenameSplit1 = (outputDir + "/" + inFilenamePrefixWithoutPath + 
+	"_filtered_split1.bam")
+	nexusOutputFilenameSplit2 = (outputDir + "/" + inFilenamePrefixWithoutPath + 
+	"_filtered_split2.bam")
+
+# platform independant way of calling nexus-pre
+nexcat_path = script_path+"/../bin/nexcat"
+if(platform.system() == "Windows"):
+	nexcat_path += ".exe"
+
+# call nexcat if overwrite is true or output files dont exist	
+if (os.path.isfile(nexusOutputFilename) == False or 
+	(results.random_split == True and 
+		(os.path.isfile(nexusOutputFilenameSplit1) == False or 
+			os.path.isfile(nexusOutputFilenameSplit2) == False)) or
+		results.overwrite == True):
+    args = (nexcat_path, bowtieOutputFilename,  "-fc", results.filter_chromosomes)
+    if results.random_split == True:
+		args += ("-rs",)
+    print "Filtering post-mapping barcodes..."
+    popen = subprocess.Popen(args, stdout=subprocess.PIPE)
+    popen.wait()
+    output = popen.stdout.read()
+    if results.verbose == True:
+        print output
+    if popen.returncode != 0:
+        print "error"
+        sys.exit()
+
+# special option for binding characteristic analysis
+indexBamFile(nexusOutputFilename, script_path)			
+if results.random_split == True:
+	indexBamFile(nexusOutputFilenameSplit2, script_path)		
+	indexBamFile(nexusOutputFilenameSplit1, script_path)		
+
+# cleanup
+if results.clean:
+    print "deleting intermediate files..."
+    os.remove(bowtieOutputFilename)
+    os.remove(nexusOutputFilename)
+    if results.exo:
+        os.remove(flexcatOutputFilename)
+    else:
+        os.remove(outputDir + "/" + inFilenamePrefixWithoutPath + 
+			"_matched_barcode" + inFileExtension)
+        os.remove(outputDir + "/" + inFilenamePrefixWithoutPath + 
+			"_unidentified" + inFileExtension)