From 3b7aac61a039644df83677771ef678ad812215b6 Mon Sep 17 00:00:00 2001 From: zj-zhang Date: Tue, 26 Feb 2019 23:54:38 -0800 Subject: [PATCH 1/9] test new preprocess --- CLAM/preprocessor.py | 44 ++++++++++++++++++++++---------------------- setup.py | 17 ++++++++++------- 2 files changed, 32 insertions(+), 29 deletions(-) diff --git a/CLAM/preprocessor.py b/CLAM/preprocessor.py index 1f94f2a..420eb4c 100755 --- a/CLAM/preprocessor.py +++ b/CLAM/preprocessor.py @@ -26,7 +26,7 @@ import pysam import numpy as np from collections import defaultdict -from tqdm import tqdm +#from tqdm import tqdm import logging import datetime import bisect @@ -99,10 +99,6 @@ def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_meth mbam=pysam.Samfile(mbam_fn, 'wb', template=in_bam) mread_set = set() - # do not omit sequences if to filter max_tags - if max_tags>0: - omit_detail=False - # splitting unique and multi- reads # and add the read taggers we need if not \ @@ -121,24 +117,28 @@ def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_meth if read_tag==-1: continue read.tags += [('RT', read_tag)] ## add the tag - - ## omit the details in read sequence and quality - ## recommended for larger bam because this - ## can save some memory/storage for large bams - if omit_detail: - read.query_sequence = '*' - read.query_qualities = '0' + tagged_read = pysam.AlignedSegment() + tagged_read.query_name = read.query_name + tagged_read.query_sequence = 'N' + tagged_read.flag = read.flag + tagged_read.reference_id = read.reference_id + tagged_read.reference_start = read_tag - 1 # 0-based leftmost coordinate + tagged_read.mapping_quality = read.mapping_quality + tagged_read.cigar = ((0,1),) + tagged_read.template_length = read.template_length + tagged_read.query_qualities = pysam.qualitystring_to_array("<") + tagged_read.tags = read.tags if read.is_secondary or (read.has_tag('NH') and read.opt("NH")>1): - try: - if read.opt("NH") < max_hits: - mbam.write(read) - mread_set.add(read.qname) - except KeyError: - #print read - raise Exception('%s: missing NH tag when is_secondary=%s'%(read.qname,read.is_secondary)) + #try: + if read.opt("NH") < max_hits: + mbam.write(tagged_read) + mread_set.add(read.qname) + #except KeyError: + # #print read + # raise Exception('%s: missing NH tag when is_secondary=%s'%(read.qname,read.is_secondary)) else: - ubam.write(read) + ubam.write(tagged_read) unique_counter += 1 ubam.close() @@ -146,9 +146,9 @@ def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_meth # sorting pysam.sort('-m', '4G', '-@', '3', '-T', os.path.dirname(sorted_ubam_fn), '-o', sorted_ubam_fn, ubam_fn) - os.remove(ubam_fn) + #os.remove(ubam_fn) pysam.sort('-m', '4G', '-@', '3', '-T', os.path.dirname(sorted_mbam_fn), '-o', sorted_mbam_fn, mbam_fn) - os.remove(mbam_fn) + #os.remove(mbam_fn) pysam.index(sorted_ubam_fn) pysam.index(sorted_mbam_fn) diff --git a/setup.py b/setup.py index 0040a0e..c73348e 100755 --- a/setup.py +++ b/setup.py @@ -5,13 +5,16 @@ def main(): setup(name='CLAM', - version='1.1.3', - description='CLIP-seq Analysis of Multi-mapped reads', - author='Zijun Zhang', - author_email='zj.z@ucla.edu', - url='https://github.com/Xinglab/CLAM', - packages=['CLAM', 'CLAM.stats'], - scripts=['bin/CLAM'] + version='1.1.4', + description='CLIP-seq Analysis of Multi-mapped reads', + author='Zijun Zhang', + author_email='zj.z@ucla.edu', + url='https://github.com/Xinglab/CLAM', + packages=['CLAM', 'CLAM.stats'], + scripts=['bin/CLAM'], + install_requires=[ + 
'pysam>0.12,<0.2', + 'numpy'] ) return From 1580b48532a6deaeff6ea9fb8120479eeeae45c9 Mon Sep 17 00:00:00 2001 From: zj-zhang Date: Wed, 27 Feb 2019 01:31:39 -0800 Subject: [PATCH 2/9] add strandness (un-tested) --- CLAM/preprocessor.py | 10 ++++++++-- CLAM/realigner.py | 14 +++++++++----- bin/CLAM | 15 ++++++++++++--- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/CLAM/preprocessor.py b/CLAM/preprocessor.py index 420eb4c..f0b98e4 100755 --- a/CLAM/preprocessor.py +++ b/CLAM/preprocessor.py @@ -71,7 +71,7 @@ def read_tagger_collection(alignment, method='median', **kwargs): -def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_method, omit_detail=False): +def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_method, strandness): """Pre-processing function for cleaning up the input bam file. Args: Returns: @@ -128,6 +128,10 @@ def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_meth tagged_read.template_length = read.template_length tagged_read.query_qualities = pysam.qualitystring_to_array("<") tagged_read.tags = read.tags + + # add strandness check + if strandness != "none": + tagged_read.is_reverse = read.is_reverse ^ strandness!="same" if read.is_secondary or (read.has_tag('NH') and read.opt("NH")>1): #try: @@ -271,13 +275,15 @@ def parser(args): max_hits = args.max_hits ## Note: if specified max_tags, need pre-sorted bam max_tags = args.max_tags + strandness = args.strandness #logger = logging.getLogger('CLAM.Preprocessor') logger.info('start') logger.info('run info: %s'%(' '.join(sys.argv))) filter_bam_multihits(in_bam, max_hits=max_hits, max_tags=max_tags, out_dir=out_dir, - read_tagger_method=tag_method) + read_tagger_method=tag_method, + strandness=strandness) logger.info('end') except KeyboardInterrupt(): diff --git a/CLAM/realigner.py b/CLAM/realigner.py index 3f93c36..456b2c1 100755 --- a/CLAM/realigner.py +++ b/CLAM/realigner.py @@ -382,8 +382,9 @@ def get_genomic_clusters(mbam, winsize=50, unstranded=False): def realigner(in_bam, out_dir, max_hits=100, max_tags=-1, read_tagger_method='median', - winsize=50, unstranded=False, retag=False): + winsize=50, unstranded=False, retag=False, strandness="same"): """The main entry for CLAM-realigner. + Args: in_bam (str): filepath for input bam out_dir (str): filepath for CLAM output folder @@ -394,6 +395,9 @@ def realigner(in_bam, out_dir, max_hits=100, max_tags=-1, read_tagger_method='me winsize (int): window size unstranded (bool): ignore alignment strand info if turned on retag (bool): force to call `preprocessor` to process `in_bam` if turned on + strandness (str): specifies if the expected read alignment strand is `same` with + transcript strand, or `opposite`, or `none` i.e. 
unstranded
+
 	Returns:
 		None
 	"""
@@ -410,7 +414,7 @@ def realigner(in_bam, out_dir, max_hits=100, max_tags=-1, read_tagger_method='me
 		os.path.isfile(os.path.join(out_dir,'multi.sorted.bam')) \
 		) :
 		filter_bam_multihits(in_bam, max_tags=max_tags, max_hits=max_hits, out_dir=out_dir, read_tagger_method=read_tagger_method,
-			omit_detail=False)
+			strandness=strandness)
 	else:
 		logger.info("found existing bams; skipped tagging.")
 
@@ -503,14 +507,14 @@ def parser(args):
 		max_tags = args.max_tags
 		retag = args.retag
 		winsize = args.winsize
-		unstranded = args.unstranded
+		strandness = args.strandness
+		unstranded = strandness == "none"
-
-		#logger = logging.getLogger('CLAM.Realigner')
 		logger.info('start')
 		logger.info('run info: %s'%(' '.join(sys.argv)))
 		realigner(in_bam, out_dir, max_hits=max_hits, max_tags=max_tags, read_tagger_method=tag_method,
-			winsize=winsize, unstranded=unstranded, retag=retag)
+			winsize=winsize, unstranded=unstranded, retag=retag, strandness=strandness)
 		logger.info('end')
 
 	except KeyboardInterrupt():
diff --git a/bin/CLAM b/bin/CLAM
index a7d0f7a..07254f2 100755
--- a/bin/CLAM
+++ b/bin/CLAM
@@ -148,7 +148,12 @@ def add_preprocessor_parser( subparsers ):
 
 	ag_prep.add_argument("--max-tags", dest="max_tags", type=int, default=-1,
 		help="The maximum identical tags at given location; default: -1, no filter")
-	
+
+	# strandness
+	ag_prep.add_argument("--strandness", dest="strandness", type=str,
+		default="same", choices=['same', 'opposite', 'none'],
+		help="The expected read strandness with transcription direction: same, opposite, or none (i.e. unstranded); default: same")
+
 	return
 
 
@@ -180,8 +185,12 @@ def add_realigner_parser( subparsers ):
 	ag_realigner.add_argument("--winsize", dest="winsize", type=int, default=50,
 		help="Local window size for em computations; default: 50")
 
-	ag_realigner.add_argument("--unstranded", dest="unstranded", default=False, action="store_true",
-		help="Unstranded alignments if turned on")
+	#ag_realigner.add_argument("--unstranded", dest="unstranded", default=False, action="store_true",
+	#	help="Unstranded alignments if turned on")
+
+	ag_realigner.add_argument("--strandness", dest="strandness", type=str,
+		default="same", choices=['same', 'opposite', 'none'],
+		help="The expected read strandness with transcription direction: same, opposite, or none (i.e.
unstranded); default: same") return From 7253af4dec5d1e824e4377e5aad584789705326f Mon Sep 17 00:00:00 2001 From: zj-zhang Date: Wed, 27 Feb 2019 01:37:31 -0800 Subject: [PATCH 3/9] fix bit op --- CLAM/preprocessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CLAM/preprocessor.py b/CLAM/preprocessor.py index f0b98e4..077cf57 100755 --- a/CLAM/preprocessor.py +++ b/CLAM/preprocessor.py @@ -131,7 +131,7 @@ def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_meth # add strandness check if strandness != "none": - tagged_read.is_reverse = read.is_reverse ^ strandness!="same" + tagged_read.is_reverse = (read.is_reverse) ^ (strandness!="same") if read.is_secondary or (read.has_tag('NH') and read.opt("NH")>1): #try: From b1fa120d685f3e99641e4afa4682f7477df47129 Mon Sep 17 00:00:00 2001 From: zj-zhang Date: Wed, 27 Feb 2019 01:40:29 -0800 Subject: [PATCH 4/9] remove unsorted files --- CLAM/preprocessor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CLAM/preprocessor.py b/CLAM/preprocessor.py index 077cf57..7e40638 100755 --- a/CLAM/preprocessor.py +++ b/CLAM/preprocessor.py @@ -150,9 +150,9 @@ def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_meth # sorting pysam.sort('-m', '4G', '-@', '3', '-T', os.path.dirname(sorted_ubam_fn), '-o', sorted_ubam_fn, ubam_fn) - #os.remove(ubam_fn) + os.remove(ubam_fn) pysam.sort('-m', '4G', '-@', '3', '-T', os.path.dirname(sorted_mbam_fn), '-o', sorted_mbam_fn, mbam_fn) - #os.remove(mbam_fn) + os.remove(mbam_fn) pysam.index(sorted_ubam_fn) pysam.index(sorted_mbam_fn) From e87c52af388b922d680959c87f0af2fa6e02419d Mon Sep 17 00:00:00 2001 From: zj-zhang Date: Sat, 1 Jun 2019 18:49:00 -0400 Subject: [PATCH 5/9] add norm-lib --- CLAM/config.py | 8 +++ CLAM/peakcaller.py | 114 +++++++++++++++++++++++++++------ CLAM/permutation_peakcaller.py | 14 ++-- CLAM/preprocessor.py | 7 +- CLAM/realigner.py | 7 +- bin/CLAM | 10 +-- check/compare_realign.py | 46 +++++++++++++ requirements.txt | 4 +- setup.py | 4 +- 9 files changed, 173 insertions(+), 41 deletions(-) create mode 100644 CLAM/config.py create mode 100644 check/compare_realign.py diff --git a/CLAM/config.py b/CLAM/config.py new file mode 100644 index 0000000..dcc32a5 --- /dev/null +++ b/CLAM/config.py @@ -0,0 +1,8 @@ +# -*- coding: UTF-8 -*- + +""" General Version and other info +""" + +__version__ = '1.2.0-beta' +__author__ = 'Zijun Zhang' +__email__ = 'zj.z@ucla.edu' \ No newline at end of file diff --git a/CLAM/peakcaller.py b/CLAM/peakcaller.py index ff5e009..8c0680c 100755 --- a/CLAM/peakcaller.py +++ b/CLAM/peakcaller.py @@ -25,9 +25,8 @@ Tested under python 2.7.3 """ -__author__ = 'Zijun Zhang' -__version__ = '1.1.3' -__email__ = 'zj.z@ucla.edu' +from . 
import config
+__version__ = config.__version__
 
 import os
 import sys
@@ -128,7 +127,8 @@ def bin_interval_counts(interval, binsize=50):
 	return bins
 
 
-def test_bin_negbinom(intv_bin_ip, intv_bin_con, with_control=True, correction_method='fdr_bh'):
+def test_bin_negbinom(intv_bin_ip, intv_bin_con, with_control=True, correction_method='fdr_bh',
+		norm_lib=False, tot_count_dict=None):
 	"""DOCSTRING
 	Args
 	Returns
@@ -184,20 +184,33 @@ def _neg_loglik_constrain(par, data):
 	binsignal = np.empty(intv_counter, dtype='S50')
 	alpha_ip_vec = np.empty(intv_bin_ip.shape[0])
 	alpha_con_vec = np.empty(intv_bin_con.shape[0])
-	ip_sum = np.apply_along_axis(np.sum, 1, intv_bin_ip)
-	con_sum = np.apply_along_axis(np.sum, 1, intv_bin_con)
-	if any(ip_sum==0) or any(con_sum==0):
-		return None, None, None
+	if norm_lib and tot_count_dict is not None:
+		try:
+			ip_sum = np.array(tot_count_dict['ubam.ip']) + np.array(tot_count_dict['mbam.ip'])
+		except KeyError:
+			ip_sum = np.array(tot_count_dict['ubam.ip'])
+		try:
+			con_sum = np.array(tot_count_dict['ubam.con']) + np.array(tot_count_dict['mbam.con'])
+		except KeyError:
+			con_sum = np.array(tot_count_dict['ubam.con'])
+	else:
+		ip_sum = np.apply_along_axis(np.sum, 1, intv_bin_ip)
+		con_sum = np.apply_along_axis(np.sum, 1, intv_bin_con)
+	
+	## TODO: this check avoided a ZeroDivisionError when the IP or control sum is zero;
+	## commented out because a pseudo-count is now added. ZZJ 2019.5.31
+	#if any(ip_sum==0) or any(con_sum==0):
+	#	return None, None, None
 	
 	# compute the dispersion parameters
 	min_alpha = 0.001 ## small alpha reduces to Poisson
 	max_alpha = 0.01 ## large alpha costs loss of power
 	if with_control:
 		for i in range(intv_bin_con.shape[0]):
-			height = ztnb_em.collapse_data(np.floor(intv_bin_con[i,]))
-			height[0] = 0
 			try:
+				height = ztnb_em.collapse_data(np.floor(intv_bin_con[i,]))
+				height[0] = 0
 				ll, mu, alpha = ztnb_em.EM_estim_params(height, max_iter=10, verbose=False)
 			except:
 				## means truncated loglik=0, not enough data
 				alpha = max_alpha
 			alpha = max_alpha if alpha>max_alpha else alpha
@@ -239,7 +252,7 @@ def _neg_loglik_constrain(par, data):
 		this_ip = intv_bin_ip[:, i]
 		others_ip = ip_sum - this_ip
 		this_con = intv_bin_con[:, i]
-		others_con = con_sum - this_con
+		others_con = con_sum - intv_bin_con[:, i]
 		if np.sum(this_ip) == 0:
 			binsignal[i], binscore[i] = 'nan', np.nan
 			continue
@@ -280,13 +293,15 @@ def _neg_loglik_constrain(par, data):
 			binsignal[i] = ','.join([str(int(x)) if x==int(x) else str(x) for x in this_ip])
 	
 	# correcting for multiple-testing
+	if np.sum(np.isnan(binscore))==len(binscore):
+		return None, None, None
 	adj = multipletests(binscore[~ np.isnan(binscore)], alpha=0.05, method=correction_method)
 	binscore_adj = np.copy(binscore)
 	binscore_adj[ ~ np.isnan(binscore) ] = adj[1]
 	return binsignal, binscore_adj, binscore
 
 
-def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, binsize=50, unstranded=False, qval_cutoff=0.05, fold_change=[2.], min_clip_cov=0, pooling=False):
+def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, binsize=50, unstranded=False, qval_cutoff=0.05, fold_change=[2.], min_clip_cov=0, pooling=False, norm_lib=False, tot_count_dict=None):
 	"""DOCSTRING
 	Args
 	Returns
@@ -309,7 +324,7 @@ def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, binsiz
 	ip_sum = np.apply_along_axis(np.sum, 1, interval_ip)
 	valid_ip_sample = np.where(ip_sum > min_clip_cov)[0]
 	if len(valid_ip_sample)==0:
-		#print "no reads"
+		#print("no reads")
 		return ''
 	
 	interval_ip = interval_ip[valid_ip_sample,:]
@@ -343,7 +358,9
@@ def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, binsiz intv_bin_con = bin_interval_counts(interval_con, binsize=binsize) # perform statistical test - signal_val, binscore_adj, binscore = test_bin_negbinom(intv_bin_ip, intv_bin_con, with_control=with_control) + signal_val, binscore_adj, binscore = test_bin_negbinom( + intv_bin_ip, intv_bin_con, with_control=with_control, + norm_lib=norm_lib, tot_count_dict=tot_count_dict) # DO NOT USE #if with_control or intv_bin_ip.shape[0]>1: @@ -384,7 +401,7 @@ def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, binsiz return BED -def _child_peak_caller( (ip_bam_list, con_bam_list, child_gene_list, gene_annot, unique_only, with_control, unstranded, binsize, qval_cutoff, fold_change, min_clip_cov, pooling) ): +def _child_peak_caller( (ip_bam_list, con_bam_list, child_gene_list, gene_annot, unique_only, with_control, unstranded, binsize, qval_cutoff, fold_change, min_clip_cov, pooling, norm_lib, tot_count_dict) ): """DOCSTRING Args Returns @@ -404,12 +421,66 @@ def _child_peak_caller( (ip_bam_list, con_bam_list, child_gene_list, gene_annot, BED += call_gene_peak(bam_dict, gene, unique_only=unique_only, with_control=with_control, unstranded=unstranded, binsize=binsize, qval_cutoff=qval_cutoff, fold_change=fold_change, - min_clip_cov=min_clip_cov, pooling=pooling) + min_clip_cov=min_clip_cov, pooling=pooling, + norm_lib=norm_lib, tot_count_dict=tot_count_dict) # close the handler _ = map(lambda x: x.close(), [bam for x in bam_dict.values() for bam in x]) return BED +def get_bam_total_reads(ip_bam_list, con_bam_list): + tot_count_dict = defaultdict(list) + try: + ubam_ip, mbam_ip = ip_bam_list + except: + ubam_ip = ip_bam_list[0] + mbam_ip = None + + for bam_fn in ubam_ip.split(','): + if not os.path.isfile(bam_fn): + raise Exception('%s not found'%bam_fn) + tot_count_dict['ubam.ip'].append( get_total_reads(bam_fn) ) + + if mbam_ip is not None: + for bam_fn in mbam_ip.split(','): + if not os.path.isfile(bam_fn): + raise Exception('%s not found'%bam_fn) + tot_count_dict['mbam.ip'].append( get_total_reads(bam_fn) ) + + if con_bam_list is None: + return tot_count_dict + + try: + ubam_con, mbam_con = con_bam_list + except: + ubam_con = con_bam_list[0] + mbam_con = None + + for bam_fn in ubam_con.split(','): + if not os.path.isfile(bam_fn): + raise Exception('%s not found'%bam_fn) + tot_count_dict['ubam.con'].append( get_total_reads(bam_fn) ) + + if mbam_con is not None: + for bam_fn in mbam_con.split(','): + if not os.path.isfile(bam_fn): + raise Exception('%s not found'%bam_fn) + tot_count_dict['mbam.con'].append( get_total_reads(bam_fn) ) + + return tot_count_dict + + +def get_total_reads(bam_filename): + idxstats = pysam.idxstats(bam_filename).split('\n') + tot = 0 + for l in idxstats: + if not l: + continue + ele = l.split('\t') + tot += int(ele[-2]) + return tot + + def make_bam_handler_dict(ip_bam_list, con_bam_list): """DOCSTRING Args @@ -452,6 +523,7 @@ def make_bam_handler_dict(ip_bam_list, con_bam_list): def peakcaller(ip_bam_list, gtf_fp, con_bam_list=None, nthread=8, out_dir='.', binsize=50, + norm_lib=False, unique_only=False, unstranded=True, qval_cutoff=0.05, fold_change=[2.], min_clip_cov=0, pooling=False): @@ -486,6 +558,9 @@ def peakcaller(ip_bam_list, gtf_fp, con_bam_list=None, nthread=8, # read in GTF gene_annot = read_gtf(gtf_fp) + + # read total library read number + tot_count_dict = get_bam_total_reads(ip_bam_list, con_bam_list) if nthread == 1: # iteratively call peaks in each gene @@ 
-495,7 +570,8 @@ def peakcaller(ip_bam_list, gtf_fp, con_bam_list=None, nthread=8, BED = call_gene_peak(bam_dict, gene, unique_only=unique_only, with_control=with_control, unstranded=unstranded, binsize=binsize, qval_cutoff=qval_cutoff, fold_change=fold_change, - min_clip_cov=min_clip_cov, pooling=pooling) + min_clip_cov=min_clip_cov, pooling=pooling, + norm_lib=norm_lib, tot_count_dict=tot_count_dict) ofile.write(BED) #print BED peak_counter += len(BED.split('\n'))-1 @@ -511,7 +587,7 @@ def peakcaller(ip_bam_list, gtf_fp, con_bam_list=None, nthread=8, pool=Pool(processes=nthread) BED_list = pool.map( _child_peak_caller, - [(ip_bam_list, con_bam_list, child_gene_list[i], gene_annot, unique_only, with_control, unstranded, binsize, qval_cutoff, fold_change, min_clip_cov, pooling) for i in range(nthread)] + [(ip_bam_list, con_bam_list, child_gene_list[i], gene_annot, unique_only, with_control, unstranded, binsize, qval_cutoff, fold_change, min_clip_cov, pooling, norm_lib, tot_count_dict) for i in range(nthread)] ) pool.terminate() pool.join() @@ -551,6 +627,7 @@ def parser(args): binsize = args.binsize qval_cutoff = args.qval_cutoff fold_change = args.fold_change + norm_lib = args.norm_lib min_clip_cov = args.min_clip_cov pooling = args.pooling logger = logging.getLogger('CLAM.Peakcaller') @@ -559,6 +636,7 @@ def parser(args): peakcaller(ip_bam_list, gtf_fp, con_bam_list, nthread, out_dir=out_dir, binsize=binsize, + norm_lib=norm_lib, unique_only=unique_only, unstranded=unstranded, qval_cutoff=qval_cutoff, fold_change=fold_change, diff --git a/CLAM/permutation_peakcaller.py b/CLAM/permutation_peakcaller.py index 94b8533..66575d7 100755 --- a/CLAM/permutation_peakcaller.py +++ b/CLAM/permutation_peakcaller.py @@ -17,9 +17,8 @@ Tested under python 2.7.3 """ -__author__ = 'Zijun Zhang' -__version__ = '1.1.3' -__email__ = 'zj.z@ucla.edu' +from . import config +__version__ = config.__version__ import os @@ -466,11 +465,10 @@ def read_tid_frag_from_bam(tid, bamfile, is_stranded, is_unique): continue except: continue - #try: - # read_length = read.qlen - #except: - # read_length = read.positions[-1] - read.positions[0] + 1 - read_length = read.positions[-1] - read.positions[0] + 1 + try: + read_length = read.opt('RL') + except: + read_length = read.positions[-1] - read.positions[0] + 1 if (not 'N' in read.cigarstring) and \ (read.pos-start>=0 and read_length<500): # to avoid junction reads diff --git a/CLAM/preprocessor.py b/CLAM/preprocessor.py index 7e40638..503c2c7 100755 --- a/CLAM/preprocessor.py +++ b/CLAM/preprocessor.py @@ -20,6 +20,8 @@ Tested under python 2.7 """ +from . import config +__version__ = config.__version__ import os import sys @@ -33,9 +35,6 @@ import argparse as ap import inspect -__author__ = 'Zijun Zhang' -__version__ = '1.1.3' -__email__ = 'zj.z@ucla.edu' logger = logging.getLogger('CLAM.Preprocessor') @@ -128,6 +127,8 @@ def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_meth tagged_read.template_length = read.template_length tagged_read.query_qualities = pysam.qualitystring_to_array("<") tagged_read.tags = read.tags + read_len = sum([i[1] for i in read.cigar if i[0]==0]) + tagged_read.tags += [('RL', read_len)] # add strandness check if strandness != "none": diff --git a/CLAM/realigner.py b/CLAM/realigner.py index 456b2c1..c665bed 100755 --- a/CLAM/realigner.py +++ b/CLAM/realigner.py @@ -18,16 +18,15 @@ Tested under python 2.7 """ -__author__ = 'Zijun Zhang' -__version__ = '1.1.3' -__email__ = 'zj.z@ucla.edu' +from . 
import config +__version__ = config.__version__ import os import sys import pysam import numpy as np from collections import defaultdict -from tqdm import tqdm +#from tqdm import tqdm import logging import datetime import bisect diff --git a/bin/CLAM b/bin/CLAM index 07254f2..790ddd6 100755 --- a/bin/CLAM +++ b/bin/CLAM @@ -30,10 +30,7 @@ Foundation, either version 3 of the License, or (at your option) any later version """ -__author__ = 'Zijun Zhang' -__version__ = '1.1.3' -__email__ = 'zj.z@ucla.edu' - +from CLAM import config import os import sys import logging @@ -109,7 +106,7 @@ def get_arg_parser(): epilog = "For command line options of each sub-command, type: %(prog)s COMMAND -h" argparser = ap.ArgumentParser(description=description, epilog=epilog) - argparser.add_argument("--version", action="version", version="%(prog)s "+__version__) + argparser.add_argument("--version", action="version", version="%(prog)s "+config.__version__) subparsers = argparser.add_subparsers( dest="subcommand" ) @@ -230,6 +227,9 @@ def add_peakcaller_parser( subparsers ): ag_peakcaller.add_argument("--fold-change", dest="fold_change", nargs='+', type=float, default=[2.], help="Threasholds for signal range (fold change w/ control; tag count w/o control); default: 2-inf") + + ag_peakcaller.add_argument("--normalize-lib", dest="norm_lib", action="store_true", default=False, + help="use total library size to normalize signal and control, instead of gene-by-gene basis; default: False") ag_peakcaller.add_argument("-b", "--binsize", dest="binsize", type=int, default=50, help="Bin size for calling peaks; default: 50") diff --git a/check/compare_realign.py b/check/compare_realign.py new file mode 100644 index 0000000..188194e --- /dev/null +++ b/check/compare_realign.py @@ -0,0 +1,46 @@ +"""Compare the realigner outputs for different +version +ZZJ +2019.5.27 +""" + +import sys +import pysam + + + +def read_as_score(bfile): + s1 = {} + with pysam.Samfile(bfile, 'rb') as bam: + i = 0 + for r1 in bam: + i += 1 + #if i>30000: + # break + s1[(r1.qname, r1.rname, r1.pos)] = r1.opt('AS') + return s1 + + +def plot_scatter(new, old): + import matplotlib.pyplot as plt + import seaborn as sns + ax = sns.jointplot(new, old, kind="reg") + ax.set_axis_labels('New', 'Old') + plt.savefig('realign_check.png') + +def compare(): + s1 = read_as_score('new_out/realigned.sorted.bam') + s2 = read_as_score('old_out/realigned.sorted.bam') + k = list([x for x in s1 if x in s2]) + old = [] + new = [] + print("ID\tnew\told\n") + for k_ in k: + print("%s\t%s\t%s\n"%(k_, s1[k_], s2[k_] ) ) + new.append(s1[k_]) + old.append(s2[k_]) + plot_scatter(new, old) + + +if __name__ == '__main__': + compare() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 1188a67..5594479 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ pysam>=0.9.0 multiprocessing statsmodels -tqdm \ No newline at end of file +tqdm +pybedtools +mpmath \ No newline at end of file diff --git a/setup.py b/setup.py index c73348e..d67593c 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ def main(): setup(name='CLAM', - version='1.1.4', + version='1.2.0-beta', description='CLIP-seq Analysis of Multi-mapped reads', author='Zijun Zhang', author_email='zj.z@ucla.edu', @@ -13,7 +13,7 @@ def main(): packages=['CLAM', 'CLAM.stats'], scripts=['bin/CLAM'], install_requires=[ - 'pysam>0.12,<0.2', + #'pysam>0.12,<0.2', 'numpy'] ) return From 15854316716d84fcab2d19761af1d75ba9d7e329 Mon Sep 17 00:00:00 2001 From: wkdeng Date: Mon, 17 Jun 2019 
17:31:09 -0400 Subject: [PATCH 6/9] Added peak annotator and genome data downloader --- CLAM/download_data.py | 71 +++++++ CLAM/peak_annotator.py | 145 +++++++++++++ bin/CLAM | 467 ++++++++++++++++++++++------------------- 3 files changed, 472 insertions(+), 211 deletions(-) create mode 100644 CLAM/download_data.py create mode 100644 CLAM/peak_annotator.py diff --git a/CLAM/download_data.py b/CLAM/download_data.py new file mode 100644 index 0000000..e510ce8 --- /dev/null +++ b/CLAM/download_data.py @@ -0,0 +1,71 @@ +import os +import sys +import subprocess +import peak_annotator + + +def parser(args): + """DOCSTRING + Args + Returns + """ + try: + genome = args.genome + download_genome(genome) + except KeyboardInterrupt(): + sys.exit(0) + + +def download_genome(genome): + curr_dir = os.path.abspath('.') + + admin = (os.getuid() == 0) + cmd = [] + home = os.environ['HOME'] + if admin: + profile = '/etc/profile' + else: + profile = '{home}/.bashrc'.format(home=home) + + if not os.path.isdir('{home}/.clam_data'.format(home=home)): + os.mkdir('{home}/.clam_data'.format(home=home)) + os.chdir('{home}/.clam_data'.format(home=home)) + + if 'CLAM_DAT' not in os.environ or not os.environ['CLAM_DAT'] == '{home}/.clam_data'.format(home=home): + cmd.append('echo "export CLAM_DAT=\'{clam_data}\'" >> {profile}'.format( + clam_data=os.path.abspath('.'), profile=profile)) + cmd.append('source {profile}'.format(profile=profile)) + os.environ['CLAM_DAT'] = os.path.abspath('.') + + if not check_genome_data(genome): + cmd.append('chmod -R 755 {home}/.clam_data'.format(home=home)) + cmd.append( + 'wget https://raw.githubusercontent.com/wkdeng/clam_data/master/{genome}.zip'.format(genome=genome)) + cmd.append('unzip -o {genome}.zip'.format(genome=genome)) + cmd.append('rm {genome}.zip'.format(genome=genome)) + for item in cmd: + subprocess.call(item, shell=True, executable='/bin/bash') + print 'Download finished' + os.chdir(curr_dir) + +def check_genome_data(genome): + if not os.path.isdir(os.environ['CLAM_DAT'] + '/' + genome): + return False + if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/3UTRs.bed'): + return False + if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/5UTRs.bed'): + return False + if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/cds.bed'): + return False + if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/exons.bed'): + return False + if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/introns.bed'): + return False + if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/proximal200_intron.bed'): + return False + if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/proximal500_intron.bed'): + return False + return True + +if __name__ == '__main__': + download_genome('hg38') diff --git a/CLAM/peak_annotator.py b/CLAM/peak_annotator.py new file mode 100644 index 0000000..8d5bf36 --- /dev/null +++ b/CLAM/peak_annotator.py @@ -0,0 +1,145 @@ +import sys +import os +import pybedtools +import argparse as ap +import logging +import download_data + +''' +Assign peaks to genomic regions +Zijun Zhang +8.1.2018 +10.25.2018: wrapped to a function with document + +DWK +modified to output annotation file +6.12.2019 +''' + +# pylint: disable-msg=too-many-function-args +# pylint: disable-msg=unexpected-keyword-arg + + +def parser(args): + """DOCSTRING + Args + Returns + """ + try: + peak_in = args.peak_in + genome = args.genome + out_file = args.out_file + if 'CLAM_DAT' not in os.environ or not 
download_data.check_genome_data(genome):
+            print "Unable to locate CLAM data folder for genomic regions, will try to download."
+            print "Downloading..."
+            download_data.download_genome(genome)
+        genome_data = os.environ['CLAM_DAT']
+        intersect_gtf_regions(
+            peak_in, out_file, os.path.join(genome_data, genome))
+    except KeyboardInterrupt():
+        sys.exit(0)
+
+
+def intersect_gtf_regions(peak_fp, outfn, gtf_dir):
+    '''function: intersect_gtf_regions(peak_fp, outfn, gtf_dir)
+    Intersect a peak BED file with a list of genomic region annotations (e.g. start/stop codon, UTR, intron),
+    output the peak-region annotations.
+    :param peak_fp: filepath to a BED-format peak file
+    :param outfn: filepath to the output count file; it has to end with ".txt", and the annotation will be "NNN.annot.txt"
+
+    '''
+    # input arguments
+
+    # make pybedtools objects
+    print "Loading peaks..."
+    peaks = pybedtools.BedTool(peak_fp)
+    print "Peak file loaded."
+    print "Loading genome annotation..."
+    ref_dict = {
+        'exon': pybedtools.BedTool(os.path.join(gtf_dir, 'exons.bed')),
+        '3UTR': pybedtools.BedTool(os.path.join(gtf_dir, '3UTRs.bed')),
+        '5UTR': pybedtools.BedTool(os.path.join(gtf_dir, '5UTRs.bed')),
+        'cds': pybedtools.BedTool(os.path.join(gtf_dir, 'cds.bed')),
+        'intron': pybedtools.BedTool(os.path.join(gtf_dir, 'introns.bed')),
+        'proximal200': pybedtools.BedTool(os.path.join(gtf_dir, 'proximal200_intron.bed')),
+        'proximal500': pybedtools.BedTool(os.path.join(gtf_dir, 'proximal500_intron.bed'))
+    }
+    print "Genome annotation loaded."
+
+    # # process reference for use
+    target = {
+        "3UTR": ref_dict['3UTR'],
+        "5UTR": ref_dict['5UTR'],
+        "CDS": ref_dict['cds'],
+        "other_exon": ref_dict['exon']-ref_dict['3UTR']-ref_dict['5UTR']-ref_dict['cds'],
+        "px200_intron": ref_dict['proximal200'],
+        "px500_intron": ref_dict['proximal500'].subtract(ref_dict['proximal200']),
+        "distal_intron": ref_dict['intron'].subtract(ref_dict['exon']).subtract(ref_dict['proximal500'])
+    }
+    category_list = ['3UTR', '5UTR', 'CDS',
+                     'other_exon', "px200_intron", "px500_intron", "distal_intron"]
+    init = True
+
+    print "Intersecting peaks with genome annotation..."
+    for cat in category_list:
+        bed_arr = []
+        for interval in target[cat]:
+            bed_arr.append('\t'.join([str(x) for x in interval.fields]))
+            bed_arr[-1] = bed_arr[-1] + '\t' + cat
+        bed_arr = list(dict.fromkeys(bed_arr))
+        for i in range(len(bed_arr)):
+            bed_arr[i] = bed_arr[i].split('\t')
+        target[cat] = pybedtools.BedTool(bed_arr)
+
+        if init:
+            init = False
+            result_bed = peaks.intersect(target[cat], wa=True, wb=True)
+        else:
+            result_bed = result_bed.cat(peaks.intersect(
+                target[cat], wa=True, wb=True), postmerge=False)
+    result_bed = result_bed.sort()
+
+    print "Preparing output..."
+ result_bed.saveas(outfn + '_') + prepend = ['## Annotation peaks to genomic regions, all intersected genomic regions are presented.', + '## CLAM version: 1.2.0', + '## Column 1: Peak chromosome', + '## Column 2: Peak start', + '## Column 3: Peak end', + '## Column 4: Peak name', + '## Column 5: Peak score', + '## Column 6: Peak strand', + '## Column 7: Peak signal value', + '## Column 8: Peak pValue', + '## Column 9: Peak qValue', + '## Column 10: Point-source called for this peak', + '## Column 11: Genomic region chromosome', + '## Column 12: Genomic region start', + '## Column 13: Genomic region end', + '## Column 14: Gene ID', + '## Column 15: Quality score', + '## Column 16: Genomic region strand', + '## Column 17: Genomic region type'] + if os.path.exists(outfn): + os.remove(outfn) + for line in prepend: + cmd = 'echo "{prepend}" >> {outfn}'.format( + prepend=line, outfn=outfn) + os.system(cmd) + os.system('cat {outtmp} >> {outfn}'.format( + outtmp=outfn + '_', outfn=outfn)) + os.remove(outfn+'_') + print "DONE" + + +if __name__ == '__main__': + # peak_fp, genome, outfn = sys.argv[1], sys.argv[2], sys.argv[3] + os.chdir('/mnt/h/yi_lab/m6a/src/scripts/peakComposition') + peak_in, genome, out_file = 'narrow_peak.unique.bed', 'mm10', 'annotate_peak.bed' + if 'CLAM_DAT' not in os.environ or not download_data.check_genome_data(genome): + print "Unable to find CLAM data folder for genomic regions, please try to download it using download_genome command." + print "Downloading..." + download_data.download_genome(genome) + genome_data = os.environ['CLAM_DAT'] + intersect_gtf_regions( + peak_in, out_file, os.path.join(genome_data, genome)) diff --git a/bin/CLAM b/bin/CLAM index a7d0f7a..1deca1c 100755 --- a/bin/CLAM +++ b/bin/CLAM @@ -42,235 +42,280 @@ import datetime def main(): - """main entry for CLAM - This function setup the logging and handle the input options - Args - None - Returns - None - """ - logger = setup_logger() - argparser = get_arg_parser() - args = argparser.parse_args() - - subcommand = args.subcommand - - if subcommand == 'preprocessor': - from CLAM import preprocessor - preprocessor.parser( args ) - elif subcommand == 'realigner': - from CLAM import realigner - #print args - realigner.parser( args ) - - elif subcommand == 'peakcaller': - from CLAM import peakcaller - #print args - peakcaller.parser( args ) - elif subcommand == 'permutation_callpeak': - from CLAM import permutation_peakcaller - permutation_peakcaller.parser( args ) + """main entry for CLAM + This function setup the logging and handle the input options + Args + None + Returns + None + """ + logger = setup_logger() + argparser = get_arg_parser() + args = argparser.parse_args() + + subcommand = args.subcommand + + if subcommand == 'preprocessor': + from CLAM import preprocessor + preprocessor.parser(args) + elif subcommand == 'realigner': + from CLAM import realigner + #print args + realigner.parser(args) + + elif subcommand == 'peakcaller': + from CLAM import peakcaller + #print args + peakcaller.parser(args) + elif subcommand == 'permutation_callpeak': + from CLAM import permutation_peakcaller + permutation_peakcaller.parser(args) + elif subcommand == 'peak_annotator': + from CLAM import peak_annotator + peak_annotator.parser(args) + elif subcommand == 'data_downloader': + from CLAM import download_data + download_data.parser(args) def setup_logger(): - """Set up the logger for the whole pipeline - Args - None - Returns - logger: logging object - """ - # setup logger - logger = logging.getLogger('CLAM') - 
logger.setLevel(logging.DEBUG) - # create file handler which logs even debug messages - #fh = logging.FileHandler( - # 'log.CLAM.'+'-'.join(str(datetime.datetime.now()).replace(':','-').split()) + '.txt') - fh = logging.FileHandler('log.CLAM.txt') - fh.setLevel(logging.INFO) - # create console handler with a higher log level - ch = logging.StreamHandler() - ch.setLevel(logging.DEBUG) - # create formatter and add it to the handlers - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -\n %(message)s') - fh.setFormatter(formatter) - ch.setFormatter(formatter) - # add the handlers to the logger - logger.addHandler(fh) - logger.addHandler(ch) - return logger + """Set up the logger for the whole pipeline + Args + None + Returns + logger: logging object + """ + # setup logger + logger = logging.getLogger('CLAM') + logger.setLevel(logging.DEBUG) + # create file handler which logs even debug messages + # fh = logging.FileHandler( + # 'log.CLAM.'+'-'.join(str(datetime.datetime.now()).replace(':','-').split()) + '.txt') + fh = logging.FileHandler('log.CLAM.txt') + fh.setLevel(logging.INFO) + # create console handler with a higher log level + ch = logging.StreamHandler() + ch.setLevel(logging.DEBUG) + # create formatter and add it to the handlers + formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s -\n %(message)s') + fh.setFormatter(formatter) + ch.setFormatter(formatter) + # add the handlers to the logger + logger.addHandler(fh) + logger.addHandler(ch) + return logger def get_arg_parser(): - """DOCSTRING - Args - Returns - """ - description = "%(prog)s -- CLip-seq Analysis of Multi-mapped reads" - epilog = "For command line options of each sub-command, type: %(prog)s COMMAND -h" - - argparser = ap.ArgumentParser(description=description, epilog=epilog) - argparser.add_argument("--version", action="version", version="%(prog)s "+__version__) - - subparsers = argparser.add_subparsers( dest="subcommand" ) - - # preprocessing - add_preprocessor_parser(subparsers) - - # realigner - add_realigner_parser(subparsers) - - # peakcaller - add_peakcaller_parser(subparsers) - - # permutation_callpeak - add_permutation_callpeak_parser(subparsers) - - return argparser + """DOCSTRING + Args + Returns + """ + description = "%(prog)s -- CLip-seq Analysis of Multi-mapped reads" + epilog = "For command line options of each sub-command, type: %(prog)s COMMAND -h" + argparser = ap.ArgumentParser(description=description, epilog=epilog) + argparser.add_argument("--version", action="version", + version="%(prog)s "+__version__) -def add_preprocessor_parser( subparsers ): - ag_prep = subparsers.add_parser("preprocessor", help="CLAM Preprocessor: tag read alignments to specific locus") - - # input/output - ag_prep.add_argument("-i", "--input", dest="in_bam", type=str, required=True, - help="Input bam file") - - ag_prep.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, - help="Output folder") - - # processing - ag_prep.add_argument("--read-tagger-method", dest="tag_method", type=str, - choices= ('median', 'start'), default='median', - help="Read tagger method, 'median' for read center, 'start' for read start site; default: median") - - ag_prep.add_argument("--max-multihits", dest="max_hits", type=int, default=100, - help="The maximum hits allowed for multi-mapped reads; default: 100") - - ag_prep.add_argument("--max-tags", dest="max_tags", type=int, default=-1, - help="The maximum identical tags at given location; default: -1, no filter") - - return + subparsers = 
argparser.add_subparsers(dest="subcommand") + # preprocessing + add_preprocessor_parser(subparsers) -def add_realigner_parser( subparsers ): - ag_realigner = subparsers.add_parser("realigner", help="CLAM Realigner: realign multi-mapped reads using expectation-maximization") - - # input/output - ag_realigner.add_argument("-i", "--input", dest="in_bam", type=str, required=True, - help="Input bam file") - - ag_realigner.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, - help="Output folder") - - # processing - ag_realigner.add_argument("--read-tagger-method", dest="tag_method", type=str, - choices= ('median', 'start'), default='median', - help="Read tagger method, 'median' for read center, 'start' for read start site; default: median") - - ag_realigner.add_argument("--max-multihits", dest="max_hits", type=int, default=100, - help="The maximum hits allowed for multi-mapped reads; default: 100") + # realigner + add_realigner_parser(subparsers) - ag_realigner.add_argument("--max-tags", dest="max_tags", type=int, default=-1, - help="The maximum identical tags at given location; default: -1, no filter") + # peakcaller + add_peakcaller_parser(subparsers) - ag_realigner.add_argument("--retag", dest="retag", default=False, action='store_true', - help="Retag the bam regardless when turned on; invalid when no previous files found") - - # realign - ag_realigner.add_argument("--winsize", dest="winsize", type=int, default=50, - help="Local window size for em computations; default: 50") + # permutation_callpeak + add_permutation_callpeak_parser(subparsers) - ag_realigner.add_argument("--unstranded", dest="unstranded", default=False, action="store_true", - help="Unstranded alignments if turned on") - - return + # peak_annotator + add_peak_annotator_parser(subparsers) + # data_downloader + add_data_downloader_parser(subparsers) -def add_peakcaller_parser( subparsers ): - ag_peakcaller = subparsers.add_parser("peakcaller", help="CLAM Peakcaller: negative binomial model-based peak calling combining unique- and multi-reads") - - # input/output - ag_peakcaller.add_argument("-i", "--input", dest="in_bam", nargs='+', type=str, required=True, - help="Filepaths for IP bam files, e.g ubam1,ubam2 mbam1,mbam2") - - ag_peakcaller.add_argument("-c", "--control-dir", dest="con_bam", nargs='+', type=str, required=True, - help="Filepaths for control bam files") - - ag_peakcaller.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, - help="Output folder") - - ag_peakcaller.add_argument("--gtf", dest="gtf_fp", type=str, required=True, - help="GTF filepath") - - # processing - ag_peakcaller.add_argument("-p", "--nthread", dest="nthread", type=int, default=8, - help="Number of threads; default: 8") - - ag_peakcaller.add_argument("-u", "--unique-only", dest="unique_only", default=False, action='store_true', - help="Call peaks using only unique-mapped reads when turned on") - - ag_peakcaller.add_argument("--pool", dest="pooling", default=False, action="store_true", - help="Pool the read counts if provided with multiple replicates; default: False") - - ag_peakcaller.add_argument("--min-clip-cov", dest="min_clip_cov", type=int, default=4, - help="Minimum CLIP reads per gene to perform analysis; default: 4") - - # callpeak - ag_peakcaller.add_argument("--qval-cutoff", dest="qval_cutoff", type=float, default=0.05, - help="Cutoff for adjusted p-values; default: 0.05") - - ag_peakcaller.add_argument("--fold-change", dest="fold_change", nargs='+', type=float, default=[2.], - help="Threasholds 
for signal range (fold change w/ control; tag count w/o control); default: 2-inf") - - ag_peakcaller.add_argument("-b", "--binsize", dest="binsize", type=int, default=50, - help="Bin size for calling peaks; default: 50") - - ag_peakcaller.add_argument("--unstranded", dest="unstranded", action="store_true", default=False, - help="Unstranded alignments if turned on") - - return + return argparser +def add_preprocessor_parser(subparsers): + ag_prep = subparsers.add_parser( + "preprocessor", help="CLAM Preprocessor: tag read alignments to specific locus") -def add_permutation_callpeak_parser( subparsers ): - ag_peakcaller = subparsers.add_parser("permutation_callpeak", help="CLAM permutation peakcaller: call peaks using permutation (as in v1.0.0)") - - # input/output - ag_peakcaller.add_argument("-i", "--input", dest="in_bam", nargs='+', type=str, required=True, - help="Filepaths for CLIP bam, e.g ubam mbam") - - ag_peakcaller.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, - help="Output folder") - - ag_peakcaller.add_argument("--gtf", dest="gtf_fp", type=str, required=True, - help="GTF filepath") - - # processing - ag_peakcaller.add_argument("-p", "--nthread", dest="nthread", type=int, default=8, - help="Number of threads; default: 8") - - ag_peakcaller.add_argument("--random-state", dest="random_state", type=int, default=777, - help="Seed for random number generator in permutations; default: 777") - - # callpeak - ag_peakcaller.add_argument("--qval-cutoff", dest="qval_cutoff", type=float, default=0.005, - help="Cutoff for adjusted p-values; default: 0.005") - - ag_peakcaller.add_argument("--merge-size", dest="merge_size", type=int, default=50, - help="Select best peak within this size; default: 50") - - ag_peakcaller.add_argument("--extend", dest="extend", type=int, default=50, - help="Extend peak to this size if less than it; default: 50") - - ag_peakcaller.add_argument("--unstranded", dest="unstranded", action="store_true", default=False, - help="Unstranded alignments if turned on") - - return + # input/output + ag_prep.add_argument("-i", "--input", dest="in_bam", type=str, required=True, + help="Input bam file") + + ag_prep.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, + help="Output folder") + + # processing + ag_prep.add_argument("--read-tagger-method", dest="tag_method", type=str, + choices=('median', 'start'), default='median', + help="Read tagger method, 'median' for read center, 'start' for read start site; default: median") + + ag_prep.add_argument("--max-multihits", dest="max_hits", type=int, default=100, + help="The maximum hits allowed for multi-mapped reads; default: 100") + + ag_prep.add_argument("--max-tags", dest="max_tags", type=int, default=-1, + help="The maximum identical tags at given location; default: -1, no filter") + + return + + +def add_realigner_parser(subparsers): + ag_realigner = subparsers.add_parser( + "realigner", help="CLAM Realigner: realign multi-mapped reads using expectation-maximization") + + # input/output + ag_realigner.add_argument("-i", "--input", dest="in_bam", type=str, required=True, + help="Input bam file") + + ag_realigner.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, + help="Output folder") + + # processing + ag_realigner.add_argument("--read-tagger-method", dest="tag_method", type=str, + choices=('median', 'start'), default='median', + help="Read tagger method, 'median' for read center, 'start' for read start site; default: median") + + 
ag_realigner.add_argument("--max-multihits", dest="max_hits", type=int, default=100, + help="The maximum hits allowed for multi-mapped reads; default: 100") + + ag_realigner.add_argument("--max-tags", dest="max_tags", type=int, default=-1, + help="The maximum identical tags at given location; default: -1, no filter") + + ag_realigner.add_argument("--retag", dest="retag", default=False, action='store_true', + help="Retag the bam regardless when turned on; invalid when no previous files found") + + # realign + ag_realigner.add_argument("--winsize", dest="winsize", type=int, default=50, + help="Local window size for em computations; default: 50") + + ag_realigner.add_argument("--unstranded", dest="unstranded", default=False, action="store_true", + help="Unstranded alignments if turned on") + + return + + +def add_peakcaller_parser(subparsers): + ag_peakcaller = subparsers.add_parser( + "peakcaller", help="CLAM Peakcaller: negative binomial model-based peak calling combining unique- and multi-reads") + + # input/output + ag_peakcaller.add_argument("-i", "--input", dest="in_bam", nargs='+', type=str, required=True, + help="Filepaths for IP bam files, e.g ubam1,ubam2 mbam1,mbam2") + + ag_peakcaller.add_argument("-c", "--control-dir", dest="con_bam", nargs='+', type=str, required=True, + help="Filepaths for control bam files") + + ag_peakcaller.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, + help="Output folder") + + ag_peakcaller.add_argument("--gtf", dest="gtf_fp", type=str, required=True, + help="GTF filepath") + + # processing + ag_peakcaller.add_argument("-p", "--nthread", dest="nthread", type=int, default=8, + help="Number of threads; default: 8") + + ag_peakcaller.add_argument("-u", "--unique-only", dest="unique_only", default=False, action='store_true', + help="Call peaks using only unique-mapped reads when turned on") + + ag_peakcaller.add_argument("--pool", dest="pooling", default=False, action="store_true", + help="Pool the read counts if provided with multiple replicates; default: False") + + ag_peakcaller.add_argument("--min-clip-cov", dest="min_clip_cov", type=int, default=4, + help="Minimum CLIP reads per gene to perform analysis; default: 4") + + # callpeak + ag_peakcaller.add_argument("--qval-cutoff", dest="qval_cutoff", type=float, default=0.05, + help="Cutoff for adjusted p-values; default: 0.05") + + ag_peakcaller.add_argument("--fold-change", dest="fold_change", nargs='+', type=float, default=[2.], + help="Threasholds for signal range (fold change w/ control; tag count w/o control); default: 2-inf") + + ag_peakcaller.add_argument("-b", "--binsize", dest="binsize", type=int, default=50, + help="Bin size for calling peaks; default: 50") + + ag_peakcaller.add_argument("--unstranded", dest="unstranded", action="store_true", default=False, + help="Unstranded alignments if turned on") + + return + + +def add_permutation_callpeak_parser(subparsers): + ag_peakcaller = subparsers.add_parser( + "permutation_callpeak", help="CLAM permutation peakcaller: call peaks using permutation (as in v1.0.0)") + + # input/output + ag_peakcaller.add_argument("-i", "--input", dest="in_bam", nargs='+', type=str, required=True, + help="Filepaths for CLIP bam, e.g ubam mbam") + + ag_peakcaller.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, + help="Output folder") + + ag_peakcaller.add_argument("--gtf", dest="gtf_fp", type=str, required=True, + help="GTF filepath") + + # processing + ag_peakcaller.add_argument("-p", "--nthread", dest="nthread", 
type=int, default=8,
+                               help="Number of threads; default: 8")
+
+    ag_peakcaller.add_argument("--random-state", dest="random_state", type=int, default=777,
+                               help="Seed for random number generator in permutations; default: 777")
+
+    # callpeak
+    ag_peakcaller.add_argument("--qval-cutoff", dest="qval_cutoff", type=float, default=0.005,
+                               help="Cutoff for adjusted p-values; default: 0.005")
+
+    ag_peakcaller.add_argument("--merge-size", dest="merge_size", type=int, default=50,
+                               help="Select best peak within this size; default: 50")
+
+    ag_peakcaller.add_argument("--extend", dest="extend", type=int, default=50,
+                               help="Extend peak to this size if less than it; default: 50")
+
+    ag_peakcaller.add_argument("--unstranded", dest="unstranded", action="store_true", default=False,
+                               help="Unstranded alignments if turned on")
+
+    return
+
+
+def add_peak_annotator_parser(subparsers):
+    ag_anno = subparsers.add_parser(
+        "peak_annotator", help="CLAM peak annotator: assign peaks to genomic regions")
+
+    # input/output
+    ag_anno.add_argument("-i", "--input", dest="peak_in", type=str, required=True,
+                         help="Input peak file")
+
+    ag_anno.add_argument("-g", "--genome", dest="genome", choices=('hg19', 'hg38', 'mm10'), type=str, required=True,
+                         help="Genome version (hg19, hg38, mm10 available)")
+
+    ag_anno.add_argument("-o", "--out-file", dest="out_file", type=str, required=True,
+                         help="Output file")
+
+    return
+
+
+def add_data_downloader_parser(subparsers):
+    ag_down = subparsers.add_parser(
+        "data_downloader", help="CLAM data downloader: download data of genomic regions")
+
+    # input/output
+    ag_down.add_argument("-g", "--genome", dest="genome", choices=('hg19', 'hg38', 'mm10'), type=str, required=True,
+                         help="Genome version (hg19, hg38, mm10 available)")
+
+    return
 
 if __name__ == '__main__':
-	try:
-		main()
-	except KeyboardInterrupt:
-		sys.stderr.write("User interrupted; program terminated.")
-		sys.exit(0)
\ No newline at end of file
+    try:
+        main()
+    except KeyboardInterrupt:
+        sys.stderr.write("User interrupted; program terminated.")
+        sys.exit(0)

From d4ae4dbc616bda6af32f8449df648b4ca938f9d8 Mon Sep 17 00:00:00 2001
From: zj-zhang
Date: Fri, 28 Jun 2019 03:06:29 -0400
Subject: [PATCH 7/9] update python3 compatible

---
 CLAM/peakcaller.py    | 12 +++++++-----
 CLAM/realigner.py     | 33 ++++++++++++++++++++++-----------
 CLAM/stats/ztnb_em.py |  2 +-
 3 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/CLAM/peakcaller.py b/CLAM/peakcaller.py
index 8c0680c..98f71f4 100755
--- a/CLAM/peakcaller.py
+++ b/CLAM/peakcaller.py
@@ -225,7 +225,7 @@ def _neg_loglik_constrain(par, data):
 		for i in range(intv_bin_ip.shape[0]):
 			height = ztnb_em.collapse_data(np.floor(intv_bin_ip[i,]))
 			height[0] = 0
-			if np.sum(height.values())>0:
+			if np.sum([x for x in height.values()])>0:
 				try:
 					ll, mu, alpha = ztnb_em.EM_estim_params(height, max_iter=10, verbose=False)
 				except:
@@ -401,11 +401,13 @@ def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, binsiz
 	return BED
 
 
-def _child_peak_caller( (ip_bam_list, con_bam_list, child_gene_list, gene_annot, unique_only, with_control, unstranded, binsize, qval_cutoff, fold_change, min_clip_cov, pooling, norm_lib, tot_count_dict) ):
+#def _child_peak_caller(ip_bam_list, con_bam_list, child_gene_list, gene_annot, unique_only, with_control, unstranded, binsize, qval_cutoff, fold_change, min_clip_cov ):
+def _child_peak_caller(args):
 	"""DOCSTRING
 	Args
 	Returns
 	"""
+	ip_bam_list, con_bam_list, child_gene_list, gene_annot, unique_only, with_control, unstranded, binsize,
qval_cutoff, fold_change, min_clip_cov, pooling, norm_lib, tot_count_dict = args # open file handler for child process bam_dict = make_bam_handler_dict(ip_bam_list, con_bam_list) @@ -582,7 +584,7 @@ def peakcaller(ip_bam_list, gtf_fp, con_bam_list=None, nthread=8, else: # multi-threading on subset of genes logger.info('multi-threading') - gene_list = gene_annot.keys() + gene_list = list(gene_annot.keys()) child_gene_list = [x for x in chunkify(gene_list, nthread)] pool=Pool(processes=nthread) BED_list = pool.map( @@ -606,7 +608,7 @@ def chunkify(a, n): the chunkified index """ k, m = len(a) / n, len(a) % n - return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in xrange(n)) + return (a[ int(i * k + min(i, m)) :int((i + 1) * k + min(i + 1, m))] for i in range(n)) def parser(args): @@ -644,7 +646,7 @@ def parser(args): pooling=pooling) logger.info('end') - except KeyboardInterrupt(): + except KeyboardInterrupt: sys.exit(0) return diff --git a/CLAM/realigner.py b/CLAM/realigner.py index c665bed..09cf8cf 100755 --- a/CLAM/realigner.py +++ b/CLAM/realigner.py @@ -25,7 +25,7 @@ import sys import pysam import numpy as np -from collections import defaultdict +from collections import defaultdict, deque #from tqdm import tqdm import logging import datetime @@ -205,7 +205,7 @@ def build_read_cluster(alignment, chr_dict, location_to_reads, genomic_cluster_d chr_strand = chrom+':'+strand idx = bisect.bisect_right(genomic_cluster_dict[chr_strand], site) if not idx%2: - print alignment + print(alignment) raise Exception('%s falls out of region %s'%(alignment.qname, chr_strand+':'+str(site)) ) start = genomic_cluster_dict[chr_strand][idx-1] - winsize start = 1 if start<1 else start @@ -216,11 +216,12 @@ def build_read_cluster(alignment, chr_dict, location_to_reads, genomic_cluster_d ## fetch the reads cluster_name = ':'.join([chrom, strand, str(genomic_cluster_dict[chr_strand][idx-1]), str(genomic_cluster_dict[chr_strand][idx])]) if not cluster_name in location_to_reads: - raise Exception("cannot find cluster %s in `location_to_reads`"%cluster_name) + raise Exception("cannot find cluster '%s' associated with read '%s' in `location_to_reads` of len %i"%(cluster_name, alignment.qname, len(location_to_reads))) mread_list = location_to_reads[cluster_name] - del location_to_reads[cluster_name] + #print(alignment, cluster_name) for x in mread_list: this_mread_dict_set[x.qname].add(x) + del location_to_reads[cluster_name] ## find other mreads in this cluster for read_x_qname in this_mread_dict_set: @@ -228,7 +229,7 @@ def build_read_cluster(alignment, chr_dict, location_to_reads, genomic_cluster_d discarded_mread_alignments.extend( [ x for x in list(this_mread_dict_set[read_x_qname]) ]) else: this_mread_dict[read_x_qname] = list(this_mread_dict_set[read_x_qname])[0] - + return genomic_cluster, this_mread_dict, discarded_mread_alignments @@ -275,8 +276,13 @@ def construct_subgraph(location_to_reads, read_qname, mread_dict, processed_mrea ## record those discarded alignments/reads ## note: we mark discarded_mread as processed as well, ## so as not to create a bias to less clustered regions. 
- _ = map(processed_mread_alignments.add, discarded_mread_list) - _ = map(processed_mreads.add, [x.qname for x in discarded_mread_list]) + # THIS IS PYTHON3 INCOMPATIBLE + #_ = map(processed_mread_alignments.add, discarded_mread_list) + #_ = map(processed_mreads.add, [x.qname for x in discarded_mread_list]) + for x in discarded_mread_list: + processed_mread_alignments.add(x) + for x in discarded_mread_list: + processed_mreads.add(x.qname) if genomic_cluster is None: # this cluster is invald (only double-mappers) continue @@ -291,7 +297,10 @@ def construct_subgraph(location_to_reads, read_qname, mread_dict, processed_mrea ## then add new alignments(edges) to generate connected nodes ## in the next iteration - _ = map(processed_mread_alignments.add, this_mread_dict.values()) + # THIS IS PYTHON3 INCOMPATIBLE + #_ = map(processed_mread_alignments.add, this_mread_dict.values()) + for x in list(this_mread_dict.values()): + processed_mread_alignments.add(x) for read_x_qname in this_mread_dict: if read_x_qname in processed_mreads: continue @@ -300,7 +309,9 @@ def construct_subgraph(location_to_reads, read_qname, mread_dict, processed_mrea ## .. and record to processed reads since we have generated ## the nodes for them - _ = map(processed_mreads.add, this_mread_dict.keys()) + #_ = map(processed_mreads.add, this_mread_dict.keys()) # this is python3 incompatible + for x in list(this_mread_dict.keys()): + processed_mreads.add(x) # if no more connected nodes can be found, break loop if len(next_read_aln_list)==0: @@ -451,7 +462,7 @@ def realigner(in_bam, out_dir, max_hits=100, max_tags=-1, read_tagger_method='me genomic_cluster_dict, winsize=winsize, unstranded=unstranded) subgraph = set() for read in read_to_locations: - _ = map(subgraph.add, read_to_locations[read].keys()) + _ = deque(map(subgraph.add, read_to_locations[read].keys())) subgraph = list(subgraph) #if len(subgraph)==1 and len(read_to_locations)>10: # raise Exception('Incorrect mread assigned to one location') @@ -516,7 +527,7 @@ def parser(args): winsize=winsize, unstranded=unstranded, retag=retag, strandness=strandness) logger.info('end') - except KeyboardInterrupt(): + except KeyboardInterrupt: sys.exit(0) return diff --git a/CLAM/stats/ztnb_em.py b/CLAM/stats/ztnb_em.py index c3135a6..85001c8 100755 --- a/CLAM/stats/ztnb_em.py +++ b/CLAM/stats/ztnb_em.py @@ -89,7 +89,7 @@ def EM_estim_params(height, tol = 10**-4, max_iter = 1000, verbose = False, mu = raise ZeroDivisionError('invalid loglik function value') error = abs((score - prev_score)/score) if verbose: - print 'Iter ' + str(i) + ': eps = ' + str(error) + '; mu = ' + str(mu) + '; alpha = ' + str(alpha) + print('Iter ' + str(i) + ': eps = ' + str(error) + '; mu = ' + str(mu) + '; alpha = ' + str(alpha)) if(error < tol): break prev_score = score From e7a82ed4ebda2bbe509f0fd310f27f24bb6acf6f Mon Sep 17 00:00:00 2001 From: zj-zhang Date: Wed, 10 Jul 2019 13:39:54 -0400 Subject: [PATCH 8/9] fix foldchange formatting in py3 --- CLAM/peakcaller.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CLAM/peakcaller.py b/CLAM/peakcaller.py index 98f71f4..0e9d37a 100755 --- a/CLAM/peakcaller.py +++ b/CLAM/peakcaller.py @@ -185,13 +185,13 @@ def _neg_loglik_constrain(par, data): alpha_ip_vec = np.empty(intv_bin_ip.shape[0]) alpha_con_vec = np.empty(intv_bin_con.shape[0]) if norm_lib and tot_count_dict is not None: - try: + if 'mbam.ip' in tot_count_dict: ip_sum = np.array(tot_count_dict['ubam.ip']) + np.array(tot_count_dict['mbam.ip'] ) - except KeyError: + else: 
ip_sum = np.array(tot_count_dict['ubam.ip']) - try: + if 'mbam.con' in tot_count_dict: con_sum = np.array(tot_count_dict['ubam.con']) + np.array(tot_count_dict['mbam.con']) - except KeyError: + else: con_sum = np.array(tot_count_dict['ubam.con']) else: ip_sum = np.apply_along_axis(np.sum, 1, intv_bin_ip) @@ -375,7 +375,7 @@ def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, binsiz ## "narrowPeak" format from ## https://genome.ucsc.edu/FAQ/FAQformat.html#format12 ## chr start end name 1000 strand signalValue pVal qVal peak - narrowPeak_formatter = "%s\t%i\t%i\t%s\t1000\t%s\t%s\t%.3e\t%.3e\t.\n" + narrowPeak_formatter = "%s\t%i\t%i\t%s\t1000\t%s\t%.3f\t%.3e\t%.3e\t.\n" BED = '' if len(fold_change)==1: lb = np.log(fold_change[0]) if with_control else fold_change[0] From e55fe4f11a9a95eeae76a2d705917e439cab6528 Mon Sep 17 00:00:00 2001 From: zj-zhang Date: Tue, 23 Jul 2019 19:31:28 -0400 Subject: [PATCH 9/9] merged for py3 compatible --- CLAM/config.py | 2 +- CLAM/download_data.py | 3 +-- CLAM/peak_annotator.py | 26 +++++++++--------- CLAM/peakcaller.py | 4 ++- bin/CLAM | 60 +++++++++++++++++++++--------------------- setup.py | 14 +++++++--- 6 files changed, 58 insertions(+), 51 deletions(-) diff --git a/CLAM/config.py b/CLAM/config.py index dcc32a5..711e924 100644 --- a/CLAM/config.py +++ b/CLAM/config.py @@ -3,6 +3,6 @@ """ General Version and other info """ -__version__ = '1.2.0-beta' +__version__ = '1.2.0' __author__ = 'Zijun Zhang' __email__ = 'zj.z@ucla.edu' \ No newline at end of file diff --git a/CLAM/download_data.py b/CLAM/download_data.py index e510ce8..3f8cbb8 100644 --- a/CLAM/download_data.py +++ b/CLAM/download_data.py @@ -1,7 +1,6 @@ import os import sys import subprocess -import peak_annotator def parser(args): @@ -45,7 +44,7 @@ def download_genome(genome): cmd.append('rm {genome}.zip'.format(genome=genome)) for item in cmd: subprocess.call(item, shell=True, executable='/bin/bash') - print 'Download finished' + print('Download finished') os.chdir(curr_dir) def check_genome_data(genome): diff --git a/CLAM/peak_annotator.py b/CLAM/peak_annotator.py index 8d5bf36..3907b23 100644 --- a/CLAM/peak_annotator.py +++ b/CLAM/peak_annotator.py @@ -3,7 +3,7 @@ import pybedtools import argparse as ap import logging -import download_data +from . import download_data, config ''' Assign peaks to genomic regions @@ -30,8 +30,8 @@ def parser(args): genome = args.genome out_file = args.out_file if 'CLAM_DAT' not in os.environ or not download_data.check_genome_data(genome): - print "Unable to locate CLAM data folder for genomic regions, will try to download." - print "Downloading..." + print("Unable to locate CLAM data folder for genomic regions, will try to download.") + print("Downloading...") download_data.download_genome(genome) genome_data = os.environ['CLAM_DAT'] intersect_gtf_regions( @@ -51,10 +51,10 @@ def intersect_gtf_regions(peak_fp, outfn, gtf_dir): # input arguments # make pybedtools objects - print "Loading peaks..." + print("Loading peaks...") peaks = pybedtools.BedTool(peak_fp) - print "Peak file loaded." - print "Loading genome annotation..." 
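
One more Python 3 change surfaces in the peak_annotator.py hunk above: implicit relative imports are gone, so a module inside the CLAM package can no longer reach a sibling with a bare import download_data. A short sketch of the assumed layout and the two import styles:

    # CLAM/
    #     __init__.py
    #     config.py
    #     download_data.py
    #     peak_annotator.py
    #
    # inside CLAM/peak_annotator.py:
    from . import download_data, config   # Python 3: explicit relative import
    # import download_data                # Python 2 only; ImportError on py3

    # Caveat: explicit relative imports only work when the module runs as part
    # of the package (python -m CLAM.peak_annotator, or via the bin/CLAM entry
    # script), not when executed as a loose file.
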
+ print("Peak file loaded.") + print("Loading genome annotation...") ref_dict = { 'exon': pybedtools.BedTool(os.path.join(gtf_dir, 'exons.bed')), '3UTR': pybedtools.BedTool(os.path.join(gtf_dir, '3UTRs.bed')), @@ -64,7 +64,7 @@ def intersect_gtf_regions(peak_fp, outfn, gtf_dir): 'proximal200': pybedtools.BedTool(os.path.join(gtf_dir, 'proximal200_intron.bed')), 'proximal500': pybedtools.BedTool(os.path.join(gtf_dir, 'proximal500_intron.bed')) } - print "Genome annotation loaded." + print("Genome annotation loaded.") # # process reference for use target = { @@ -80,7 +80,7 @@ def intersect_gtf_regions(peak_fp, outfn, gtf_dir): 'other_exon', "px200_intron", "px500_intron", "distal_intron"] init = True - print "Intersecting peaks with genome annotation..." + print("Intersecting peaks with genome annotation...") for cat in category_list: bed_arr = [] for interval in target[cat]: @@ -99,10 +99,10 @@ def intersect_gtf_regions(peak_fp, outfn, gtf_dir): target[cat], wa=True, wb=True), postmerge=False) result_bed = result_bed.sort() - print "Preparing output..." + print("Preparing output...") result_bed.saveas(outfn + '_') prepend = ['## Annotation peaks to genomic regions, all intersected genomic regions are presented.', - '## CLAM version: 1.2.0', + '## CLAM version: %s'%config.__version__, '## Column 1: Peak chromosome', '## Column 2: Peak start', '## Column 3: Peak end', @@ -129,7 +129,7 @@ def intersect_gtf_regions(peak_fp, outfn, gtf_dir): os.system('cat {outtmp} >> {outfn}'.format( outtmp=outfn + '_', outfn=outfn)) os.remove(outfn+'_') - print "DONE" + print("DONE") if __name__ == '__main__': @@ -137,8 +137,8 @@ def intersect_gtf_regions(peak_fp, outfn, gtf_dir): os.chdir('/mnt/h/yi_lab/m6a/src/scripts/peakComposition') peak_in, genome, out_file = 'narrow_peak.unique.bed', 'mm10', 'annotate_peak.bed' if 'CLAM_DAT' not in os.environ or not download_data.check_genome_data(genome): - print "Unable to find CLAM data folder for genomic regions, please try to download it using download_genome command." - print "Downloading..." 
+ print("Unable to find CLAM data folder for genomic regions, please try to download it using download_genome command.") + print("Downloading...") download_data.download_genome(genome) genome_data = os.environ['CLAM_DAT'] intersect_gtf_regions( diff --git a/CLAM/peakcaller.py b/CLAM/peakcaller.py index 0e9d37a..d512b7d 100755 --- a/CLAM/peakcaller.py +++ b/CLAM/peakcaller.py @@ -375,7 +375,7 @@ def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, binsiz ## "narrowPeak" format from ## https://genome.ucsc.edu/FAQ/FAQformat.html#format12 ## chr start end name 1000 strand signalValue pVal qVal peak - narrowPeak_formatter = "%s\t%i\t%i\t%s\t1000\t%s\t%.3f\t%.3e\t%.3e\t.\n" + narrowPeak_formatter = "%s\t%i\t%i\t%s\t1000\t%s\t%s\t%.3e\t%.3e\t.\n" BED = '' if len(fold_change)==1: lb = np.log(fold_change[0]) if with_control else fold_change[0] @@ -397,6 +397,8 @@ def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, binsiz strand = gene[3] peak_num += 1 peak_name = gene[4] + '-%i'%peak_num + if with_control: + signal = "%.3f"%(float(signal)) BED += narrowPeak_formatter % (chr, binstart, binend, peak_name, strand, signal, pval, qval) return BED diff --git a/bin/CLAM b/bin/CLAM index d93ce03..ef9fe85 100755 --- a/bin/CLAM +++ b/bin/CLAM @@ -65,17 +65,17 @@ def main(): #print args peakcaller.parser( args ) - elif subcommand == 'permutation_callpeak': + elif subcommand == 'permutation_callpeak': from CLAM import permutation_peakcaller permutation_peakcaller.parser( args ) - - elif subcommand == 'peak_annotator': - from CLAM import peak_annotator - peak_annotator.parser(args) - - elif subcommand == 'data_downloader': - from CLAM import download_data - download_data.parser(args) + + elif subcommand == 'peak_annotator': + from CLAM import peak_annotator + peak_annotator.parser(args) + + elif subcommand == 'data_downloader': + from CLAM import download_data + download_data.parser(args) def setup_logger(): @@ -131,12 +131,12 @@ def get_arg_parser(): # permutation_callpeak add_permutation_callpeak_parser(subparsers) - # peak_annotator - add_peak_annotator_parser(subparsers) + # peak_annotator + add_peak_annotator_parser(subparsers) + + # data_downloader + add_data_downloader_parser(subparsers) - # data_downloader - add_data_downloader_parser(subparsers) - return argparser @@ -293,31 +293,31 @@ def add_permutation_callpeak_parser( subparsers ): def add_peak_annotator_parser(subparsers): - ag_anno = subparsers.add_parser( - "peak_annotator", help="CLAM peak annotator: assign peaks to genomic regions") + ag_anno = subparsers.add_parser( + "peak_annotator", help="CLAM peak annotator: assign peaks to genomic regions") - # input/output - ag_anno.add_argument("-i", "--input", dest="peak_in", type=str, required=True, - help="Input peak file") + # input/output + ag_anno.add_argument("-i", "--input", dest="peak_in", type=str, required=True, + help="Input peak file") - ag_anno.add_argument("-g", "--genome", dest="genome", choices=('hg19', 'hg38', 'mm10'), type=str, required=True, - help="Genome version (hg19, hg38, mm10 avaiable)") + ag_anno.add_argument("-g", "--genome", dest="genome", choices=('hg19', 'hg38', 'mm10'), type=str, required=True, + help="Genome version (hg19, hg38, mm10 avaiable)") - ag_anno.add_argument("-o", "--out-file", dest="out_file", type=str, required=True, - help="Output file") + ag_anno.add_argument("-o", "--out-file", dest="out_file", type=str, required=True, + help="Output file") - return + return def add_data_downloader_parser(subparsers): - ag_down = 
diff --git a/setup.py b/setup.py
index d67593c..5c3ff2f 100755
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,11 @@
 #!/usr/bin/env python
 
 from setuptools import setup
-
+from CLAM.config import __version__
 
 def main():
 	setup(name='CLAM',
-		version='1.2.0-beta',
+		version=__version__,
 		description='CLIP-seq Analysis of Multi-mapped reads',
 		author='Zijun Zhang',
 		author_email='zj.z@ucla.edu',
 		url='https://github.com/Xinglab/CLAM',
@@ -13,8 +13,13 @@ def main():
 		packages=['CLAM', 'CLAM.stats'],
 		scripts=['bin/CLAM'],
 		install_requires=[
-			#'pysam>0.12,<0.2',
-			'numpy']
+			'scipy',
+			'pysam',
+			'numpy',
+			'statsmodels',
+			'tqdm',
+			'pybedtools',
+			'mpmath']
 	)
 	return
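
A closing note on the setup.py change: importing __version__ from CLAM.config single-sources the version string, and it is safe here because config.py defines only plain strings, so setup.py can import it before any of the heavy dependencies (pysam, pybedtools) are installed. Two general gotchas for install_requires lists are also worth keeping in mind: adjacent string literals concatenate silently ('pybedtools' 'mpmath' would become the nonexistent requirement 'pybedtoolsmpmath'), and standard-library modules such as multiprocessing must not be listed, since pip would try to fetch the long-dead Python 2 backport. A condensed sketch of the pattern as adopted:

    # CLAM/config.py -- plain metadata only, safe to import at build time
    __version__ = '1.2.0'

    # setup.py
    from setuptools import setup
    from CLAM.config import __version__

    setup(
        name='CLAM',
        version=__version__,
        packages=['CLAM', 'CLAM.stats'],
        scripts=['bin/CLAM'],
        install_requires=['scipy', 'pysam', 'numpy', 'statsmodels',
                          'tqdm', 'pybedtools', 'mpmath'],
    )
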