From 3b7aac61a039644df83677771ef678ad812215b6 Mon Sep 17 00:00:00 2001 From: zj-zhang Date: Tue, 26 Feb 2019 23:54:38 -0800 Subject: [PATCH 1/9] test new preprocess --- CLAM/preprocessor.py | 44 ++++++++++++++++++++++---------------------- setup.py | 17 ++++++++++------- 2 files changed, 32 insertions(+), 29 deletions(-) diff --git a/CLAM/preprocessor.py b/CLAM/preprocessor.py index 1f94f2a..420eb4c 100755 --- a/CLAM/preprocessor.py +++ b/CLAM/preprocessor.py @@ -26,7 +26,7 @@ import pysam import numpy as np from collections import defaultdict -from tqdm import tqdm +#from tqdm import tqdm import logging import datetime import bisect @@ -99,10 +99,6 @@ def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_meth mbam=pysam.Samfile(mbam_fn, 'wb', template=in_bam) mread_set = set() - # do not omit sequences if to filter max_tags - if max_tags>0: - omit_detail=False - # splitting unique and multi- reads # and add the read taggers we need if not \ @@ -121,24 +117,28 @@ def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_meth if read_tag==-1: continue read.tags += [('RT', read_tag)] ## add the tag - - ## omit the details in read sequence and quality - ## recommended for larger bam because this - ## can save some memory/storage for large bams - if omit_detail: - read.query_sequence = '*' - read.query_qualities = '0' + tagged_read = pysam.AlignedSegment() + tagged_read.query_name = read.query_name + tagged_read.query_sequence = 'N' + tagged_read.flag = read.flag + tagged_read.reference_id = read.reference_id + tagged_read.reference_start = read_tag - 1 # 0-based leftmost coordinate + tagged_read.mapping_quality = read.mapping_quality + tagged_read.cigar = ((0,1),) + tagged_read.template_length = read.template_length + tagged_read.query_qualities = pysam.qualitystring_to_array("<") + tagged_read.tags = read.tags if read.is_secondary or (read.has_tag('NH') and read.opt("NH")>1): - try: - if read.opt("NH") < max_hits: - mbam.write(read) - mread_set.add(read.qname) - except KeyError: - #print read - raise Exception('%s: missing NH tag when is_secondary=%s'%(read.qname,read.is_secondary)) + #try: + if read.opt("NH") < max_hits: + mbam.write(tagged_read) + mread_set.add(read.qname) + #except KeyError: + # #print read + # raise Exception('%s: missing NH tag when is_secondary=%s'%(read.qname,read.is_secondary)) else: - ubam.write(read) + ubam.write(tagged_read) unique_counter += 1 ubam.close() @@ -146,9 +146,9 @@ def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_meth # sorting pysam.sort('-m', '4G', '-@', '3', '-T', os.path.dirname(sorted_ubam_fn), '-o', sorted_ubam_fn, ubam_fn) - os.remove(ubam_fn) + #os.remove(ubam_fn) pysam.sort('-m', '4G', '-@', '3', '-T', os.path.dirname(sorted_mbam_fn), '-o', sorted_mbam_fn, mbam_fn) - os.remove(mbam_fn) + #os.remove(mbam_fn) pysam.index(sorted_ubam_fn) pysam.index(sorted_mbam_fn) diff --git a/setup.py b/setup.py index 0040a0e..c73348e 100755 --- a/setup.py +++ b/setup.py @@ -5,13 +5,16 @@ def main(): setup(name='CLAM', - version='1.1.3', - description='CLIP-seq Analysis of Multi-mapped reads', - author='Zijun Zhang', - author_email='zj.z@ucla.edu', - url='https://github.com/Xinglab/CLAM', - packages=['CLAM', 'CLAM.stats'], - scripts=['bin/CLAM'] + version='1.1.4', + description='CLIP-seq Analysis of Multi-mapped reads', + author='Zijun Zhang', + author_email='zj.z@ucla.edu', + url='https://github.com/Xinglab/CLAM', + packages=['CLAM', 'CLAM.stats'], + scripts=['bin/CLAM'], + install_requires=[ + 
'pysam>0.12,<0.2', + 'numpy'] ) return From 1580b48532a6deaeff6ea9fb8120479eeeae45c9 Mon Sep 17 00:00:00 2001 From: zj-zhang Date: Wed, 27 Feb 2019 01:31:39 -0800 Subject: [PATCH 2/9] add strandness (un-tested) --- CLAM/preprocessor.py | 10 ++++++++-- CLAM/realigner.py | 14 +++++++++----- bin/CLAM | 15 ++++++++++++--- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/CLAM/preprocessor.py b/CLAM/preprocessor.py index 420eb4c..f0b98e4 100755 --- a/CLAM/preprocessor.py +++ b/CLAM/preprocessor.py @@ -71,7 +71,7 @@ def read_tagger_collection(alignment, method='median', **kwargs): -def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_method, omit_detail=False): +def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_method, strandness): """Pre-processing function for cleaning up the input bam file. Args: Returns: @@ -128,6 +128,10 @@ def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_meth tagged_read.template_length = read.template_length tagged_read.query_qualities = pysam.qualitystring_to_array("<") tagged_read.tags = read.tags + + # add strandness check + if strandness != "none": + tagged_read.is_reverse = read.is_reverse ^ strandness!="same" if read.is_secondary or (read.has_tag('NH') and read.opt("NH")>1): #try: @@ -271,13 +275,15 @@ def parser(args): max_hits = args.max_hits ## Note: if specified max_tags, need pre-sorted bam max_tags = args.max_tags + strandness = args.strandness #logger = logging.getLogger('CLAM.Preprocessor') logger.info('start') logger.info('run info: %s'%(' '.join(sys.argv))) filter_bam_multihits(in_bam, max_hits=max_hits, max_tags=max_tags, out_dir=out_dir, - read_tagger_method=tag_method) + read_tagger_method=tag_method, + strandness=strandness) logger.info('end') except KeyboardInterrupt(): diff --git a/CLAM/realigner.py b/CLAM/realigner.py index 3f93c36..456b2c1 100755 --- a/CLAM/realigner.py +++ b/CLAM/realigner.py @@ -382,8 +382,9 @@ def get_genomic_clusters(mbam, winsize=50, unstranded=False): def realigner(in_bam, out_dir, max_hits=100, max_tags=-1, read_tagger_method='median', - winsize=50, unstranded=False, retag=False): + winsize=50, unstranded=False, retag=False, strandness="same"): """The main entry for CLAM-realigner. + Args: in_bam (str): filepath for input bam out_dir (str): filepath for CLAM output folder @@ -394,6 +395,9 @@ def realigner(in_bam, out_dir, max_hits=100, max_tags=-1, read_tagger_method='me winsize (int): window size unstranded (bool): ignore alignment strand info if turned on retag (bool): force to call `preprocessor` to process `in_bam` if turned on + strandness (str): specifies if the expected read alignment strand is `same` with + transcript strand, or `opposite`, or `none` i.e. 
unstranded
+
 	Returns:
 		None
 	"""
@@ -410,7 +414,7 @@ def realigner(in_bam, out_dir, max_hits=100, max_tags=-1, read_tagger_method='me
 		os.path.isfile(os.path.join(out_dir,'multi.sorted.bam')) \
 		) :
 		filter_bam_multihits(in_bam, max_tags=max_tags, max_hits=max_hits, out_dir=out_dir, read_tagger_method=read_tagger_method,
-			omit_detail=False)
+			strandness=strandness)
 	else:
 		logger.info("found existing bams; skipped tagging.")
 
@@ -503,14 +507,14 @@ def parser(args):
 		max_tags = args.max_tags
 		retag = args.retag
 		winsize = args.winsize
-		unstranded = args.unstranded
+		strandness = args.strandness
+		unstranded = strandness == "none"
-
-		#logger = logging.getLogger('CLAM.Realigner')
 		logger.info('start')
 		logger.info('run info: %s'%(' '.join(sys.argv)))
 		realigner(in_bam, out_dir, max_hits=max_hits, max_tags=max_tags, read_tagger_method=tag_method,
-			winsize=winsize, unstranded=unstranded, retag=retag)
+			winsize=winsize, unstranded=unstranded, retag=retag, strandness=strandness)
 		logger.info('end')
 
 	except KeyboardInterrupt():
diff --git a/bin/CLAM b/bin/CLAM
index a7d0f7a..07254f2 100755
--- a/bin/CLAM
+++ b/bin/CLAM
@@ -148,7 +148,12 @@ def add_preprocessor_parser( subparsers ):
 
 	ag_prep.add_argument("--max-tags", dest="max_tags", type=int, default=-1,
 		help="The maximum identical tags at given location; default: -1, no filter")
-	
+
+	# strandness
+	ag_prep.add_argument("--strandness", dest="strandness", type=str,
+		default="same", choices=['same', 'opposite', 'none'],
+		help="The expected read strandness with transcription direction: same, opposite, or none (i.e. unstranded); default: same")
+
 	return
 
 
@@ -180,8 +185,12 @@ def add_realigner_parser( subparsers ):
 	ag_realigner.add_argument("--winsize", dest="winsize", type=int, default=50,
 		help="Local window size for em computations; default: 50")
 
-	ag_realigner.add_argument("--unstranded", dest="unstranded", default=False, action="store_true",
-		help="Unstranded alignments if turned on")
+	#ag_realigner.add_argument("--unstranded", dest="unstranded", default=False, action="store_true",
+	#	help="Unstranded alignments if turned on")
+
+	ag_realigner.add_argument("--strandness", dest="strandness", type=str,
+		default="same", choices=['same', 'opposite', 'none'],
+		help="The expected read strandness with transcription direction: same, opposite, or none (i.e.
unstranded); default: same") return From 7253af4dec5d1e824e4377e5aad584789705326f Mon Sep 17 00:00:00 2001 From: zj-zhang Date: Wed, 27 Feb 2019 01:37:31 -0800 Subject: [PATCH 3/9] fix bit op --- CLAM/preprocessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CLAM/preprocessor.py b/CLAM/preprocessor.py index f0b98e4..077cf57 100755 --- a/CLAM/preprocessor.py +++ b/CLAM/preprocessor.py @@ -131,7 +131,7 @@ def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_meth # add strandness check if strandness != "none": - tagged_read.is_reverse = read.is_reverse ^ strandness!="same" + tagged_read.is_reverse = (read.is_reverse) ^ (strandness!="same") if read.is_secondary or (read.has_tag('NH') and read.opt("NH")>1): #try: From b1fa120d685f3e99641e4afa4682f7477df47129 Mon Sep 17 00:00:00 2001 From: zj-zhang Date: Wed, 27 Feb 2019 01:40:29 -0800 Subject: [PATCH 4/9] remove unsorted files --- CLAM/preprocessor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CLAM/preprocessor.py b/CLAM/preprocessor.py index 077cf57..7e40638 100755 --- a/CLAM/preprocessor.py +++ b/CLAM/preprocessor.py @@ -150,9 +150,9 @@ def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_meth # sorting pysam.sort('-m', '4G', '-@', '3', '-T', os.path.dirname(sorted_ubam_fn), '-o', sorted_ubam_fn, ubam_fn) - #os.remove(ubam_fn) + os.remove(ubam_fn) pysam.sort('-m', '4G', '-@', '3', '-T', os.path.dirname(sorted_mbam_fn), '-o', sorted_mbam_fn, mbam_fn) - #os.remove(mbam_fn) + os.remove(mbam_fn) pysam.index(sorted_ubam_fn) pysam.index(sorted_mbam_fn) From e87c52af388b922d680959c87f0af2fa6e02419d Mon Sep 17 00:00:00 2001 From: zj-zhang Date: Sat, 1 Jun 2019 18:49:00 -0400 Subject: [PATCH 5/9] add norm-lib --- CLAM/config.py | 8 +++ CLAM/peakcaller.py | 114 +++++++++++++++++++++++++++------ CLAM/permutation_peakcaller.py | 14 ++-- CLAM/preprocessor.py | 7 +- CLAM/realigner.py | 7 +- bin/CLAM | 10 +-- check/compare_realign.py | 46 +++++++++++++ requirements.txt | 4 +- setup.py | 4 +- 9 files changed, 173 insertions(+), 41 deletions(-) create mode 100644 CLAM/config.py create mode 100644 check/compare_realign.py diff --git a/CLAM/config.py b/CLAM/config.py new file mode 100644 index 0000000..dcc32a5 --- /dev/null +++ b/CLAM/config.py @@ -0,0 +1,8 @@ +# -*- coding: UTF-8 -*- + +""" General Version and other info +""" + +__version__ = '1.2.0-beta' +__author__ = 'Zijun Zhang' +__email__ = 'zj.z@ucla.edu' \ No newline at end of file diff --git a/CLAM/peakcaller.py b/CLAM/peakcaller.py index ff5e009..8c0680c 100755 --- a/CLAM/peakcaller.py +++ b/CLAM/peakcaller.py @@ -25,9 +25,8 @@ Tested under python 2.7.3 """ -__author__ = 'Zijun Zhang' -__version__ = '1.1.3' -__email__ = 'zj.z@ucla.edu' +from . 
import config
+__version__ = config.__version__
 
 import os
 import sys
@@ -128,7 +127,8 @@ def bin_interval_counts(interval, binsize=50):
 	return bins
 
 
-def test_bin_negbinom(intv_bin_ip, intv_bin_con, with_control=True, correction_method='fdr_bh'):
+def test_bin_negbinom(intv_bin_ip, intv_bin_con, with_control=True, correction_method='fdr_bh',
+		norm_lib=False, tot_count_dict=None):
 	"""DOCSTRING
 	Args
 	Returns
@@ -184,20 +184,33 @@ def _neg_loglik_constrain(par, data):
 	binsignal = np.empty(intv_counter, dtype='S50')
 	alpha_ip_vec = np.empty(intv_bin_ip.shape[0])
 	alpha_con_vec = np.empty(intv_bin_con.shape[0])
-	ip_sum = np.apply_along_axis(np.sum, 1, intv_bin_ip)
-	con_sum = np.apply_along_axis(np.sum, 1, intv_bin_con)
-	if any(ip_sum==0) or any(con_sum==0):
-		return None, None, None
+	if norm_lib and tot_count_dict is not None:
+		try:
+			ip_sum = np.array(tot_count_dict['ubam.ip']) + np.array(tot_count_dict['mbam.ip'])
+		except KeyError:
+			ip_sum = np.array(tot_count_dict['ubam.ip'])
+		try:
+			con_sum = np.array(tot_count_dict['ubam.con']) + np.array(tot_count_dict['mbam.con'])
+		except KeyError:
+			con_sum = np.array(tot_count_dict['ubam.con'])
+	else:
+		ip_sum = np.apply_along_axis(np.sum, 1, intv_bin_ip)
+		con_sum = np.apply_along_axis(np.sum, 1, intv_bin_con)
+	
+	## TODO: this check avoided a ZeroDivisionError when the IP or control sum is zero;
+	## commented out because a pseudo-count is now added. ZZJ 2019.5.31
+	#if any(ip_sum==0) or any(con_sum==0):
+	#	return None, None, None
 	
 	# compute the dispersion parameters
 	min_alpha = 0.001 ## small alpha reduces to Poisson
 	max_alpha = 0.01 ## large alpha costs loss of power
 	if with_control:
 		for i in range(intv_bin_con.shape[0]):
-			height = ztnb_em.collapse_data(np.floor(intv_bin_con[i,]))
-			height[0] = 0
 			try:
+				height = ztnb_em.collapse_data(np.floor(intv_bin_con[i,]))
+				height[0] = 0
 				ll, mu, alpha = ztnb_em.EM_estim_params(height, max_iter=10, verbose=False)
 			except:
 				## means truncated loglik=0, not enough data
 				alpha = max_alpha
 			alpha = max_alpha if alpha>max_alpha else alpha
@@ -239,7 +252,7 @@ def _neg_loglik_constrain(par, data):
 		this_ip = intv_bin_ip[:, i]
 		others_ip = ip_sum - this_ip
 		this_con = intv_bin_con[:, i]
-		others_con = con_sum - this_con
+		others_con = con_sum - intv_bin_con[:, i]
 		if np.sum(this_ip) == 0:
 			binsignal[i], binscore[i] = 'nan', np.nan
 			continue
@@ -280,13 +293,15 @@ def _neg_loglik_constrain(par, data):
 			binsignal[i] = ','.join([str(int(x)) if x==int(x) else str(x) for x in this_ip])
 	
 	# correcting for multiple-testing
+	if np.sum(np.isnan(binscore))==len(binscore):
+		return None, None, None
 	adj = multipletests(binscore[~ np.isnan(binscore)], alpha=0.05, method=correction_method)
 	binscore_adj = np.copy(binscore)
 	binscore_adj[ ~ np.isnan(binscore) ] = adj[1]
 	return binsignal, binscore_adj, binscore
 
 
-def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, binsize=50, unstranded=False, qval_cutoff=0.05, fold_change=[2.], min_clip_cov=0, pooling=False):
+def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, binsize=50, unstranded=False, qval_cutoff=0.05, fold_change=[2.], min_clip_cov=0, pooling=False, norm_lib=False, tot_count_dict=None):
 	"""DOCSTRING
 	Args
 	Returns
@@ -309,7 +324,7 @@ def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, binsiz
 	ip_sum = np.apply_along_axis(np.sum, 1, interval_ip)
 	valid_ip_sample = np.where(ip_sum > min_clip_cov)[0]
 	if len(valid_ip_sample)==0:
-		#print "no reads"
+		#print("no reads")
 		return ''
 	
 	interval_ip = interval_ip[valid_ip_sample,:]
@@ -343,7 +358,9
@@ def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, binsiz intv_bin_con = bin_interval_counts(interval_con, binsize=binsize) # perform statistical test - signal_val, binscore_adj, binscore = test_bin_negbinom(intv_bin_ip, intv_bin_con, with_control=with_control) + signal_val, binscore_adj, binscore = test_bin_negbinom( + intv_bin_ip, intv_bin_con, with_control=with_control, + norm_lib=norm_lib, tot_count_dict=tot_count_dict) # DO NOT USE #if with_control or intv_bin_ip.shape[0]>1: @@ -384,7 +401,7 @@ def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, binsiz return BED -def _child_peak_caller( (ip_bam_list, con_bam_list, child_gene_list, gene_annot, unique_only, with_control, unstranded, binsize, qval_cutoff, fold_change, min_clip_cov, pooling) ): +def _child_peak_caller( (ip_bam_list, con_bam_list, child_gene_list, gene_annot, unique_only, with_control, unstranded, binsize, qval_cutoff, fold_change, min_clip_cov, pooling, norm_lib, tot_count_dict) ): """DOCSTRING Args Returns @@ -404,12 +421,66 @@ def _child_peak_caller( (ip_bam_list, con_bam_list, child_gene_list, gene_annot, BED += call_gene_peak(bam_dict, gene, unique_only=unique_only, with_control=with_control, unstranded=unstranded, binsize=binsize, qval_cutoff=qval_cutoff, fold_change=fold_change, - min_clip_cov=min_clip_cov, pooling=pooling) + min_clip_cov=min_clip_cov, pooling=pooling, + norm_lib=norm_lib, tot_count_dict=tot_count_dict) # close the handler _ = map(lambda x: x.close(), [bam for x in bam_dict.values() for bam in x]) return BED +def get_bam_total_reads(ip_bam_list, con_bam_list): + tot_count_dict = defaultdict(list) + try: + ubam_ip, mbam_ip = ip_bam_list + except: + ubam_ip = ip_bam_list[0] + mbam_ip = None + + for bam_fn in ubam_ip.split(','): + if not os.path.isfile(bam_fn): + raise Exception('%s not found'%bam_fn) + tot_count_dict['ubam.ip'].append( get_total_reads(bam_fn) ) + + if mbam_ip is not None: + for bam_fn in mbam_ip.split(','): + if not os.path.isfile(bam_fn): + raise Exception('%s not found'%bam_fn) + tot_count_dict['mbam.ip'].append( get_total_reads(bam_fn) ) + + if con_bam_list is None: + return tot_count_dict + + try: + ubam_con, mbam_con = con_bam_list + except: + ubam_con = con_bam_list[0] + mbam_con = None + + for bam_fn in ubam_con.split(','): + if not os.path.isfile(bam_fn): + raise Exception('%s not found'%bam_fn) + tot_count_dict['ubam.con'].append( get_total_reads(bam_fn) ) + + if mbam_con is not None: + for bam_fn in mbam_con.split(','): + if not os.path.isfile(bam_fn): + raise Exception('%s not found'%bam_fn) + tot_count_dict['mbam.con'].append( get_total_reads(bam_fn) ) + + return tot_count_dict + + +def get_total_reads(bam_filename): + idxstats = pysam.idxstats(bam_filename).split('\n') + tot = 0 + for l in idxstats: + if not l: + continue + ele = l.split('\t') + tot += int(ele[-2]) + return tot + + def make_bam_handler_dict(ip_bam_list, con_bam_list): """DOCSTRING Args @@ -452,6 +523,7 @@ def make_bam_handler_dict(ip_bam_list, con_bam_list): def peakcaller(ip_bam_list, gtf_fp, con_bam_list=None, nthread=8, out_dir='.', binsize=50, + norm_lib=False, unique_only=False, unstranded=True, qval_cutoff=0.05, fold_change=[2.], min_clip_cov=0, pooling=False): @@ -486,6 +558,9 @@ def peakcaller(ip_bam_list, gtf_fp, con_bam_list=None, nthread=8, # read in GTF gene_annot = read_gtf(gtf_fp) + + # read total library read number + tot_count_dict = get_bam_total_reads(ip_bam_list, con_bam_list) if nthread == 1: # iteratively call peaks in each gene @@ 
-495,7 +570,8 @@ def peakcaller(ip_bam_list, gtf_fp, con_bam_list=None, nthread=8, BED = call_gene_peak(bam_dict, gene, unique_only=unique_only, with_control=with_control, unstranded=unstranded, binsize=binsize, qval_cutoff=qval_cutoff, fold_change=fold_change, - min_clip_cov=min_clip_cov, pooling=pooling) + min_clip_cov=min_clip_cov, pooling=pooling, + norm_lib=norm_lib, tot_count_dict=tot_count_dict) ofile.write(BED) #print BED peak_counter += len(BED.split('\n'))-1 @@ -511,7 +587,7 @@ def peakcaller(ip_bam_list, gtf_fp, con_bam_list=None, nthread=8, pool=Pool(processes=nthread) BED_list = pool.map( _child_peak_caller, - [(ip_bam_list, con_bam_list, child_gene_list[i], gene_annot, unique_only, with_control, unstranded, binsize, qval_cutoff, fold_change, min_clip_cov, pooling) for i in range(nthread)] + [(ip_bam_list, con_bam_list, child_gene_list[i], gene_annot, unique_only, with_control, unstranded, binsize, qval_cutoff, fold_change, min_clip_cov, pooling, norm_lib, tot_count_dict) for i in range(nthread)] ) pool.terminate() pool.join() @@ -551,6 +627,7 @@ def parser(args): binsize = args.binsize qval_cutoff = args.qval_cutoff fold_change = args.fold_change + norm_lib = args.norm_lib min_clip_cov = args.min_clip_cov pooling = args.pooling logger = logging.getLogger('CLAM.Peakcaller') @@ -559,6 +636,7 @@ def parser(args): peakcaller(ip_bam_list, gtf_fp, con_bam_list, nthread, out_dir=out_dir, binsize=binsize, + norm_lib=norm_lib, unique_only=unique_only, unstranded=unstranded, qval_cutoff=qval_cutoff, fold_change=fold_change, diff --git a/CLAM/permutation_peakcaller.py b/CLAM/permutation_peakcaller.py index 94b8533..66575d7 100755 --- a/CLAM/permutation_peakcaller.py +++ b/CLAM/permutation_peakcaller.py @@ -17,9 +17,8 @@ Tested under python 2.7.3 """ -__author__ = 'Zijun Zhang' -__version__ = '1.1.3' -__email__ = 'zj.z@ucla.edu' +from . import config +__version__ = config.__version__ import os @@ -466,11 +465,10 @@ def read_tid_frag_from_bam(tid, bamfile, is_stranded, is_unique): continue except: continue - #try: - # read_length = read.qlen - #except: - # read_length = read.positions[-1] - read.positions[0] + 1 - read_length = read.positions[-1] - read.positions[0] + 1 + try: + read_length = read.opt('RL') + except: + read_length = read.positions[-1] - read.positions[0] + 1 if (not 'N' in read.cigarstring) and \ (read.pos-start>=0 and read_length<500): # to avoid junction reads diff --git a/CLAM/preprocessor.py b/CLAM/preprocessor.py index 7e40638..503c2c7 100755 --- a/CLAM/preprocessor.py +++ b/CLAM/preprocessor.py @@ -20,6 +20,8 @@ Tested under python 2.7 """ +from . import config +__version__ = config.__version__ import os import sys @@ -33,9 +35,6 @@ import argparse as ap import inspect -__author__ = 'Zijun Zhang' -__version__ = '1.1.3' -__email__ = 'zj.z@ucla.edu' logger = logging.getLogger('CLAM.Preprocessor') @@ -128,6 +127,8 @@ def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_meth tagged_read.template_length = read.template_length tagged_read.query_qualities = pysam.qualitystring_to_array("<") tagged_read.tags = read.tags + read_len = sum([i[1] for i in read.cigar if i[0]==0]) + tagged_read.tags += [('RL', read_len)] # add strandness check if strandness != "none": diff --git a/CLAM/realigner.py b/CLAM/realigner.py index 456b2c1..c665bed 100755 --- a/CLAM/realigner.py +++ b/CLAM/realigner.py @@ -18,16 +18,15 @@ Tested under python 2.7 """ -__author__ = 'Zijun Zhang' -__version__ = '1.1.3' -__email__ = 'zj.z@ucla.edu' +from . 
import config +__version__ = config.__version__ import os import sys import pysam import numpy as np from collections import defaultdict -from tqdm import tqdm +#from tqdm import tqdm import logging import datetime import bisect diff --git a/bin/CLAM b/bin/CLAM index 07254f2..790ddd6 100755 --- a/bin/CLAM +++ b/bin/CLAM @@ -30,10 +30,7 @@ Foundation, either version 3 of the License, or (at your option) any later version """ -__author__ = 'Zijun Zhang' -__version__ = '1.1.3' -__email__ = 'zj.z@ucla.edu' - +from CLAM import config import os import sys import logging @@ -109,7 +106,7 @@ def get_arg_parser(): epilog = "For command line options of each sub-command, type: %(prog)s COMMAND -h" argparser = ap.ArgumentParser(description=description, epilog=epilog) - argparser.add_argument("--version", action="version", version="%(prog)s "+__version__) + argparser.add_argument("--version", action="version", version="%(prog)s "+config.__version__) subparsers = argparser.add_subparsers( dest="subcommand" ) @@ -230,6 +227,9 @@ def add_peakcaller_parser( subparsers ): ag_peakcaller.add_argument("--fold-change", dest="fold_change", nargs='+', type=float, default=[2.], help="Threasholds for signal range (fold change w/ control; tag count w/o control); default: 2-inf") + + ag_peakcaller.add_argument("--normalize-lib", dest="norm_lib", action="store_true", default=False, + help="use total library size to normalize signal and control, instead of gene-by-gene basis; default: False") ag_peakcaller.add_argument("-b", "--binsize", dest="binsize", type=int, default=50, help="Bin size for calling peaks; default: 50") diff --git a/check/compare_realign.py b/check/compare_realign.py new file mode 100644 index 0000000..188194e --- /dev/null +++ b/check/compare_realign.py @@ -0,0 +1,46 @@ +"""Compare the realigner outputs for different +version +ZZJ +2019.5.27 +""" + +import sys +import pysam + + + +def read_as_score(bfile): + s1 = {} + with pysam.Samfile(bfile, 'rb') as bam: + i = 0 + for r1 in bam: + i += 1 + #if i>30000: + # break + s1[(r1.qname, r1.rname, r1.pos)] = r1.opt('AS') + return s1 + + +def plot_scatter(new, old): + import matplotlib.pyplot as plt + import seaborn as sns + ax = sns.jointplot(new, old, kind="reg") + ax.set_axis_labels('New', 'Old') + plt.savefig('realign_check.png') + +def compare(): + s1 = read_as_score('new_out/realigned.sorted.bam') + s2 = read_as_score('old_out/realigned.sorted.bam') + k = list([x for x in s1 if x in s2]) + old = [] + new = [] + print("ID\tnew\told\n") + for k_ in k: + print("%s\t%s\t%s\n"%(k_, s1[k_], s2[k_] ) ) + new.append(s1[k_]) + old.append(s2[k_]) + plot_scatter(new, old) + + +if __name__ == '__main__': + compare() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 1188a67..5594479 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ pysam>=0.9.0 multiprocessing statsmodels -tqdm \ No newline at end of file +tqdm +pybedtools +mpmath \ No newline at end of file diff --git a/setup.py b/setup.py index c73348e..d67593c 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ def main(): setup(name='CLAM', - version='1.1.4', + version='1.2.0-beta', description='CLIP-seq Analysis of Multi-mapped reads', author='Zijun Zhang', author_email='zj.z@ucla.edu', @@ -13,7 +13,7 @@ def main(): packages=['CLAM', 'CLAM.stats'], scripts=['bin/CLAM'], install_requires=[ - 'pysam>0.12,<0.2', + #'pysam>0.12,<0.2', 'numpy'] ) return From 15854316716d84fcab2d19761af1d75ba9d7e329 Mon Sep 17 00:00:00 2001 From: wkdeng Date: Mon, 17 Jun 2019 
17:31:09 -0400 Subject: [PATCH 6/9] Added peak annotator and genome data downloader --- CLAM/download_data.py | 71 +++++++ CLAM/peak_annotator.py | 145 +++++++++++++ bin/CLAM | 467 ++++++++++++++++++++++------------------- 3 files changed, 472 insertions(+), 211 deletions(-) create mode 100644 CLAM/download_data.py create mode 100644 CLAM/peak_annotator.py diff --git a/CLAM/download_data.py b/CLAM/download_data.py new file mode 100644 index 0000000..e510ce8 --- /dev/null +++ b/CLAM/download_data.py @@ -0,0 +1,71 @@ +import os +import sys +import subprocess +import peak_annotator + + +def parser(args): + """DOCSTRING + Args + Returns + """ + try: + genome = args.genome + download_genome(genome) + except KeyboardInterrupt(): + sys.exit(0) + + +def download_genome(genome): + curr_dir = os.path.abspath('.') + + admin = (os.getuid() == 0) + cmd = [] + home = os.environ['HOME'] + if admin: + profile = '/etc/profile' + else: + profile = '{home}/.bashrc'.format(home=home) + + if not os.path.isdir('{home}/.clam_data'.format(home=home)): + os.mkdir('{home}/.clam_data'.format(home=home)) + os.chdir('{home}/.clam_data'.format(home=home)) + + if 'CLAM_DAT' not in os.environ or not os.environ['CLAM_DAT'] == '{home}/.clam_data'.format(home=home): + cmd.append('echo "export CLAM_DAT=\'{clam_data}\'" >> {profile}'.format( + clam_data=os.path.abspath('.'), profile=profile)) + cmd.append('source {profile}'.format(profile=profile)) + os.environ['CLAM_DAT'] = os.path.abspath('.') + + if not check_genome_data(genome): + cmd.append('chmod -R 755 {home}/.clam_data'.format(home=home)) + cmd.append( + 'wget https://raw.githubusercontent.com/wkdeng/clam_data/master/{genome}.zip'.format(genome=genome)) + cmd.append('unzip -o {genome}.zip'.format(genome=genome)) + cmd.append('rm {genome}.zip'.format(genome=genome)) + for item in cmd: + subprocess.call(item, shell=True, executable='/bin/bash') + print 'Download finished' + os.chdir(curr_dir) + +def check_genome_data(genome): + if not os.path.isdir(os.environ['CLAM_DAT'] + '/' + genome): + return False + if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/3UTRs.bed'): + return False + if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/5UTRs.bed'): + return False + if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/cds.bed'): + return False + if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/exons.bed'): + return False + if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/introns.bed'): + return False + if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/proximal200_intron.bed'): + return False + if not os.path.exists(os.environ['CLAM_DAT'] + '/' + genome + '/proximal500_intron.bed'): + return False + return True + +if __name__ == '__main__': + download_genome('hg38') diff --git a/CLAM/peak_annotator.py b/CLAM/peak_annotator.py new file mode 100644 index 0000000..8d5bf36 --- /dev/null +++ b/CLAM/peak_annotator.py @@ -0,0 +1,145 @@ +import sys +import os +import pybedtools +import argparse as ap +import logging +import download_data + +''' +Assign peaks to genomic regions +Zijun Zhang +8.1.2018 +10.25.2018: wrapped to a function with document + +DWK +modified to output annotation file +6.12.2019 +''' + +# pylint: disable-msg=too-many-function-args +# pylint: disable-msg=unexpected-keyword-arg + + +def parser(args): + """DOCSTRING + Args + Returns + """ + try: + peak_in = args.peak_in + genome = args.genome + out_file = args.out_file + if 'CLAM_DAT' not in os.environ or not 
download_data.check_genome_data(genome):
+            print "Unable to locate CLAM data folder for genomic regions, will try to download."
+            print "Downloading..."
+            download_data.download_genome(genome)
+        genome_data = os.environ['CLAM_DAT']
+        intersect_gtf_regions(
+            peak_in, out_file, os.path.join(genome_data, genome))
+    except KeyboardInterrupt():
+        sys.exit(0)
+
+
+def intersect_gtf_regions(peak_fp, outfn, gtf_dir):
+    '''function: intersect_gtf_regions(peak_fp, outfn, gtf_dir)
+    Intersect a peak BED file with a list of genomic region annotations (e.g. start/stop codon, UTR, intron),
+    output the peak-region annotations.
+    :param peak_fp: filepath to a BED-format peak file
+    :param outfn: filepath to the output count file; it has to end with ".txt", and the annotation will be "NNN.annot.txt"
+
+    '''
+    # input arguments
+
+    # make pybedtools objects
+    print "Loading peaks..."
+    peaks = pybedtools.BedTool(peak_fp)
+    print "Peak file loaded."
+    print "Loading genome annotation..."
+    ref_dict = {
+        'exon': pybedtools.BedTool(os.path.join(gtf_dir, 'exons.bed')),
+        '3UTR': pybedtools.BedTool(os.path.join(gtf_dir, '3UTRs.bed')),
+        '5UTR': pybedtools.BedTool(os.path.join(gtf_dir, '5UTRs.bed')),
+        'cds': pybedtools.BedTool(os.path.join(gtf_dir, 'cds.bed')),
+        'intron': pybedtools.BedTool(os.path.join(gtf_dir, 'introns.bed')),
+        'proximal200': pybedtools.BedTool(os.path.join(gtf_dir, 'proximal200_intron.bed')),
+        'proximal500': pybedtools.BedTool(os.path.join(gtf_dir, 'proximal500_intron.bed'))
+    }
+    print "Genome annotation loaded."
+
+    # # process reference for use
+    target = {
+        "3UTR": ref_dict['3UTR'],
+        "5UTR": ref_dict['5UTR'],
+        "CDS": ref_dict['cds'],
+        "other_exon": ref_dict['exon']-ref_dict['3UTR']-ref_dict['5UTR']-ref_dict['cds'],
+        "px200_intron": ref_dict['proximal200'],
+        "px500_intron": ref_dict['proximal500'].subtract(ref_dict['proximal200']),
+        "distal_intron": ref_dict['intron'].subtract(ref_dict['exon']).subtract(ref_dict['proximal500'])
+    }
+    category_list = ['3UTR', '5UTR', 'CDS',
+                     'other_exon', "px200_intron", "px500_intron", "distal_intron"]
+    init = True
+
+    print "Intersecting peaks with genome annotation..."
+    for cat in category_list:
+        bed_arr = []
+        for interval in target[cat]:
+            bed_arr.append('\t'.join([str(x) for x in interval.fields]))
+            bed_arr[-1] = bed_arr[-1] + '\t' + cat
+        bed_arr = list(dict.fromkeys(bed_arr))
+        for i in range(len(bed_arr)):
+            bed_arr[i] = bed_arr[i].split('\t')
+        target[cat] = pybedtools.BedTool(bed_arr)
+
+        if init:
+            init = False
+            result_bed = peaks.intersect(target[cat], wa=True, wb=True)
+        else:
+            result_bed = result_bed.cat(peaks.intersect(
+                target[cat], wa=True, wb=True), postmerge=False)
+    result_bed = result_bed.sort()
+
+    print "Preparing output..."
+ result_bed.saveas(outfn + '_') + prepend = ['## Annotation peaks to genomic regions, all intersected genomic regions are presented.', + '## CLAM version: 1.2.0', + '## Column 1: Peak chromosome', + '## Column 2: Peak start', + '## Column 3: Peak end', + '## Column 4: Peak name', + '## Column 5: Peak score', + '## Column 6: Peak strand', + '## Column 7: Peak signal value', + '## Column 8: Peak pValue', + '## Column 9: Peak qValue', + '## Column 10: Point-source called for this peak', + '## Column 11: Genomic region chromosome', + '## Column 12: Genomic region start', + '## Column 13: Genomic region end', + '## Column 14: Gene ID', + '## Column 15: Quality score', + '## Column 16: Genomic region strand', + '## Column 17: Genomic region type'] + if os.path.exists(outfn): + os.remove(outfn) + for line in prepend: + cmd = 'echo "{prepend}" >> {outfn}'.format( + prepend=line, outfn=outfn) + os.system(cmd) + os.system('cat {outtmp} >> {outfn}'.format( + outtmp=outfn + '_', outfn=outfn)) + os.remove(outfn+'_') + print "DONE" + + +if __name__ == '__main__': + # peak_fp, genome, outfn = sys.argv[1], sys.argv[2], sys.argv[3] + os.chdir('/mnt/h/yi_lab/m6a/src/scripts/peakComposition') + peak_in, genome, out_file = 'narrow_peak.unique.bed', 'mm10', 'annotate_peak.bed' + if 'CLAM_DAT' not in os.environ or not download_data.check_genome_data(genome): + print "Unable to find CLAM data folder for genomic regions, please try to download it using download_genome command." + print "Downloading..." + download_data.download_genome(genome) + genome_data = os.environ['CLAM_DAT'] + intersect_gtf_regions( + peak_in, out_file, os.path.join(genome_data, genome)) diff --git a/bin/CLAM b/bin/CLAM index a7d0f7a..1deca1c 100755 --- a/bin/CLAM +++ b/bin/CLAM @@ -42,235 +42,280 @@ import datetime def main(): - """main entry for CLAM - This function setup the logging and handle the input options - Args - None - Returns - None - """ - logger = setup_logger() - argparser = get_arg_parser() - args = argparser.parse_args() - - subcommand = args.subcommand - - if subcommand == 'preprocessor': - from CLAM import preprocessor - preprocessor.parser( args ) - elif subcommand == 'realigner': - from CLAM import realigner - #print args - realigner.parser( args ) - - elif subcommand == 'peakcaller': - from CLAM import peakcaller - #print args - peakcaller.parser( args ) - elif subcommand == 'permutation_callpeak': - from CLAM import permutation_peakcaller - permutation_peakcaller.parser( args ) + """main entry for CLAM + This function setup the logging and handle the input options + Args + None + Returns + None + """ + logger = setup_logger() + argparser = get_arg_parser() + args = argparser.parse_args() + + subcommand = args.subcommand + + if subcommand == 'preprocessor': + from CLAM import preprocessor + preprocessor.parser(args) + elif subcommand == 'realigner': + from CLAM import realigner + #print args + realigner.parser(args) + + elif subcommand == 'peakcaller': + from CLAM import peakcaller + #print args + peakcaller.parser(args) + elif subcommand == 'permutation_callpeak': + from CLAM import permutation_peakcaller + permutation_peakcaller.parser(args) + elif subcommand == 'peak_annotator': + from CLAM import peak_annotator + peak_annotator.parser(args) + elif subcommand == 'data_downloader': + from CLAM import download_data + download_data.parser(args) def setup_logger(): - """Set up the logger for the whole pipeline - Args - None - Returns - logger: logging object - """ - # setup logger - logger = logging.getLogger('CLAM') - 
logger.setLevel(logging.DEBUG) - # create file handler which logs even debug messages - #fh = logging.FileHandler( - # 'log.CLAM.'+'-'.join(str(datetime.datetime.now()).replace(':','-').split()) + '.txt') - fh = logging.FileHandler('log.CLAM.txt') - fh.setLevel(logging.INFO) - # create console handler with a higher log level - ch = logging.StreamHandler() - ch.setLevel(logging.DEBUG) - # create formatter and add it to the handlers - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s -\n %(message)s') - fh.setFormatter(formatter) - ch.setFormatter(formatter) - # add the handlers to the logger - logger.addHandler(fh) - logger.addHandler(ch) - return logger + """Set up the logger for the whole pipeline + Args + None + Returns + logger: logging object + """ + # setup logger + logger = logging.getLogger('CLAM') + logger.setLevel(logging.DEBUG) + # create file handler which logs even debug messages + # fh = logging.FileHandler( + # 'log.CLAM.'+'-'.join(str(datetime.datetime.now()).replace(':','-').split()) + '.txt') + fh = logging.FileHandler('log.CLAM.txt') + fh.setLevel(logging.INFO) + # create console handler with a higher log level + ch = logging.StreamHandler() + ch.setLevel(logging.DEBUG) + # create formatter and add it to the handlers + formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s -\n %(message)s') + fh.setFormatter(formatter) + ch.setFormatter(formatter) + # add the handlers to the logger + logger.addHandler(fh) + logger.addHandler(ch) + return logger def get_arg_parser(): - """DOCSTRING - Args - Returns - """ - description = "%(prog)s -- CLip-seq Analysis of Multi-mapped reads" - epilog = "For command line options of each sub-command, type: %(prog)s COMMAND -h" - - argparser = ap.ArgumentParser(description=description, epilog=epilog) - argparser.add_argument("--version", action="version", version="%(prog)s "+__version__) - - subparsers = argparser.add_subparsers( dest="subcommand" ) - - # preprocessing - add_preprocessor_parser(subparsers) - - # realigner - add_realigner_parser(subparsers) - - # peakcaller - add_peakcaller_parser(subparsers) - - # permutation_callpeak - add_permutation_callpeak_parser(subparsers) - - return argparser + """DOCSTRING + Args + Returns + """ + description = "%(prog)s -- CLip-seq Analysis of Multi-mapped reads" + epilog = "For command line options of each sub-command, type: %(prog)s COMMAND -h" + argparser = ap.ArgumentParser(description=description, epilog=epilog) + argparser.add_argument("--version", action="version", + version="%(prog)s "+__version__) -def add_preprocessor_parser( subparsers ): - ag_prep = subparsers.add_parser("preprocessor", help="CLAM Preprocessor: tag read alignments to specific locus") - - # input/output - ag_prep.add_argument("-i", "--input", dest="in_bam", type=str, required=True, - help="Input bam file") - - ag_prep.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, - help="Output folder") - - # processing - ag_prep.add_argument("--read-tagger-method", dest="tag_method", type=str, - choices= ('median', 'start'), default='median', - help="Read tagger method, 'median' for read center, 'start' for read start site; default: median") - - ag_prep.add_argument("--max-multihits", dest="max_hits", type=int, default=100, - help="The maximum hits allowed for multi-mapped reads; default: 100") - - ag_prep.add_argument("--max-tags", dest="max_tags", type=int, default=-1, - help="The maximum identical tags at given location; default: -1, no filter") - - return + subparsers = 
argparser.add_subparsers(dest="subcommand") + # preprocessing + add_preprocessor_parser(subparsers) -def add_realigner_parser( subparsers ): - ag_realigner = subparsers.add_parser("realigner", help="CLAM Realigner: realign multi-mapped reads using expectation-maximization") - - # input/output - ag_realigner.add_argument("-i", "--input", dest="in_bam", type=str, required=True, - help="Input bam file") - - ag_realigner.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, - help="Output folder") - - # processing - ag_realigner.add_argument("--read-tagger-method", dest="tag_method", type=str, - choices= ('median', 'start'), default='median', - help="Read tagger method, 'median' for read center, 'start' for read start site; default: median") - - ag_realigner.add_argument("--max-multihits", dest="max_hits", type=int, default=100, - help="The maximum hits allowed for multi-mapped reads; default: 100") + # realigner + add_realigner_parser(subparsers) - ag_realigner.add_argument("--max-tags", dest="max_tags", type=int, default=-1, - help="The maximum identical tags at given location; default: -1, no filter") + # peakcaller + add_peakcaller_parser(subparsers) - ag_realigner.add_argument("--retag", dest="retag", default=False, action='store_true', - help="Retag the bam regardless when turned on; invalid when no previous files found") - - # realign - ag_realigner.add_argument("--winsize", dest="winsize", type=int, default=50, - help="Local window size for em computations; default: 50") + # permutation_callpeak + add_permutation_callpeak_parser(subparsers) - ag_realigner.add_argument("--unstranded", dest="unstranded", default=False, action="store_true", - help="Unstranded alignments if turned on") - - return + # peak_annotator + add_peak_annotator_parser(subparsers) + # data_downloader + add_data_downloader_parser(subparsers) -def add_peakcaller_parser( subparsers ): - ag_peakcaller = subparsers.add_parser("peakcaller", help="CLAM Peakcaller: negative binomial model-based peak calling combining unique- and multi-reads") - - # input/output - ag_peakcaller.add_argument("-i", "--input", dest="in_bam", nargs='+', type=str, required=True, - help="Filepaths for IP bam files, e.g ubam1,ubam2 mbam1,mbam2") - - ag_peakcaller.add_argument("-c", "--control-dir", dest="con_bam", nargs='+', type=str, required=True, - help="Filepaths for control bam files") - - ag_peakcaller.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, - help="Output folder") - - ag_peakcaller.add_argument("--gtf", dest="gtf_fp", type=str, required=True, - help="GTF filepath") - - # processing - ag_peakcaller.add_argument("-p", "--nthread", dest="nthread", type=int, default=8, - help="Number of threads; default: 8") - - ag_peakcaller.add_argument("-u", "--unique-only", dest="unique_only", default=False, action='store_true', - help="Call peaks using only unique-mapped reads when turned on") - - ag_peakcaller.add_argument("--pool", dest="pooling", default=False, action="store_true", - help="Pool the read counts if provided with multiple replicates; default: False") - - ag_peakcaller.add_argument("--min-clip-cov", dest="min_clip_cov", type=int, default=4, - help="Minimum CLIP reads per gene to perform analysis; default: 4") - - # callpeak - ag_peakcaller.add_argument("--qval-cutoff", dest="qval_cutoff", type=float, default=0.05, - help="Cutoff for adjusted p-values; default: 0.05") - - ag_peakcaller.add_argument("--fold-change", dest="fold_change", nargs='+', type=float, default=[2.], - help="Threasholds 
for signal range (fold change w/ control; tag count w/o control); default: 2-inf") - - ag_peakcaller.add_argument("-b", "--binsize", dest="binsize", type=int, default=50, - help="Bin size for calling peaks; default: 50") - - ag_peakcaller.add_argument("--unstranded", dest="unstranded", action="store_true", default=False, - help="Unstranded alignments if turned on") - - return + return argparser +def add_preprocessor_parser(subparsers): + ag_prep = subparsers.add_parser( + "preprocessor", help="CLAM Preprocessor: tag read alignments to specific locus") -def add_permutation_callpeak_parser( subparsers ): - ag_peakcaller = subparsers.add_parser("permutation_callpeak", help="CLAM permutation peakcaller: call peaks using permutation (as in v1.0.0)") - - # input/output - ag_peakcaller.add_argument("-i", "--input", dest="in_bam", nargs='+', type=str, required=True, - help="Filepaths for CLIP bam, e.g ubam mbam") - - ag_peakcaller.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, - help="Output folder") - - ag_peakcaller.add_argument("--gtf", dest="gtf_fp", type=str, required=True, - help="GTF filepath") - - # processing - ag_peakcaller.add_argument("-p", "--nthread", dest="nthread", type=int, default=8, - help="Number of threads; default: 8") - - ag_peakcaller.add_argument("--random-state", dest="random_state", type=int, default=777, - help="Seed for random number generator in permutations; default: 777") - - # callpeak - ag_peakcaller.add_argument("--qval-cutoff", dest="qval_cutoff", type=float, default=0.005, - help="Cutoff for adjusted p-values; default: 0.005") - - ag_peakcaller.add_argument("--merge-size", dest="merge_size", type=int, default=50, - help="Select best peak within this size; default: 50") - - ag_peakcaller.add_argument("--extend", dest="extend", type=int, default=50, - help="Extend peak to this size if less than it; default: 50") - - ag_peakcaller.add_argument("--unstranded", dest="unstranded", action="store_true", default=False, - help="Unstranded alignments if turned on") - - return + # input/output + ag_prep.add_argument("-i", "--input", dest="in_bam", type=str, required=True, + help="Input bam file") + + ag_prep.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, + help="Output folder") + + # processing + ag_prep.add_argument("--read-tagger-method", dest="tag_method", type=str, + choices=('median', 'start'), default='median', + help="Read tagger method, 'median' for read center, 'start' for read start site; default: median") + + ag_prep.add_argument("--max-multihits", dest="max_hits", type=int, default=100, + help="The maximum hits allowed for multi-mapped reads; default: 100") + + ag_prep.add_argument("--max-tags", dest="max_tags", type=int, default=-1, + help="The maximum identical tags at given location; default: -1, no filter") + + return + + +def add_realigner_parser(subparsers): + ag_realigner = subparsers.add_parser( + "realigner", help="CLAM Realigner: realign multi-mapped reads using expectation-maximization") + + # input/output + ag_realigner.add_argument("-i", "--input", dest="in_bam", type=str, required=True, + help="Input bam file") + + ag_realigner.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, + help="Output folder") + + # processing + ag_realigner.add_argument("--read-tagger-method", dest="tag_method", type=str, + choices=('median', 'start'), default='median', + help="Read tagger method, 'median' for read center, 'start' for read start site; default: median") + + 
ag_realigner.add_argument("--max-multihits", dest="max_hits", type=int, default=100, + help="The maximum hits allowed for multi-mapped reads; default: 100") + + ag_realigner.add_argument("--max-tags", dest="max_tags", type=int, default=-1, + help="The maximum identical tags at given location; default: -1, no filter") + + ag_realigner.add_argument("--retag", dest="retag", default=False, action='store_true', + help="Retag the bam regardless when turned on; invalid when no previous files found") + + # realign + ag_realigner.add_argument("--winsize", dest="winsize", type=int, default=50, + help="Local window size for em computations; default: 50") + + ag_realigner.add_argument("--unstranded", dest="unstranded", default=False, action="store_true", + help="Unstranded alignments if turned on") + + return + + +def add_peakcaller_parser(subparsers): + ag_peakcaller = subparsers.add_parser( + "peakcaller", help="CLAM Peakcaller: negative binomial model-based peak calling combining unique- and multi-reads") + + # input/output + ag_peakcaller.add_argument("-i", "--input", dest="in_bam", nargs='+', type=str, required=True, + help="Filepaths for IP bam files, e.g ubam1,ubam2 mbam1,mbam2") + + ag_peakcaller.add_argument("-c", "--control-dir", dest="con_bam", nargs='+', type=str, required=True, + help="Filepaths for control bam files") + + ag_peakcaller.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, + help="Output folder") + + ag_peakcaller.add_argument("--gtf", dest="gtf_fp", type=str, required=True, + help="GTF filepath") + + # processing + ag_peakcaller.add_argument("-p", "--nthread", dest="nthread", type=int, default=8, + help="Number of threads; default: 8") + + ag_peakcaller.add_argument("-u", "--unique-only", dest="unique_only", default=False, action='store_true', + help="Call peaks using only unique-mapped reads when turned on") + + ag_peakcaller.add_argument("--pool", dest="pooling", default=False, action="store_true", + help="Pool the read counts if provided with multiple replicates; default: False") + + ag_peakcaller.add_argument("--min-clip-cov", dest="min_clip_cov", type=int, default=4, + help="Minimum CLIP reads per gene to perform analysis; default: 4") + + # callpeak + ag_peakcaller.add_argument("--qval-cutoff", dest="qval_cutoff", type=float, default=0.05, + help="Cutoff for adjusted p-values; default: 0.05") + + ag_peakcaller.add_argument("--fold-change", dest="fold_change", nargs='+', type=float, default=[2.], + help="Threasholds for signal range (fold change w/ control; tag count w/o control); default: 2-inf") + + ag_peakcaller.add_argument("-b", "--binsize", dest="binsize", type=int, default=50, + help="Bin size for calling peaks; default: 50") + + ag_peakcaller.add_argument("--unstranded", dest="unstranded", action="store_true", default=False, + help="Unstranded alignments if turned on") + + return + + +def add_permutation_callpeak_parser(subparsers): + ag_peakcaller = subparsers.add_parser( + "permutation_callpeak", help="CLAM permutation peakcaller: call peaks using permutation (as in v1.0.0)") + + # input/output + ag_peakcaller.add_argument("-i", "--input", dest="in_bam", nargs='+', type=str, required=True, + help="Filepaths for CLIP bam, e.g ubam mbam") + + ag_peakcaller.add_argument("-o", "--out-dir", dest="out_dir", type=str, required=True, + help="Output folder") + + ag_peakcaller.add_argument("--gtf", dest="gtf_fp", type=str, required=True, + help="GTF filepath") + + # processing + ag_peakcaller.add_argument("-p", "--nthread", dest="nthread", 
type=int, default=8,
+                               help="Number of threads; default: 8")
+
+    ag_peakcaller.add_argument("--random-state", dest="random_state", type=int, default=777,
+                               help="Seed for random number generator in permutations; default: 777")
+
+    # callpeak
+    ag_peakcaller.add_argument("--qval-cutoff", dest="qval_cutoff", type=float, default=0.005,
+                               help="Cutoff for adjusted p-values; default: 0.005")
+
+    ag_peakcaller.add_argument("--merge-size", dest="merge_size", type=int, default=50,
+                               help="Select best peak within this size; default: 50")
+
+    ag_peakcaller.add_argument("--extend", dest="extend", type=int, default=50,
+                               help="Extend peak to this size if less than it; default: 50")
+
+    ag_peakcaller.add_argument("--unstranded", dest="unstranded", action="store_true", default=False,
+                               help="Unstranded alignments if turned on")
+
+    return
+
+
+def add_peak_annotator_parser(subparsers):
+    ag_anno = subparsers.add_parser(
+        "peak_annotator", help="CLAM peak annotator: assign peaks to genomic regions")
+
+    # input/output
+    ag_anno.add_argument("-i", "--input", dest="peak_in", type=str, required=True,
+                         help="Input peak file")
+
+    ag_anno.add_argument("-g", "--genome", dest="genome", choices=('hg19', 'hg38', 'mm10'), type=str, required=True,
+                         help="Genome version (hg19, hg38, mm10 available)")
+
+    ag_anno.add_argument("-o", "--out-file", dest="out_file", type=str, required=True,
+                         help="Output file")
+
+    return
+
+
+def add_data_downloader_parser(subparsers):
+    ag_down = subparsers.add_parser(
+        "data_downloader", help="CLAM data downloader: download data of genomic regions")
+
+    # input/output
+    ag_down.add_argument("-g", "--genome", dest="genome", choices=('hg19', 'hg38', 'mm10'), type=str, required=True,
+                         help="Genome version (hg19, hg38, mm10 available)")
+
+    return
 
 if __name__ == '__main__':
-	try:
-		main()
-	except KeyboardInterrupt:
-		sys.stderr.write("User interrupted; program terminated.")
-		sys.exit(0)
\ No newline at end of file
+    try:
+        main()
+    except KeyboardInterrupt:
+        sys.stderr.write("User interrupted; program terminated.")
+        sys.exit(0)

From d4ae4dbc616bda6af32f8449df648b4ca938f9d8 Mon Sep 17 00:00:00 2001
From: zj-zhang
Date: Fri, 28 Jun 2019 03:06:29 -0400
Subject: [PATCH 7/9] update python3 compatible

---
 CLAM/peakcaller.py    | 12 +++++++-----
 CLAM/realigner.py     | 33 ++++++++++++++++++++++-----------
 CLAM/stats/ztnb_em.py |  2 +-
 3 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/CLAM/peakcaller.py b/CLAM/peakcaller.py
index 8c0680c..98f71f4 100755
--- a/CLAM/peakcaller.py
+++ b/CLAM/peakcaller.py
@@ -225,7 +225,7 @@ def _neg_loglik_constrain(par, data):
 		for i in range(intv_bin_ip.shape[0]):
 			height = ztnb_em.collapse_data(np.floor(intv_bin_ip[i,]))
 			height[0] = 0
-			if np.sum(height.values())>0:
+			if np.sum([x for x in height.values()])>0:
 				try:
 					ll, mu, alpha = ztnb_em.EM_estim_params(height, max_iter=10, verbose=False)
 				except:
@@ -401,11 +401,13 @@ def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, binsiz
 	return BED
 
 
-def _child_peak_caller( (ip_bam_list, con_bam_list, child_gene_list, gene_annot, unique_only, with_control, unstranded, binsize, qval_cutoff, fold_change, min_clip_cov, pooling, norm_lib, tot_count_dict) ):
+#def _child_peak_caller(ip_bam_list, con_bam_list, child_gene_list, gene_annot, unique_only, with_control, unstranded, binsize, qval_cutoff, fold_change, min_clip_cov ):
+def _child_peak_caller(args):
 	"""DOCSTRING
 	Args
 	Returns
 	"""
+	ip_bam_list, con_bam_list, child_gene_list, gene_annot, unique_only, with_control, unstranded, binsize,
qval_cutoff, fold_change, min_clip_cov, pooling, norm_lib, tot_count_dict = args # open file handler for child process bam_dict = make_bam_handler_dict(ip_bam_list, con_bam_list) @@ -582,7 +584,7 @@ def peakcaller(ip_bam_list, gtf_fp, con_bam_list=None, nthread=8, else: # multi-threading on subset of genes logger.info('multi-threading') - gene_list = gene_annot.keys() + gene_list = list(gene_annot.keys()) child_gene_list = [x for x in chunkify(gene_list, nthread)] pool=Pool(processes=nthread) BED_list = pool.map( @@ -606,7 +608,7 @@ def chunkify(a, n): the chunkified index """ k, m = len(a) / n, len(a) % n - return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in xrange(n)) + return (a[ int(i * k + min(i, m)) :int((i + 1) * k + min(i + 1, m))] for i in range(n)) def parser(args): @@ -644,7 +646,7 @@ def parser(args): pooling=pooling) logger.info('end') - except KeyboardInterrupt(): + except KeyboardInterrupt: sys.exit(0) return diff --git a/CLAM/realigner.py b/CLAM/realigner.py index c665bed..09cf8cf 100755 --- a/CLAM/realigner.py +++ b/CLAM/realigner.py @@ -25,7 +25,7 @@ import sys import pysam import numpy as np -from collections import defaultdict +from collections import defaultdict, deque #from tqdm import tqdm import logging import datetime @@ -205,7 +205,7 @@ def build_read_cluster(alignment, chr_dict, location_to_reads, genomic_cluster_d chr_strand = chrom+':'+strand idx = bisect.bisect_right(genomic_cluster_dict[chr_strand], site) if not idx%2: - print alignment + print(alignment) raise Exception('%s falls out of region %s'%(alignment.qname, chr_strand+':'+str(site)) ) start = genomic_cluster_dict[chr_strand][idx-1] - winsize start = 1 if start<1 else start @@ -216,11 +216,12 @@ def build_read_cluster(alignment, chr_dict, location_to_reads, genomic_cluster_d ## fetch the reads cluster_name = ':'.join([chrom, strand, str(genomic_cluster_dict[chr_strand][idx-1]), str(genomic_cluster_dict[chr_strand][idx])]) if not cluster_name in location_to_reads: - raise Exception("cannot find cluster %s in `location_to_reads`"%cluster_name) + raise Exception("cannot find cluster '%s' associated with read '%s' in `location_to_reads` of len %i"%(cluster_name, alignment.qname, len(location_to_reads))) mread_list = location_to_reads[cluster_name] - del location_to_reads[cluster_name] + #print(alignment, cluster_name) for x in mread_list: this_mread_dict_set[x.qname].add(x) + del location_to_reads[cluster_name] ## find other mreads in this cluster for read_x_qname in this_mread_dict_set: @@ -228,7 +229,7 @@ def build_read_cluster(alignment, chr_dict, location_to_reads, genomic_cluster_d discarded_mread_alignments.extend( [ x for x in list(this_mread_dict_set[read_x_qname]) ]) else: this_mread_dict[read_x_qname] = list(this_mread_dict_set[read_x_qname])[0] - + return genomic_cluster, this_mread_dict, discarded_mread_alignments @@ -275,8 +276,13 @@ def construct_subgraph(location_to_reads, read_qname, mread_dict, processed_mrea ## record those discarded alignments/reads ## note: we mark discarded_mread as processed as well, ## so as not to create a bias to less clustered regions. 
- _ = map(processed_mread_alignments.add, discarded_mread_list) - _ = map(processed_mreads.add, [x.qname for x in discarded_mread_list]) + # THIS IS PYTHON3 INCOMPATIBLE + #_ = map(processed_mread_alignments.add, discarded_mread_list) + #_ = map(processed_mreads.add, [x.qname for x in discarded_mread_list]) + for x in discarded_mread_list: + processed_mread_alignments.add(x) + for x in discarded_mread_list: + processed_mreads.add(x.qname) if genomic_cluster is None: # this cluster is invald (only double-mappers) continue @@ -291,7 +297,10 @@ def construct_subgraph(location_to_reads, read_qname, mread_dict, processed_mrea ## then add new alignments(edges) to generate connected nodes ## in the next iteration - _ = map(processed_mread_alignments.add, this_mread_dict.values()) + # THIS IS PYTHON3 INCOMPATIBLE + #_ = map(processed_mread_alignments.add, this_mread_dict.values()) + for x in list(this_mread_dict.values()): + processed_mread_alignments.add(x) for read_x_qname in this_mread_dict: if read_x_qname in processed_mreads: continue @@ -300,7 +309,9 @@ def construct_subgraph(location_to_reads, read_qname, mread_dict, processed_mrea ## .. and record to processed reads since we have generated ## the nodes for them - _ = map(processed_mreads.add, this_mread_dict.keys()) + #_ = map(processed_mreads.add, this_mread_dict.keys()) # this is python3 incompatible + for x in list(this_mread_dict.keys()): + processed_mreads.add(x) # if no more connected nodes can be found, break loop if len(next_read_aln_list)==0: @@ -451,7 +462,7 @@ def realigner(in_bam, out_dir, max_hits=100, max_tags=-1, read_tagger_method='me genomic_cluster_dict, winsize=winsize, unstranded=unstranded) subgraph = set() for read in read_to_locations: - _ = map(subgraph.add, read_to_locations[read].keys()) + _ = deque(map(subgraph.add, read_to_locations[read].keys())) subgraph = list(subgraph) #if len(subgraph)==1 and len(read_to_locations)>10: # raise Exception('Incorrect mread assigned to one location') @@ -516,7 +527,7 @@ def parser(args): winsize=winsize, unstranded=unstranded, retag=retag, strandness=strandness) logger.info('end') - except KeyboardInterrupt(): + except KeyboardInterrupt: sys.exit(0) return diff --git a/CLAM/stats/ztnb_em.py b/CLAM/stats/ztnb_em.py index c3135a6..85001c8 100755 --- a/CLAM/stats/ztnb_em.py +++ b/CLAM/stats/ztnb_em.py @@ -89,7 +89,7 @@ def EM_estim_params(height, tol = 10**-4, max_iter = 1000, verbose = False, mu = raise ZeroDivisionError('invalid loglik function value') error = abs((score - prev_score)/score) if verbose: - print 'Iter ' + str(i) + ': eps = ' + str(error) + '; mu = ' + str(mu) + '; alpha = ' + str(alpha) + print('Iter ' + str(i) + ': eps = ' + str(error) + '; mu = ' + str(mu) + '; alpha = ' + str(alpha)) if(error < tol): break prev_score = score From e7a82ed4ebda2bbe509f0fd310f27f24bb6acf6f Mon Sep 17 00:00:00 2001 From: zj-zhang Date: Wed, 10 Jul 2019 13:39:54 -0400 Subject: [PATCH 8/9] fix foldchange formatting in py3 --- CLAM/peakcaller.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CLAM/peakcaller.py b/CLAM/peakcaller.py index 98f71f4..0e9d37a 100755 --- a/CLAM/peakcaller.py +++ b/CLAM/peakcaller.py @@ -185,13 +185,13 @@ def _neg_loglik_constrain(par, data): alpha_ip_vec = np.empty(intv_bin_ip.shape[0]) alpha_con_vec = np.empty(intv_bin_con.shape[0]) if norm_lib and tot_count_dict is not None: - try: + if 'mbam.ip' in tot_count_dict: ip_sum = np.array(tot_count_dict['ubam.ip']) + np.array(tot_count_dict['mbam.ip'] ) - except KeyError: + else: 
ip_sum = np.array(tot_count_dict['ubam.ip']) - try: + if 'mbam.con' in tot_count_dict: con_sum = np.array(tot_count_dict['ubam.con']) + np.array(tot_count_dict['mbam.con']) - except KeyError: + else: con_sum = np.array(tot_count_dict['ubam.con']) else: ip_sum = np.apply_along_axis(np.sum, 1, intv_bin_ip) @@ -375,7 +375,7 @@ def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, binsiz ## "narrowPeak" format from ## https://genome.ucsc.edu/FAQ/FAQformat.html#format12 ## chr start end name 1000 strand signalValue pVal qVal peak - narrowPeak_formatter = "%s\t%i\t%i\t%s\t1000\t%s\t%s\t%.3e\t%.3e\t.\n" + narrowPeak_formatter = "%s\t%i\t%i\t%s\t1000\t%s\t%.3f\t%.3e\t%.3e\t.\n" BED = '' if len(fold_change)==1: lb = np.log(fold_change[0]) if with_control else fold_change[0] From e55fe4f11a9a95eeae76a2d705917e439cab6528 Mon Sep 17 00:00:00 2001 From: zj-zhang Date: Tue, 23 Jul 2019 19:31:28 -0400 Subject: [PATCH 9/9] merged for py3 compatible --- CLAM/config.py | 2 +- CLAM/download_data.py | 3 +-- CLAM/peak_annotator.py | 26 +++++++++--------- CLAM/peakcaller.py | 4 ++- bin/CLAM | 60 +++++++++++++++++++++--------------------- setup.py | 14 +++++++--- 6 files changed, 58 insertions(+), 51 deletions(-) diff --git a/CLAM/config.py b/CLAM/config.py index dcc32a5..711e924 100644 --- a/CLAM/config.py +++ b/CLAM/config.py @@ -3,6 +3,6 @@ """ General Version and other info """ -__version__ = '1.2.0-beta' +__version__ = '1.2.0' __author__ = 'Zijun Zhang' __email__ = 'zj.z@ucla.edu' \ No newline at end of file diff --git a/CLAM/download_data.py b/CLAM/download_data.py index e510ce8..3f8cbb8 100644 --- a/CLAM/download_data.py +++ b/CLAM/download_data.py @@ -1,7 +1,6 @@ import os import sys import subprocess -import peak_annotator def parser(args): @@ -45,7 +44,7 @@ def download_genome(genome): cmd.append('rm {genome}.zip'.format(genome=genome)) for item in cmd: subprocess.call(item, shell=True, executable='/bin/bash') - print 'Download finished' + print('Download finished') os.chdir(curr_dir) def check_genome_data(genome): diff --git a/CLAM/peak_annotator.py b/CLAM/peak_annotator.py index 8d5bf36..3907b23 100644 --- a/CLAM/peak_annotator.py +++ b/CLAM/peak_annotator.py @@ -3,7 +3,7 @@ import pybedtools import argparse as ap import logging -import download_data +from . import download_data, config ''' Assign peaks to genomic regions @@ -30,8 +30,8 @@ def parser(args): genome = args.genome out_file = args.out_file if 'CLAM_DAT' not in os.environ or not download_data.check_genome_data(genome): - print "Unable to locate CLAM data folder for genomic regions, will try to download." - print "Downloading..." + print("Unable to locate CLAM data folder for genomic regions, will try to download.") + print("Downloading...") download_data.download_genome(genome) genome_data = os.environ['CLAM_DAT'] intersect_gtf_regions( @@ -51,10 +51,10 @@ def intersect_gtf_regions(peak_fp, outfn, gtf_dir): # input arguments # make pybedtools objects - print "Loading peaks..." + print("Loading peaks...") peaks = pybedtools.BedTool(peak_fp) - print "Peak file loaded." - print "Loading genome annotation..." 
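
One more Python 3 change surfaces in the peak_annotator.py hunk above: implicit relative imports are gone, so a module inside the CLAM package can no longer reach a sibling with a bare import download_data. A short sketch of the assumed layout and the two import styles:

    # CLAM/
    #     __init__.py
    #     config.py
    #     download_data.py
    #     peak_annotator.py
    #
    # inside CLAM/peak_annotator.py:
    from . import download_data, config   # Python 3: explicit relative import
    # import download_data                # Python 2 only; ImportError on py3

    # Caveat: explicit relative imports only work when the module runs as part
    # of the package (python -m CLAM.peak_annotator, or via the bin/CLAM entry
    # script), not when executed as a loose file.
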
+ print("Peak file loaded.") + print("Loading genome annotation...") ref_dict = { 'exon': pybedtools.BedTool(os.path.join(gtf_dir, 'exons.bed')), '3UTR': pybedtools.BedTool(os.path.join(gtf_dir, '3UTRs.bed')), @@ -64,7 +64,7 @@ def intersect_gtf_regions(peak_fp, outfn, gtf_dir): 'proximal200': pybedtools.BedTool(os.path.join(gtf_dir, 'proximal200_intron.bed')), 'proximal500': pybedtools.BedTool(os.path.join(gtf_dir, 'proximal500_intron.bed')) } - print "Genome annotation loaded." + print("Genome annotation loaded.") # # process reference for use target = { @@ -80,7 +80,7 @@ def intersect_gtf_regions(peak_fp, outfn, gtf_dir): 'other_exon', "px200_intron", "px500_intron", "distal_intron"] init = True - print "Intersecting peaks with genome annotation..." + print("Intersecting peaks with genome annotation...") for cat in category_list: bed_arr = [] for interval in target[cat]: @@ -99,10 +99,10 @@ def intersect_gtf_regions(peak_fp, outfn, gtf_dir): target[cat], wa=True, wb=True), postmerge=False) result_bed = result_bed.sort() - print "Preparing output..." + print("Preparing output...") result_bed.saveas(outfn + '_') prepend = ['## Annotation peaks to genomic regions, all intersected genomic regions are presented.', - '## CLAM version: 1.2.0', + '## CLAM version: %s'%config.__version__, '## Column 1: Peak chromosome', '## Column 2: Peak start', '## Column 3: Peak end', @@ -129,7 +129,7 @@ def intersect_gtf_regions(peak_fp, outfn, gtf_dir): os.system('cat {outtmp} >> {outfn}'.format( outtmp=outfn + '_', outfn=outfn)) os.remove(outfn+'_') - print "DONE" + print("DONE") if __name__ == '__main__': @@ -137,8 +137,8 @@ def intersect_gtf_regions(peak_fp, outfn, gtf_dir): os.chdir('/mnt/h/yi_lab/m6a/src/scripts/peakComposition') peak_in, genome, out_file = 'narrow_peak.unique.bed', 'mm10', 'annotate_peak.bed' if 'CLAM_DAT' not in os.environ or not download_data.check_genome_data(genome): - print "Unable to find CLAM data folder for genomic regions, please try to download it using download_genome command." - print "Downloading..." 
+ print("Unable to find CLAM data folder for genomic regions, please try to download it using download_genome command.") + print("Downloading...") download_data.download_genome(genome) genome_data = os.environ['CLAM_DAT'] intersect_gtf_regions( diff --git a/CLAM/peakcaller.py b/CLAM/peakcaller.py index 0e9d37a..d512b7d 100755 --- a/CLAM/peakcaller.py +++ b/CLAM/peakcaller.py @@ -375,7 +375,7 @@ def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, binsiz ## "narrowPeak" format from ## https://genome.ucsc.edu/FAQ/FAQformat.html#format12 ## chr start end name 1000 strand signalValue pVal qVal peak - narrowPeak_formatter = "%s\t%i\t%i\t%s\t1000\t%s\t%.3f\t%.3e\t%.3e\t.\n" + narrowPeak_formatter = "%s\t%i\t%i\t%s\t1000\t%s\t%s\t%.3e\t%.3e\t.\n" BED = '' if len(fold_change)==1: lb = np.log(fold_change[0]) if with_control else fold_change[0] @@ -397,6 +397,8 @@ def call_gene_peak(bam_dict, gene, unique_only=False, with_control=False, binsiz strand = gene[3] peak_num += 1 peak_name = gene[4] + '-%i'%peak_num + if with_control: + signal = "%.3f"%(float(signal)) BED += narrowPeak_formatter % (chr, binstart, binend, peak_name, strand, signal, pval, qval) return BED diff --git a/bin/CLAM b/bin/CLAM index d93ce03..ef9fe85 100755 --- a/bin/CLAM +++ b/bin/CLAM @@ -65,17 +65,17 @@ def main(): #print args peakcaller.parser( args ) - elif subcommand == 'permutation_callpeak': + elif subcommand == 'permutation_callpeak': from CLAM import permutation_peakcaller permutation_peakcaller.parser( args ) - - elif subcommand == 'peak_annotator': - from CLAM import peak_annotator - peak_annotator.parser(args) - - elif subcommand == 'data_downloader': - from CLAM import download_data - download_data.parser(args) + + elif subcommand == 'peak_annotator': + from CLAM import peak_annotator + peak_annotator.parser(args) + + elif subcommand == 'data_downloader': + from CLAM import download_data + download_data.parser(args) def setup_logger(): @@ -131,12 +131,12 @@ def get_arg_parser(): # permutation_callpeak add_permutation_callpeak_parser(subparsers) - # peak_annotator - add_peak_annotator_parser(subparsers) + # peak_annotator + add_peak_annotator_parser(subparsers) + + # data_downloader + add_data_downloader_parser(subparsers) - # data_downloader - add_data_downloader_parser(subparsers) - return argparser @@ -293,31 +293,31 @@ def add_permutation_callpeak_parser( subparsers ): def add_peak_annotator_parser(subparsers): - ag_anno = subparsers.add_parser( - "peak_annotator", help="CLAM peak annotator: assign peaks to genomic regions") + ag_anno = subparsers.add_parser( + "peak_annotator", help="CLAM peak annotator: assign peaks to genomic regions") - # input/output - ag_anno.add_argument("-i", "--input", dest="peak_in", type=str, required=True, - help="Input peak file") + # input/output + ag_anno.add_argument("-i", "--input", dest="peak_in", type=str, required=True, + help="Input peak file") - ag_anno.add_argument("-g", "--genome", dest="genome", choices=('hg19', 'hg38', 'mm10'), type=str, required=True, - help="Genome version (hg19, hg38, mm10 avaiable)") + ag_anno.add_argument("-g", "--genome", dest="genome", choices=('hg19', 'hg38', 'mm10'), type=str, required=True, + help="Genome version (hg19, hg38, mm10 avaiable)") - ag_anno.add_argument("-o", "--out-file", dest="out_file", type=str, required=True, - help="Output file") + ag_anno.add_argument("-o", "--out-file", dest="out_file", type=str, required=True, + help="Output file") - return + return def add_data_downloader_parser(subparsers): - ag_down = 
diff --git a/setup.py b/setup.py
index d67593c..5c3ff2f 100755
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,11 @@
 #!/usr/bin/env python
 
 from setuptools import setup
-
+from CLAM.config import __version__
 
 def main():
 	setup(name='CLAM',
-		version='1.2.0-beta',
+		version=__version__,
 		description='CLIP-seq Analysis of Multi-mapped reads',
 		author='Zijun Zhang',
 		author_email='zj.z@ucla.edu',
 		url='https://github.com/Xinglab/CLAM',
@@ -13,8 +13,13 @@ def main():
 		packages=['CLAM', 'CLAM.stats'],
 		scripts=['bin/CLAM'],
 		install_requires=[
-			#'pysam>0.12,<0.2',
-			'numpy']
+			'scipy',
+			'pysam',
+			'numpy',
+			'statsmodels',
+			'tqdm',
+			'pybedtools',
+			'mpmath']
 	)
 	return
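
A closing note on the setup.py change: importing __version__ from CLAM.config single-sources the version string, and it is safe here because config.py defines only plain strings, so setup.py can import it before any of the heavy dependencies (pysam, pybedtools) are installed. Two general gotchas for install_requires lists are also worth keeping in mind: adjacent string literals concatenate silently ('pybedtools' 'mpmath' would become the nonexistent requirement 'pybedtoolsmpmath'), and standard-library modules such as multiprocessing must not be listed, since pip would try to fetch the long-dead Python 2 backport. A condensed sketch of the pattern as adopted:

    # CLAM/config.py -- plain metadata only, safe to import at build time
    __version__ = '1.2.0'

    # setup.py
    from setuptools import setup
    from CLAM.config import __version__

    setup(
        name='CLAM',
        version=__version__,
        packages=['CLAM', 'CLAM.stats'],
        scripts=['bin/CLAM'],
        install_requires=['scipy', 'pysam', 'numpy', 'statsmodels',
                          'tqdm', 'pybedtools', 'mpmath'],
    )
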