forked from petercombs/HybridSliceSeq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
GetGeneASE.py
488 lines (399 loc) · 17.2 KB
/
GetGeneASE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
#!/usr/bin/python
# vim: set noexpandtab tabstop=4
# Created on: 2015.03.19
# Author: Carlo Artieri
##############################
# HISTORY AND THINGS TO FIX #
##############################
#2015.03.19 - Initial script
#
#2015.03.20 - Outlined code and added in proper stranded counting.
#
#2015.03.21 - Realized that keys in the output dicts. are not being initialized in all dicts.
# Fix.
#
#2015.03.23 - Found some bugs in the way the script was outputting the SNP counts in the SNP
# string. Fixed.
# - Script fully functional.
#
#2015.03.24 - Added in the option to output phased SNP-level ASE as this may be usefult for
# downstream analyses.
#
###########
# MODULES #
###########
import sys #Access to simple command-line arguments
sys.path.append('/Users/carloartieri/bin/python') #Set python path for common functions
import argparse #Access to long command-line parsing
import datetime #Access to calendar/clock functions
import re #Access to REGEX splitting
import math #Access to math functions
import subprocess #Access to external command-line
import os #Access to external command-line
import textwrap #Add text block wrapping properties
from time import sleep #Allow system pausing
from collections import defaultdict
#import common #My custom common python scripts
##########################
# COMMAND-LINE ARGUMENTS #
##########################
epilog = """\
NOTE: SNPs that overlap multiple features on the same strand (or counting from unstranded
libraries) will be counted in EVERY feature that they overlap. It is important to
filter the annotation to count features of interest!
Detailed description of inputs/outputs follows:
-p/--phasedsnps
A tab-delimited BED file with positions of masked SNPs of interest as follows:
[CHR] [0 POSITION] [1 POSITION] [REF|ALT]
The fourth column MUST contain the phased SNPs alleles.
-g/--gff
The script accepts both GTF and GFF annotation files. This should be combined with
the -i/--identifier option specifying the identifier in the info column (column 9)
that will be used for grouping counts. For example, in a GTF 'gene_id' will group
counts by gene with 'transcript_id' with group counts by transcript. In addition,
the -t/--type option sets the feature type (column 3) from which to pull features
typically you'd want to count from 'exon', but many annotations may use non-standard
terms.
-m/--min
This sets the minimum # of reads required to include a SNP in the calculation of the
fraction of SNPs agreeing in allelic direction.
-w/--writephasedsnps
If this is specified, then the program will output an additional output file named
[OUTFILE].snp.txt with phased SNP-level ASE calls. This can be useful for checking
SNP consistency across samples. See below for a description of the output.
-s/--stranded
If the data come from a stranded library prep, then this option will only count reads
mapped to the corresponding strand.
OUTPUT:
The output of the script is a tab-delimited text file set by -o/--outfile, which contains
the following columns:
FEATURE Name of the counted feature
CHROMOSOME Chromosome where feature is found
ORIENTATION Orientation of feature (+/-)
START-STOP Ultimate 5' and 3' 1-based start and stop positions
REFERENCE_COUNTS Total reference allele counts across SNPS (or first allele in the REF|ALT phasing)
ALT_COUNTS Total alternate allele counts across SNPs (or second allele in the REF|ALT phasing)
TOTAL_SNPS The total number of SNPs overlapped by the feature
REF_BIASED Number of REF biased SNPs passing the -m/--min threshold
ALT_BIASED Number of ALT biased SNPs passing the -m/--min threshold
REF-ALT_RATIO The proportion of SNPs agreeing in direction (0.5 - 1)
SNPS A list of all SNPs overlapped by the feature separated by ';' and of the format:
[1-based position],[REF_ALLELE]|[ALT_ALLELE],[REF_COUNTS]|[ALT_COUNTS];
If the -w/--writephasedsnps option has been set, it will produce a tab-delimited table with the
following columns:
CHROMOSOME Chromosome where SNP is found
POSITION 1-based position
FEATURE Feature in which SNP is found
ORIENTATION Orientation of feature (if stranded only reads on this strand are counted)
REFERENCE_ALLELE Reference base
ALTERNATE_ALLELE Alternate base
REF_COUNTS Reference base counts
ALT_COUNTS Alternate base counts
"""
class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
pass
parser = argparse.ArgumentParser(description='This script takes the output of CountSNPASE.py and generates gene level ASE counts.', add_help=False, epilog=epilog, formatter_class=CustomFormatter)
req = parser.add_argument_group('Required arguments:')
req.add_argument('-c','--snpcounts', action="store", dest="snpcounts", help='SNP-level ASE counts from CountSNPASE.py ', required=True, metavar='')
req.add_argument('-p','--phasedsnps', action="store", dest="phasedsnps", help='BED file of phased SNPs', required=True, metavar='')
req.add_argument('-g','--gff', action="store", dest="gff", help='GFF/GTF formatted annotation file', required=True, metavar='')
req.add_argument('-o','--outfile', action="store", dest="outfile", help='Gene-level ASE counts output', required=True, metavar='')
opt = parser.add_argument_group('Optional arguments:')
opt.add_argument('-w','--writephasedsnps', action="store_true", dest="write", help='Write a phased SNP-level ASE output file [OUTFILE].snps.txt')
opt.add_argument('-i','--identifier', action="store", dest="id", help='ID attribute in information column', default='gene_id', metavar='')
opt.add_argument('-t','--type', action="store", dest="type", help='Annotation feature type', default='exon', metavar='')
opt.add_argument('-m','--min', action="store", dest="min", type=int, help='Min reads to calculate proportion ref/alt biased', default=10)
opt.add_argument('-M', '--allele-min', type=int, default=5, help='Minimum reads per allele to count the SNP (helps with false hets')
opt.add_argument('-s','--stranded', action="store_true", dest="stranded", help='Data are stranded? [Default: False]')
opt.add_argument('-H', '--true-hets', default=None, help="Tab-separated file"
'containing validated true hets')
opt.add_argument('-h', '--help', action="help", help="Show this help message and exit")
opt.add_argument('-P', '--preference-index', action="store_true", default=False, help="Use preference index for the REF-ALT_RATIO column (range -1 -- 1)")
args = parser.parse_args()
if args.true_hets:
with open(args.true_hets) as hetfile:
args.true_hets = set()
for line in hetfile:
chrom, pos = line.strip().split()
args.true_hets.add((chrom, pos))
print("Minimum per allele: ", args.allele_min)
#############
# FUNCTIONS #
#############
##########
# SCRIPT #
##########
#Read in the SNP-level ASE counts
snp_counts_dict = {}
count_file = open(args.snpcounts, 'r')
for line in count_file:
if 'SUM_POS_READS' in line:
continue
line = line.rstrip('\n')
line_t = line.split('\t')
pos = line_t[0] + '|' + str(line_t[1])
pos_counts = line_t[2].split('|')
neg_counts = line_t[3].split('|')
bases = ['A','C','G','T']
pos_dict = defaultdict(int)
neg_dict = defaultdict(int)
for i in range(len(pos_counts)):
pos_dict[bases[i]] = int(pos_counts[i])
neg_dict[bases[i]] = int(neg_counts[i])
snp_counts_dict[pos] = [pos_dict, neg_dict]
count_file.close()
#Read in the SNP phasing information
snp_phase_dict = {}
snp_file = open(args.phasedsnps, 'r')
for line in snp_file:
line = line.rstrip('\n')
line_t = line.split('\t')
pos = str(line_t[0]) + '|' + str(line_t[2])
snp_phase_dict[pos] = line_t[3]
snp_file.close()
#Initialize variables for gene-level counts
chromosome = {}
position = {}
ori = {}
total_ref = {}
total_alt = {}
total_snps = {}
ref_biased = {}
alt_biased = {}
snp_array = defaultdict(list)
phased_snp_array = [] #15.03.24 - For phased SNP output file.
features = {} #We have to store a list of all of the features somewhere
#Now go through the GFF/GTF and concatenate the approporiate info
gff_file = open(args.gff, 'r')
for line in gff_file:
line = line.rstrip('\n')
line_t = line.split('\t')
if line_t[2] != args.type:
continue
line_t8 = line_t[8].split(';')
#Check for custom identifier and determine filetype:
if args.id + '=' in line_t[8]: #GFF
for i in line_t8:
i.strip()
if args.id + '=' in i:
if args.id in i:
i2 = i.split('=')
name = i2[1]
elif args.id + ' ' in line_t[8]: #GTF
for i in line_t8:
i.strip()
if args.id + ' ' in line_t[8]:
if args.id in i:
i2 = i.split('"')
name = i2[1]
else:
print 'ID attribute "' + args.id + '" doesn\'t exist or GFF/GTF file not properly formatted.'
print 'GFF info column format is: attribute=value;'
print 'GTF info column format is: attribute "value";\n'
sys.exit(1)
features[name] = 1
chromosome[name] = line_t[0]
ori[name] = line_t[6]
orientation = line_t[6]
if name not in position:
position[name] = []
position[name].append(int(line_t[3]))
position[name].append(int(line_t[4]))
else:
position[name].append(int(line_t[3]))
position[name].append(int(line_t[4]))
#Now go through the positions overlapped by the annotation and add in SNPs if appropriate
for i in range(int(line_t[3]),int(line_t[4])+1):
pos = line_t[0] + '|' + str(i)
if pos in snp_counts_dict and pos in snp_phase_dict:
#Count SNPs
if name in total_snps:
total_snps[name] += 1
else:
total_snps[name] = 0
total_snps[name] += 1
#Get REF|ALT counts
#Parse the REF|ALT dict
refalt = snp_phase_dict[pos].split('|')
#Get pos/neg counts
ref_pos_counts = snp_counts_dict[pos][0][refalt[0]]
ref_neg_counts = snp_counts_dict[pos][1][refalt[0]]
alt_pos_counts = snp_counts_dict[pos][0][refalt[1]]
alt_neg_counts = snp_counts_dict[pos][1][refalt[1]]
#If stranded add only appropriate strand counts
if args.stranded is True:
if orientation == '+':
if min(int(ref_pos_counts), int(alt_pos_counts)) < args.allele_min:
continue
if (args.true_hets and tuple(pos.split('|')) not in args.true_hets):
continue
if name in total_ref:
total_ref[name] += int(ref_pos_counts)
else:
total_ref[name] = 0
total_ref[name] += int(ref_pos_counts)
if name in total_alt:
total_alt[name] += int(alt_pos_counts)
else:
total_alt[name] = 0
total_alt[name] += int(alt_pos_counts)
if name not in ref_biased:
ref_biased[name] = 0
if name not in alt_biased:
alt_biased[name] = 0
#Determine if ref or alt biased
if ref_pos_counts + alt_pos_counts >= args.min:
if ref_pos_counts > alt_pos_counts:
ref_biased[name] += 1
elif ref_pos_counts < alt_pos_counts:
alt_biased[name] += 1
#Add it to the total SNP arrays
if name in snp_array:
snp_array[name].append(str(i) + ',' + str(snp_phase_dict[pos]) + ',' + str(ref_pos_counts) + '|' + str(alt_pos_counts))
phased_snp_array.append(str(chromosome[name]) + '\t' + str(i) + '\t' + name + '\t' + ori[name] + '\t' + str(refalt[0]) + '\t' + str(refalt[1]) + '\t' + str(ref_pos_counts) + '\t' + str(alt_pos_counts))
else:
snp_array[name] = []
snp_array[name].append(str(i) + ',' + str(snp_phase_dict[pos]) + ',' + str(ref_pos_counts) + '|' + str(alt_pos_counts))
phased_snp_array.append(str(chromosome[name]) + '\t' + str(i) + '\t' + name + '\t' + ori[name] + '\t' + str(refalt[0]) + '\t' + str(refalt[1]) + '\t' + str(ref_pos_counts) + '\t' + str(alt_pos_counts))
elif orientation == '-':
if min(int(ref_neg_counts), int(alt_neg_counts)) < args.allele_min:
if name not in snp_array:
snp_array[name] = []
snp_array[name].append('**{},{},{}|{}**'.format(i, snp_phase_dict[pos], tot_ref, tot_alt))
continue
if (args.true_hets and tuple(pos.split('|')) not in args.true_hets):
continue
if name in total_ref:
total_ref[name] += int(ref_neg_counts)
else:
total_ref[name] = 0
total_ref[name] += int(ref_neg_counts)
if name in total_alt:
total_alt[name] += int(alt_neg_counts)
else:
total_alt[name] = 0
total_alt[name] += int(alt_neg_counts)
if name not in ref_biased:
ref_biased[name] = 0
if name not in alt_biased:
alt_biased[name] = 0
#Determine if ref or alt biased
if ref_neg_counts + alt_neg_counts >= args.min:
if ref_neg_counts > alt_neg_counts:
ref_biased[name] += 1
elif ref_neg_counts < alt_neg_counts:
alt_biased[name] += 1
#Add it to the total SNP array
if name in snp_array:
snp_array[name].append(str(i) + ',' + str(snp_phase_dict[pos]) + ',' + str(ref_neg_counts) + '|' + str(alt_neg_counts))
phased_snp_array.append(str(chromosome[name]) + '\t' + str(i) + '\t' + name + '\t' + ori[name] + '\t' + str(refalt[0]) + '\t' + str(refalt[1]) + '\t' + str(ref_neg_counts) + '\t' + str(alt_neg_counts))
else:
snp_array[name] = []
snp_array[name].append(str(i) + ',' + str(snp_phase_dict[pos]) + ',' + str(ref_neg_counts) + '|' + str(alt_neg_counts))
phased_snp_array.append(str(chromosome[name]) + '\t' + str(i) + '\t' + name + '\t' + ori[name] + '\t' + str(refalt[0]) + '\t' + str(refalt[1]) + '\t' + str(ref_neg_counts) + '\t' + str(alt_neg_counts))
else:
if (args.true_hets and tuple(pos.split('|')) not in args.true_hets):
snp_array[name].append('##{},{},{}|{}##'.format(i,
snp_phase_dict[pos],
ref_pos_counts+ref_neg_counts,
alt_pos_counts+alt_neg_counts))
continue
if min((int(ref_pos_counts)+int(ref_neg_counts)),
(int(alt_pos_counts) + int(alt_neg_counts))) < args.allele_min:
snp_array[name].append('**{},{},{}|{}**'.format(i,
snp_phase_dict[pos],
ref_pos_counts+ref_neg_counts,
alt_pos_counts+alt_neg_counts))
continue
if name in total_ref:
total_ref[name] += int(ref_pos_counts)
total_ref[name] += int(ref_neg_counts)
else:
total_ref[name] = 0
total_ref[name] += int(ref_pos_counts)
total_ref[name] += int(ref_neg_counts)
if name in total_alt:
total_alt[name] += int(alt_pos_counts)
total_alt[name] += int(alt_neg_counts)
else:
total_alt[name] = 0
total_alt[name] += int(alt_pos_counts)
total_alt[name] += int(alt_neg_counts)
#Determine if ref or alt biased
if name not in ref_biased:
ref_biased[name] = 0
if name not in alt_biased:
alt_biased[name] = 0
tot_ref = ref_pos_counts + ref_neg_counts
tot_alt = alt_pos_counts + alt_neg_counts
if tot_ref + tot_alt >= args.min:
if tot_ref > tot_alt:
if name in ref_biased:
ref_biased[name] += 1
else:
ref_biased[name] = 0
ref_biased[name] += 1
if name in alt_biased:
pass
else:
alt_biased[name] = 0
elif tot_ref < tot_alt:
if name in alt_biased:
alt_biased[name] += 1
else:
alt_biased[name] = 0
alt_biased[name] += 1
if name in ref_biased:
pass
else:
ref_biased[name] = 0
#Add it to the total SNP array
if name in snp_array:
snp_array[name].append(str(i) + ',' + str(snp_phase_dict[pos]) + ',' + str(int(tot_ref)) + '|' + str(int(tot_alt)))
else:
snp_array[name] = []
snp_array[name].append(str(i) + ',' + str(snp_phase_dict[pos]) + ',' + str(int(tot_ref)) + '|' + str(int(tot_alt)))
gff_file.close()
#Print the output
keys = features.keys()
keys.sort()
outfile = open(args.outfile, 'w')
#Header
outfile.write('FEATURE\tCHROMOSOME\tORIENTATION\tSTART-STOP\tREFERENCE_COUNTS\tALT_COUNTS\tTOTAL_SNPS\tREF_BIASED\tALT_BIASED\tREF-ALT_RATIO\tSNPS\n')
for key in keys:
#Get the ultimate 5'-3' positions
position[key].sort(key=int)
posit = str(position[key][0]) + '-' + str(position[key][-1])
if key in total_ref:
pos = key.split('|')
if args.preference_index:
if total_ref[key] == 0 and total_alt[key] == 0:
rat = 0
else:
rat = (total_alt[key] - total_ref[key])/float(total_ref[key] + total_alt[key])
else:
if ref_biased[key] >= alt_biased[key]:
if alt_biased[key] == 0:
rat = 1
else:
rat = ref_biased[key]/float(alt_biased[key]+ref_biased[key])
else:
if ref_biased[key] == 0:
rat = 1
else:
rat = alt_biased[key]/float(ref_biased[key]+alt_biased[key])
snp_array_out = ';'.join(snp_array[key])
outfile.write(str(key) + '\t' + str(chromosome[key]) + '\t' + str(ori[key]) + '\t' + str(posit) + '\t' + str(total_ref[key]) + '\t' + str(total_alt[key]) + '\t' + str(total_snps[key]) + '\t' + str(ref_biased[key]) + '\t' + str(alt_biased[key]) + '\t' + str(rat) + '\t' + str(snp_array_out) + '\n')
else: #No counts for this feature
if key in snp_array:
outfile.write(str(key) + '\t' + str(chromosome[key]) + '\t' + str(ori[key]) + '\t' + str(posit) + '\tNA\tNA\tNA\tNA\tNA\tNA\t'+';'.join(snp_array[key])+'\n')
else:
outfile.write(str(key) + '\t' + str(chromosome[key]) + '\t' + str(ori[key]) + '\t' + str(posit) + '\tNA\tNA\tNA\tNA\tNA\tNA\tNA\n')
outfile.close()
if args.write is True:
outfile = open(args.outfile + '.snps.txt', 'w')
#Header
outfile.write('CHROMOSOME\tPOSITION\tFEATURE\tORIENTATION\tREFERENCE_ALLELE\tALTERNATE_ALLELE\tREF_COUNTS\tALT_COUNTS\n')
for i in phased_snp_array:
outfile.write(i + '\n')
outfile.close()