-
Notifications
You must be signed in to change notification settings - Fork 0
/
CodonUsage.py
67 lines (61 loc) · 2.63 KB
/
CodonUsage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
"""
This function gives out the codon frequency of a file of sequences.
It takes a file of sequences, two files for output and a boolean to indicate if file is positive set or not.
It gives two files, each for upstream and downstream codon frequency.
"""
def Codon(inputfile, upfile, downfile, positive):
import json
lines = [l for l in open(inputfile).readlines()]
u,d = 0,0 #counter for number of codons
dna = ['A','C','T','G']
upstream = dict() #dictionary for upstream codon frequency
downstream = dict() #dictionary for downstream codon frequency
for line in lines:
line = line.strip()
# only get sequence with codons in frame with central ATG
if positive==False:
line1 = line[2:149]
line2 = line[152:299]
#for positive set, same as negative, but does not take into account the codons for consensus
else:
line1 = line[2:137]
line2 = line[164:299]
#get each codon from the sequence
for n in range(0,len(line1),3):
codon = line1[n:n+3]
#do not take it into account if the codon contains non DNA letters
if line1[n] not in dna or line1[n+1] not in dna or line1[n+2] not in dna:
continue
u += 1
if codon in upstream:
upstream[codon] += 1
else:
upstream[codon] = 1
for n in range(0, len(line2), 3):
codon = line2[n:n + 3]
# do not take it into account if the codon contains non DNA letters
if line2[n] not in dna or line2[n + 1] not in dna or line2[n + 2] not in dna:
continue
d += 1
if codon in downstream:
downstream[codon] += 1
else:
downstream[codon] = 1
#remove upstream start codons from positive data
if positive == True:
u -= upstream['ATG']
upstream.pop('ATG', None)
#remove all downstream stop codons
stop = ['TAA','TAG','TGA']
for j in stop:
d -= downstream[j]
downstream.pop(j, None)
print("number of upstream codons: {:>10}".format(u))
print("number of downstream codons: {:>10}".format(d))
#divide by number of codons and multiply 1000 for all values
upstream = {k: round(v/u*1000, 2) for k,v in upstream.items()}
downstream = {k: round(v/d*1000, 2) for k, v in downstream.items()}
with open(upfile, 'w') as up:
json.dump(upstream, up, indent=4)
with open(downfile, 'w') as down:
json.dump(downstream, down, indent=4)