-
Notifications
You must be signed in to change notification settings - Fork 0
/
ConsensusSequence.py
72 lines (62 loc) · 2.69 KB
/
ConsensusSequence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
"""
This python script makes a position weight matrix of the consensus sequence around TIS.
It reads a file of genomic sequences and gives another file of consensus sequence.
"""
#readFasta reads the fasta file given and
# formats it into the consensus sequence (length of 20 bp)
# The file has a ATG in the middle and has same number of nucleotides up/downstream of the ATG
def readFasta(file):
#list to save the consensus sequences
cons = []
#read the fasta file line by line
for seq in open(file, 'r').readlines():
#find the length of the up/downstream sequences of TIS
# assumes that the upstream and downstream have the same length
l = int((len(seq)-3)/2)
#find the upstream consensus sequence (length 10)
up = seq[l-10:l]
#print(len(up))
#find the downstream consensus sequence (length 10)
down = seq[l+3:l+13]
#print(len(down))
#add the consensus sequence to the list
cons += [up+down]
return cons
#The consensus function takes in a file of TIS sequences and produces the position weight matrix of the consensus sequence
# The sequence has equal number of up/downstream divided by ATG
def Consensus(inputfile, outputfile):
import json
#get the consensus sequences of each sequence in list form
cons = readFasta(inputfile)
l = len(cons)
#Each list in consensus represent each row of pwm; each column represents the position on the sequence
#consensus = [['A'],['C'],['G'],['T']]
consensus = dict()
position = [p for p in range(-10,13) if p not in [0,1,2]]
#Create a loop ranging along the length of the consensus
for i in range(0, 20):
#Reset the values for each nucleotide to 0
# These will count the number of occurrences of the nucleotide on a certain position
A,C,G,T = 0,0,0,0
#Create another loop that reads each consensus sequence
# A nucleotide of certain position of all the sequences in the file will be put forward
for j in cons:
#Add to the nucleotide value if that nucleotide exists
if j[i] == 'A':
A += 1
elif j[i] == 'C':
C += 1
elif j[i] == 'G':
G += 1
elif j[i] == 'T':
T += 1
#After adding all the occurences of each nucleotide, divide it by the total number of seqneuces
A = round(A/l*100)
C = round(C/l*100)
G = round(G/l*100)
T = round(T / l * 100)
#add all to dictionary
consensus[position[i]] = [A,C,G,T]
with open(outputfile, 'w') as out:
json.dump(consensus, out, indent=10)
Consensus('at_pos_dic2013.txt','testfile.txt')