-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
211 lines (187 loc) · 7.27 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# Bioinformatics:: Motif Finder
# A transcription factor regulates a gene by binding to a specific short DNA interval called a regulatory motif,
# or transcription factor binding site, in the gene’s upstream region, a 600-1000 nucleotide-long region preceding the start of the gene.
import random
# Creates a matrix counting the number of occurrences of each nucleotide in each column of the motif matrix.
def Count(Motifs):
count = {}
col = len(Motifs[0])
for symbol in "ACGT":
count[symbol] = []
for j in range(col):
count[symbol].append(0)
row = len(Motifs)
for i in range(row):
for j in range(col):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
# Takes a list of strings Motifs as input and returns the count matrix of Motifs with pseudocounts as a dictionary of lists.
def CountWithPseudocounts(Motifs):
count = {}
col = len(Motifs[0])
for symbol in "ACGT":
count[symbol] = []
for j in range(col):
count[symbol].append(1)
row = len(Motifs)
for i in range(row):
for j in range(col):
symbol = Motifs[i][j]
count[symbol][j] += 1
return count
# Profile(Motifs) that takes Motifs as input and returns their profile matrix as a dictionary of lists.
def Profile(Motifs):
profile = Count(Motifs)
for key, entry in profile.items():
for i in range(len(entry)):
entry[i] = entry[i]/len(Motifs)
return profile
# Takes a list of strings Motifs as input and returns the profile matrix of Motifs with pseudocounts as a dictionary of lists.
def ProfileWithPseudocounts(Motifs):
profile = CountWithPseudocounts(Motifs)
for key, entry in profile.items():
for i in range(len(entry)):
entry[i] = entry[i]/(len(Motifs)+4)
return profile
# Input: A set of kmers Motifs.
# Output: A consensus string of Motifs.
def Consensus(Motifs):
k = len(Motifs[0])
count = Count(Motifs)
consensus = ""
for j in range(k):
m = 0
frequentSymbol = ""
for symbol in "ACGT":
if count[symbol][j] > m:
m = count[symbol][j]
frequentSymbol = symbol
consensus += frequentSymbol
return consensus
def HammingDistance(p, q):
count = 0
for i in range(0, len(p)):
if p[i] != q[i]:
count += 1
return count
# Input: A set of k-mers Motifs
# Output: The score of these k-mers. Smaller score indicate that the sequence is the more accurate motif.
def Score(Motifs):
# Insert code here
score = 0
for i in range(len(Motifs)):
score += HammingDistance(Motifs[i], Consensus(Motifs))
return score
def compute(profile):
v = 1
for k,e in profile.items():
print(sum(e))
v = sum(e) * v
print(v)
# Determines the overall probability of a sequence for a motif based on individual base probability profiles
def Pr(Text, Profile):
prob = 1
for i in range(len(Text)):
prob *= Profile[Text[i]][i]
return prob
# Given profile matrix, this code computes the probability of every k-mer in a string Text and find a Profile-most probable k-mer in Text.
def ProfileMostProbableKmer(text, k, profile):
p = {}
for i in range(0,len(text)-k):
pattern = text[i:i+k]
prob = Pr(pattern, profile)
p[pattern] = prob
return max(p,key=p.get)
# Dind the set of motifs across a number of DNA sequences that match each other most closely.
# k = k-mer length; t = number of strings in Dna.
def GreedyMotifSearch(Dna,k,t):
BestMotifs = []
for i in range(0, t):
BestMotifs.append(Dna[i][0:k])
n = len(Dna[0])
for m in range(n-k+1):
Motifs = []
Motifs.append(Dna[0][m:m+k])
for j in range(1, t):
P = Profile(Motifs[0:j])
Motifs.append(ProfileMostProbableKmer(Dna[j], k, P))
if Score(Motifs) < Score(BestMotifs):
BestMotifs = Motifs
return BestMotifs
# Note to change consensus function with pseudocount call.
def GreedyMotifSearchWithPseudocounts(Dna, k, t):
BestMotifs = [] # output variable
for i in range(0, t):
BestMotifs.append(Dna[i][0:k])
n = len(Dna[0])
for m in range(n-k+1):
Motifs = []
Motifs.append(Dna[0][m:m+k])
for j in range(1, t):
P = ProfileWithPseudocounts(Motifs[0:j])
Motifs.append(ProfileMostProbableKmer(Dna[j], k, P))
if Score(Motifs) < Score(BestMotifs):
BestMotifs = Motifs
return BestMotifs
# Takes a profile matrix Profile corresponding to a list of strings Dna as input and returns a list of the Profile-most probable k-mers in each string from Dna.
def Motifs(Profile, Dna, k):
result = []
for i in range(len(Dna)):
result.append(ProfileMostProbableKmer(Dna[i], k, Profile))
return result
# Random method of motif fnding, uses random.randint to choose a random k-mer from each of t different strings Dna, and returns a list of t strings.
def RandomMotifs(Dna, k, t):
result = []
for i in range(t):
idx = random.randint(0,len(Dna[0])-k)
result.append(Dna[i][idx:idx+k])
return result
# Conducts a randomized motif search, where Dna is a series of genomic sequence.
def RandomizedMotifSearch(Dna, k, t):
M = RandomMotifs(Dna, k, t)
BestMotifs = M
while True:
Profile = ProfileWithPseudocounts(M)
M = Motifs(Profile, Dna, k)
if Score(M) < Score(BestMotifs):
BestMotifs = M
else:
return BestMotifs
# Input: A dictionary Probabilities, where keys are k-mers and values are the probabilities of these k-mers (which do not necessarily sum up to 1) Output: A normalized dictionary where the probability of each k-mer was divided by the sum of all k-mers' probabilities
def Normalize(Probabilities):
total = sum(Probabilities.values())
for key, entry in Probabilities.items():
Probabilities[key] = entry / total
return Probabilities
# Input: A dictionary Probabilities whose keys are k-mers and whose values are the probabilities of these kmers
# Output: A randomly chosen k-mer with respect to the values in Probabilities
def WeightedDie(Probabilities):
n = random.uniform(0, 1)
for p in Probabilities:
n -= Probabilities[p]
if n <= 0:
return p
# Takes a string Text, a profile matrix profile , and an integer k as input. Returns a randomly generated k-mer from Text whose probabilities are generated from profile.
def ProfileGeneratedString(Text, profile, k):
n = len(Text)
probabilities = {}
for i in range(0,n-k+1):
probabilities[Text[i:i+k]] = Pr(Text[i:i+k], profile)
probabilities = Normalize(probabilities)
return WeightedDie(probabilities)
# Another method of motif finding using random sampling to create the probabilities profile
def GibbsSampler(Dna, k, t, N):
Motifs = RandomMotifs(Dna, k, t)
BestMotifs = Motifs
for i in range(1,N):
num = random.randint(0,t-1)
m = []
for j in Motifs:
if j != Motifs[num]:
m.append(j)
profile = ProfileWithPseudocounts(m)
Motifs[num] = ProfileGeneratedString(Dna[num], profile, k)
if Score(Motifs) < Score(BestMotifs):
BestMotifs = Motifs
return BestMotifs