-
Notifications
You must be signed in to change notification settings - Fork 0
/
outlaw_code.py
142 lines (112 loc) · 5.95 KB
/
outlaw_code.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from __future__ import division
import json
import re
import csv
import os
import glob
os.chdir("../../COG_MBARI_2014")
internal_count = 0
external_count = 0
internal_placement_count = 0
leaf_placement_count = 0
total_placement_count = 0
jplace = glob.glob("*.jplace")
for file in jplace:
with open(file) as json_data:
data = json.load(json_data)
##json parse--extract tree string
tree = data['tree']
branches = re.split('[, ( )]' , tree)##splitting at ')' '(' and ',' eliminates all delimiters that are not curly braces--
# leaves either empty elements ex: u'' or elements withedge numbers/labels/lengths in each element of the list
#initialize totalEdgeCount at -1 because the root is indicated by a {x} but we dont want to count that in the edgeCount variable
totalEdgeCount = -1
leafCount = 0
internalCount = 0
#initialize lists to place leaf and internal edge numbers
leafEdges = []
internalEdges = []
# i is integer index of each element in list
# v is value (string) at each index
for i,v in enumerate(branches):
if '{' in v:
totalEdgeCount += 1
if "|" in v:
leafCount += 1
leaf_edgeNum = int(v.split('{')[1].split('}')[0])
leafEdges.append(leaf_edgeNum)
elif v != '':
internal_edgeNum = int(v.split('{')[1].split('}')[0])
internalCount += 1
internalEdges.append(internal_edgeNum)
placements = data['placements'] #placement key in json
#initialize lists
leafList = []
internalList = []
total_placements = 0
leafsWithPlacement = []
internalWithPlacement = []
# there are i pquery elements in the 'placements list--for loop iterates through each pquery--basically placements[i]
#each pquery consists of a unicode list of placements (u'p') and short-read names (u'nm')
for i in placements: #it works, but I am guessing there is a more efficient way to dig deep into a dictionary
#verify total number of placements in the file
total_placements += 1
#i.values() retrieves the values (p and nm) of the unicode list (u'p', u'nm')
#indeces: [0][0][2]
#first [0]: retrieves pquery placements (p) from the [p , nm] list at index i
#second [0]: retrieves the first element in the list of potential placements (p) at index i
#the first placement listed has the highest posterior probability (this element contains the placement edge)
#[2]: the highest probability placement edge is the 3rd element in the placement information
placement_edge = i.values()[0][0][2]
#multiplicity is number of reads placed at that edge
multiplicity = i.values()[1]
numReads = len(multiplicity)
#check placement edge number against internal edge list
if placement_edge in internalEdges:
internal_count += 1
if placement_edge not in internalWithPlacement:
internalWithPlacement.append(placement_edge)
smallList = [placement_edge , numReads]
internalList.append(smallList)
#check placement edge number against leaf edge list
elif placement_edge in leafEdges:
external_count += 1
if placement_edge not in leafsWithPlacement:
leafsWithPlacement.append(placement_edge)
smallList = [placement_edge , numReads]
leafList.append(smallList)
newLeafList = [] #this will essentially be an array of [[edge_a, numSequences],[edge_b, numSequences]...etc] without repeating edge numbers
leafSeqSum = 0
for i in range(len(leafsWithPlacement)):
counter = 0 #keeps running tally of sequences placed at edge (leafsWithPlacements[i])
tempEdgeNum = 0
for j in range(len(leafList)):
if (leafList[j][0] == leafsWithPlacement[i]):# when the first element in leafList[j] equals the element in the list of edges with placements
tempEdgeNum = leafsWithPlacement[i]
counter += leafList[j][1]
leafSeqSum += leafList[j][1]
pair_edgeSeq = [tempEdgeNum, "leaf", counter]
newLeafList.append(pair_edgeSeq)
newInternalList = [] #this will essentially be an array of [[edge_a, numSequences],[edge_b, numSequences]...etc] without repeating edge numbers
internalSeqSum = 0
for i in range(len(internalWithPlacement)):
counter = 0 #keeps running tally of sequences placed at edge (leafsWithPlacements[i])
tempEdgeNum = 0
for j in range(len(internalList)):
if (internalList[j][0] == internalWithPlacement[i]):# when the first element in leafList[j] equals the element in the list of edges with placements
tempEdgeNum = internalWithPlacement[i]
counter += internalList[j][1]
internalSeqSum += internalList[j][1]
pair_edgeSeq = [tempEdgeNum, "internal", counter]
newInternalList.append(pair_edgeSeq)
allEdges = newLeafList + newInternalList
allEdges.sort(key=lambda x: x[0])
# with open("output.csv", "wb") as f:
# writer = csv.writer(f)
# writer.writerows(allEdges)
internal_placement_count += internalSeqSum
leaf_placement_count += leafSeqSum
total_placement_count += total_placements
#Write this to a tsv file?
print "Percent of Reads placed Interally" "\n" + str(float(internal_placement_count/total_placement_count))
print "Percent of Reads placed on leafs" "\n" + str(float(leaf_placement_count/total_placement_count))
print "Total number of placements" "\n" + str(total_placement_count)