-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_pombase_model.py
170 lines (145 loc) · 8.27 KB
/
generate_pombase_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# from generate_rdf import GoCamModel, Annoton
from gocamgen.gocamgen import GoCamModel, Annoton
from gaf_query import genes_and_annots_for_bp, NoCacheEagerRemoteSparqlOntology
from pombase_golr_query import GeneConnectionSet
from rdflib.term import URIRef
from ontobio.vocabulary.relations import OboRO
from prefixcommons.curie_util import expand_uri
import argparse
ro = OboRO()
ENABLED_BY = URIRef(expand_uri(ro.enabled_by))
PART_OF = URIRef(expand_uri(ro.part_of))
OCCURS_IN = URIRef(expand_uri(ro.occurs_in))
class PomBaseAnnoton(Annoton):
def __init__(self, gene_info, subject_id):
self.enabled_by = subject_id
self.molecular_function = self.get_aspect_object(gene_info, "molecular_function")
self.cellular_component = self.get_aspect_object(gene_info, "cellular_component")
self.biological_process = self.get_aspect_object(gene_info, "bp")
self.connections = gene_info["connections"]
self.individuals = {}
#TODO: Should be replaced with gene_info.get(key)
def get_aspect_object(self, gene_info, aspect):
if aspect in gene_info:
return gene_info[aspect]
parser = argparse.ArgumentParser()
parser.add_argument('-t', "--bp_term", type=str, required=True,
help="Biological process GO term that GOCAM should model")
parser.add_argument('-f', "--filename", type=str, required=False,
help="Destination filename - will end in '.ttl'")
parser.add_argument('-d', "--directory", type=str, required=False,
help="Destination directory - in this case titles will be auto-generated")
parser.add_argument('-g', "--gaf_source", type=str, required=True,
help="filename of GAF file to use as annotation source")
parser.add_argument('-j', "--tad_json", type=str, required=False,
help="Existing json data file to load into term-to-gene dictionary. Speeds up performance.")
def main():
args = parser.parse_args()
generate_model(args.bp_term, args.gaf_source, args.filename, args.tad_json, args.directory)
def generate_model(bp_term, gaf_source, filename=None, tad_json=None, directory=None):
bp = bp_term
go = NoCacheEagerRemoteSparqlOntology("go")
if not go.has_node(bp):
raise "Invalid BP term {}".format(bp)
gene_info = genes_and_annots_for_bp(bp, gaf_source, json_file=tad_json)
# chosen_annots = get_associations_chosen_in_previous_line(params)
model_title = "PomBase - " + bp + " - " + go.label(bp)
if filename is None or directory is not None:
# filename = model_title.replace(" - ", "_").replace(":", "_").replace(" ", "_")
filename = model_title
for unwanted_char in [" - ", ":", " ", "/"]:
filename = filename.replace(unwanted_char, "_")
model = GoCamModel(model_title)
model.writer.bp_id = model.declare_individual(bp)
annotons = []
for gene in gene_info:
annoton = PomBaseAnnoton(gene_info[gene], gene)
if annoton.molecular_function is not None:
annotons.append(annoton)
global_individuals_list = {}
# translate lists of annotations
for annoton in annotons:
# Class
if annoton.enabled_by not in model.classes:
model.declare_class(annoton.enabled_by)
# writer.translate_annoton(annoton)
sub = annoton.enabled_by
obj = annoton.molecular_function["object"]
# E.g. instance of gene product class. Keeping individuals at annoton level for our Pombase purposes.
if sub not in annoton.individuals:
enabler_id = model.declare_individual(sub)
annoton.individuals[sub] = enabler_id
else:
enabler_id = annoton.individuals[sub]
# E.g. instance of GO class
if obj["id"] not in annoton.individuals:
tgt_id = model.declare_individual(obj["id"])
annoton.individuals[obj["id"]] = tgt_id
else:
tgt_id = annoton.individuals[obj["id"]]
enabled_by_stmt = model.writer.emit(tgt_id, ENABLED_BY, enabler_id)
part_of_stmt = model.writer.emit(tgt_id, PART_OF, model.writer.bp_id)
axiom_id = model.add_axiom(enabled_by_stmt)
association = annoton.molecular_function
model.add_evidence(axiom_id, association["evidence"]["type"], association["evidence"]["has_supporting_reference"])
# Record annotation
if annoton.cellular_component is not None:
for cellular_component in annoton.cellular_component:
cc_object_id = cellular_component["object"]["id"]
if cc_object_id not in annoton.individuals:
cc_id = model.declare_individual(cc_object_id)
annoton.individuals[cc_object_id] = cc_id
occurs_in_stmt = model.writer.emit(tgt_id, OCCURS_IN, cc_id)
cc_axiom_id = model.add_axiom(occurs_in_stmt)
cc_association = cellular_component
model.add_evidence(cc_axiom_id, cc_association["evidence"]["type"], cc_association["evidence"]["has_supporting_reference"])
# Record annotation
global_individuals_list = {**global_individuals_list, **annoton.individuals}
### Connections - Now that all individuals should have been created
global_connections_list = GeneConnectionSet() # Tracking what connections have been added so far
for annoton in annotons:
for connection in annoton.connections.gene_connections:
if connection.relation in ["has input"]:
model.add_connection(connection, annoton)
# Record annotation
global_connections_list.append(connection)
elif connection.relation in ["has_regulation_target", "regulates_activity_of", "directly_positively_regulates", "directly_negatively_regulates", "has_direct_input"]:
if connection.relation == "has_direct_input":
connection.relation = "directly_regulates"
# source_id = annoton.individuals[connection.gp_a]
try:
source_id = annoton.individuals[connection.object_id]
except KeyError:
print("Say hey")
# New activity to declare for regulating annoton - also setup enabled_by triple and set source_id from MF
source_id = model.declare_individual(connection.object_id)
model.writer.emit(source_id, ENABLED_BY, annoton.individuals[connection.gp_a])
model.add_axiom((source_id, ENABLED_BY, annoton.individuals[connection.gp_a]))
annoton.individuals[connection.object_id] = source_id
# Probably need to switch source to be object (GO MF) of connection
property_id = URIRef(expand_uri(model.connection_relations[connection.relation]))
# find annoton(s) of regulation target gene product
target_annotons = model.writer.find_annotons(connection.gp_b, annotons)
for t_annoton in target_annotons:
mf_annotation = t_annoton.get_aspect_object(gene_info[connection.gp_b], "molecular_function")
if mf_annotation is not None:
# target_id = global_individuals_list[mf_annotation["object"]["id"]]
target_id = t_annoton.individuals[mf_annotation["object"]["id"]]
# Annotate source MF GO term NamedIndividual with relation code-target MF term URI
model.writer.emit(source_id, property_id, target_id)
# Add axiom (Source=MF term URI, Property=relation code, Target=MF term URI)
model.writer.emit_axiom(source_id, property_id, target_id)
# Record mf_annotation
global_connections_list.append(connection)
# Now see if the with connections can fill anything in
for annoton in annotons:
for connection in annoton.connections.gene_connections:
if connection.relation in ["with_support_from"] and not global_connections_list.contains(connection):
model.add_connection(connection, annoton)
# Record annotation
if directory is not None:
filename = directory + filename
model.write(filename)
return model
if __name__ == "__main__":
main()