-
Notifications
You must be signed in to change notification settings - Fork 1
/
split_multimol2.py
141 lines (108 loc) · 4 KB
/
split_multimol2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
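# Compatible with Python 2 and Python 3.
# AspirinCode 2018
# Script that splits a multi-mol2 file into individual mol2 files
# (or, with -c/--chunksize, into smaller multi-mol2 files).
# Usage: python split_multimol2.py multi-mol2.mol2 out_dir [-c CHUNKSIZE]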
import sys
import os
def _finish_mol2_record(mol2cont):
    """Turn the collected lines of one molecule into [molecule_id, contents]."""
    # The molecule name is the line right after the @<TRIPOS>MOLECULE header.
    molecule_id = mol2cont[1].strip()
    # Drop trailing whitespace of the last line only, so a record never ends
    # in a dangling newline / blank line.
    mol2cont[-1] = mol2cont[-1].rstrip()
    return [molecule_id, "".join(mol2cont)]


def split_multimol2(multimol2):
    """
    Splits a multi-mol2 file.

    Parameters
    ----------
    multimol2 : str
      Path to the multi-mol2 file.

    Returns
    ----------
    A generator object for lists for every extracted mol2-file. Lists contain
    the molecule ID and the mol2 file contents.
    e.g., ['ID1234', '@<TRIPOS>MOLECULE...']

    Notes
    ----------
    - Lines that appear before the first '@<TRIPOS>MOLECULE' header
      (comments, blank lines) are skipped.
    - A header that is not followed by a name line before the next header
      or the end of the file is skipped, since it has no molecule ID.
    - The last line of every record is right-stripped, so the returned
      contents do not end with a newline.

    """
    header = "@<TRIPOS>MOLECULE"
    mol2cont = None  # lines of the record being collected; None before the first header

    with open(multimol2, 'r') as mol2file:
        # Plain line iteration always advances, so unexpected leading lines
        # cannot stall the loop, and no per-line tell()/fstat() is needed.
        for line in mol2file:
            if line.startswith(header):
                # A new header closes the previous record, if it has a name line.
                if mol2cont is not None and len(mol2cont) > 1:
                    yield _finish_mol2_record(mol2cont)
                mol2cont = [line]
            elif mol2cont is not None:
                mol2cont.append(line)

    # Emit the final record, which is terminated by end-of-file.
    if mol2cont is not None and len(mol2cont) > 1:
        yield _finish_mol2_record(mol2cont)
def write_multimol2(multimol2, out_dir):
    """
    Splits a multi-mol2 file into individual mol2 files, one per molecule.

    Parameters
    -----------
    multimol2 : str
      Path to the multi-mol2 file.
    out_dir : str
      Output directory; it is created if it does not exist. New files will
      be named <molecule_name_1>.mol2, ... <molecule_name_n>.mol2
      Molecules that share the same name are written to the same path, so
      a later one overwrites an earlier one.

    Returns
    -----------
    chunks : int
      Number of files written.

    """
    # Create the output directory when it is missing (isdir + makedirs keeps
    # this working on both Python 2 and Python 3).
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    chunks = 0
    # Use the function's own parameters rather than the command-line globals,
    # so the function also works when this module is imported.
    for molecule_id, mol2_text in split_multimol2(multimol2):
        out_mol2 = os.path.join(out_dir, molecule_id) + '.mol2'
        with open(out_mol2, 'w') as out_file:
            # split_multimol2 strips the record's final newline; restore it.
            out_file.write(mol2_text + '\n')
        chunks += 1
    return chunks
def write_multimol2_chunks(multimol2, chunk_size, out_dir):
    """
    Splits a multi-mol2 file into smaller multi-mol2 files.

    Parameters
    -----------
    multimol2 : str
      Path to the multi-mol2 file.
    chunk_size : int
      Number of mol2 structures per output file. Every file receives exactly
      `chunk_size` structures, except the last one, which may hold fewer.
    out_dir : str
      Output directory; it is created if it does not exist. New files will
      be named <multimol2>_1.mol2, ... <multimol2>_n.mol2

    Returns
    -----------
    chunks : int
      Number of files written (0 if the input contains no molecules).

    Raises
    -----------
    ValueError
      If `chunk_size` is smaller than 1.

    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer, got %r'
                         % (chunk_size,))

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    out_file_stem = os.path.basename(multimol2).split('.mol2')[0]

    cnt = 0          # structures written to the currently open file
    chunks = 0       # files opened so far
    out_file = None  # opened lazily so that no empty file is ever created

    try:
        for mol2 in split_multimol2(multimol2):
            if out_file is None:
                chunks += 1
                out_file = open(
                    os.path.join(out_dir, out_file_stem)
                    + '_%d.mol2' % chunks, 'w')
            out_file.write(mol2[1] + '\n')
            cnt += 1
            # Count after the write: the file is closed only once it holds
            # exactly chunk_size structures.
            if cnt == chunk_size:
                out_file.close()
                out_file = None
                cnt = 0
    finally:
        # Close a partially filled last file, also when an error occurred.
        if out_file is not None:
            out_file.close()

    return chunks
if __name__ == '__main__':
    # Command-line entry point: parse the arguments, then dispatch to the
    # per-molecule writer or the chunked writer.
    import argparse

    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='Splits a multi-mol2 file into individual mol2 files'
    )

    # Required positionals: input file and output directory.
    arg_parser.add_argument('MOL2_FILE')
    arg_parser.add_argument('OUT_DIR')

    # Optional switches.
    arg_parser.add_argument(
        '-c', '--chunksize',
        type=int,
        help='Number of MOL2 structures per file (1 by default)'
    )
    arg_parser.add_argument(
        '-v', '--version',
        action='version',
        version='split_multimol2 v. 1.1'
    )

    # `args` is deliberately left as a module-level name (no main() wrapper)
    # so that any code in this module referring to it keeps working.
    args = arg_parser.parse_args()

    # Without a (non-zero) chunk size, write one file per molecule.
    if not args.chunksize:
        write_multimol2(multimol2=args.MOL2_FILE, out_dir=args.OUT_DIR)
    else:
        write_multimol2_chunks(
            multimol2=args.MOL2_FILE,
            chunk_size=args.chunksize,
            out_dir=args.OUT_DIR
        )