#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 18 14:07:26 2018
@author: bennett
"""
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from Bio.SeqUtils import IUPACData
# First define some preliminary things
atom_names = ['HA', 'H', 'CA', 'CB', 'C', 'N']
sparta_results = [0.25, 0.49, 0.94, 1.14, 1.09, 2.45] # Advertised performance of SPARTA+
paper_order = ['Ala', 'Cys','Asp','Glu','Phe','Gly','His','Ile','Lys','Leu','Met','Asn','Pro','Gln','Arg','Ser','Thr','Val','Trp','Tyr'] # The amino acid order used in the Sparta+ data and BLOSUM matrix (alphabetical by 1-letter code)
# The dictionary storing the hydrophobicity for different residues
# Literature: Wimley WC & White SH (1996). Nature Struct. Biol. 3:842-848.
hydrophobic_dict={'LYS': 1.81, 'GLN': 0.19, 'THR': 0.11, 'ASP': 0.5, 'GLU': 0.12, 'ARG': 1.0, 'LEU': -0.69, 'TRP': -0.24, 'VAL': -0.53,
'ILE': -0.81, 'PRO': -0.31, 'MET': -0.44, 'ASN': 0.43, 'SER': 0.33, 'ALA': 0.33, 'GLY': 1.14, 'TYR': 0.23, 'HIS': -0.06, 'PHE': -0.58, 'CYS': 0.22}
# For easier access, define the names of different feature columns.
# These are the names that we assign in our own feature extraction routine.
col_phipsi = [i+j for i in ['PHI_', 'PSI_'] for j in ['COS_i-1', 'SIN_i-1']]
col_phipsi += [i+j for i in ['PHI_', 'PSI_'] for j in ['COS_i', 'SIN_i']]
col_phipsi += [i+j for i in ['PHI_', 'PSI_'] for j in ['COS_i+1', 'SIN_i+1']]
col_chi = [i+j+k for k in ['_i-1', '_i', '_i+1'] for i in ['CHI1_', 'CHI2_'] for j in ['COS', 'SIN', 'EXISTS']]
col_hbprev = ['O_'+i+'_i-1' for i in ['d_HA', '_COS_H', '_COS_A', '_EXISTS']]
col_hbond = [i+j+'_i' for i in ['Ha_', 'HN_', 'O_'] for j in ['d_HA', '_COS_H', '_COS_A', '_EXISTS']]
col_hbnext = ['HN_'+i+'_i+1' for i in ['d_HA', '_COS_H', '_COS_A', '_EXISTS']]
col_s2 = ['S2'+i for i in ['_i-1', '_i', '_i+1']]
struc_cols = col_phipsi + col_chi + col_hbprev + col_hbond + col_hbnext + col_s2
blosum_names = ['BLOSUM62_NUM_'+paper_order[i].upper() for i in range(20)]
#blosum_names = ['BLOSUM62_NUM_'+sorted(IUPACData.protein_letters_3to1.keys())[i].upper() for i in range(20)]
col_blosum = [blosum_names[i]+j for j in ['_i-1', '_i', '_i+1'] for i in range(20)]
seq_cols = col_blosum
bin_seq_cols = ['BINSEQREP_'+ sorted(IUPACData.protein_letters_3to1.keys())[i].upper() + j for j in ['_i-1', '_i', '_i+1'] for i in range(20)]
rcoil_cols = ['RCOIL_' + atom for atom in atom_names]
ring_cols = [atom + '_RC' for atom in atom_names]
all_cols = struc_cols + seq_cols + rcoil_cols + ring_cols
all_cols_bin = struc_cols + bin_seq_cols + rcoil_cols + ring_cols
# These are the names of new columns that were not included in the SPARTA+ feature set
hsea_names = ['HSE_CA' + i for i in ['_U', '_D', '_Angle']]
hseb_names = ['HSE_CB' + i for i in ['_U', '_D']]
hse_cols = [name + i for i in ['_i-1', '_i', '_i+1'] for name in hsea_names + hseb_names]
dssp_ss_names = ['A_HELIX_SS', 'B_BRIDGE_SS', 'STRAND_SS', '3_10_HELIX_SS', 'PI_HELIX_SS', 'TURN_SS', 'BEND_SS', 'NONE_SS']
dssp_asa_names = ['REL_ASA', 'ABS_ASA']
dssp_pp_names = ['DSSP_PHI', 'DSSP_PSI']
dssp_hb_names = ['NH-O1_ENERGY', 'NH-O2_ENERGY', 'O-NH1_ENERGY', 'O-NH2_ENERGY']
dssp_cols = [name + i for i in ['_i-1', '_i', '_i+1'] for name in dssp_ss_names + dssp_asa_names + dssp_pp_names + dssp_hb_names]
dssp_energy_cols = [name + i for i in ['_i-1', '_i', '_i+1'] for name in dssp_hb_names]
dssp_expp_cols = [name + i for i in ['_i-1', '_i', '_i+1'] for name in dssp_ss_names + dssp_asa_names + dssp_hb_names]
dssp_ssi_cols = [name + '_i' for name in dssp_ss_names]
dssp_norm_cols = [name + i for i in ['_i-1', '_i', '_i+1'] for name in dssp_asa_names + dssp_hb_names]
dssp_pp_cols=[name + i for i in ['_i-1', '_i', '_i+1'] for name in dssp_pp_names]
# These are the names of the columns with sequence information beyond the tri-peptide level
ext_seq_cols = ['RESNAME_i' + i + str(j) for i in ['+', '-'] for j in range(1,21)]
# These column names are used for renaming, to make Michael's version of the
# naming consistent with mine
sparta_ring_cols = [atom + '_RING' for atom in atom_names]
sparta_rcoil_cols = ["RC_" + atom for atom in atom_names]
sparta_rename_cols = sparta_ring_cols + sparta_rcoil_cols
sparta_rename_map = dict(zip(sparta_rename_cols, ring_cols + rcoil_cols))
sx2_rcoil_cols = ['RC_' + atom for atom in atom_names]
sx2_rename_map = dict(zip(sparta_ring_cols + sx2_rcoil_cols, ring_cols + rcoil_cols))
# Define the names for the original Sparta+ Data Columns (same set as above but different order so we can assign column labels to data obtained directly from Yang)
orig_cols = col_blosum[:20]
orig_cols += [i + j for i in ['PHI_', 'PSI_'] for j in ['SIN_i-1', 'COS_i-1']]
orig_cols += [i + j + '_i-1' for i in ['CHI1_', 'CHI2_'] for j in ['SIN', 'COS', 'EXISTS']]
orig_cols += col_blosum[20:40]
orig_cols += [i + j for i in ['PHI_', 'PSI_'] for j in ['SIN_i', 'COS_i']]
orig_cols += [i + j + '_i' for i in ['CHI1_', 'CHI2_'] for j in ['SIN', 'COS', 'EXISTS']]
orig_cols += col_blosum[40:]
orig_cols += [i + j for i in ['PHI_', 'PSI_'] for j in ['SIN_i+1', 'COS_i+1']]
orig_cols += [i + j + '_i+1' for i in ['CHI1_', 'CHI2_'] for j in ['SIN', 'COS', 'EXISTS']]
orig_cols += ['O_'+i+'_i-1' for i in ['_EXISTS', 'd_HA', '_COS_A', '_COS_H']]
orig_cols += [i+j+'_i' for i in ['HN_', 'Ha_', 'O_'] for j in ['_EXISTS', 'd_HA', '_COS_A', '_COS_H']]
orig_cols += ['HN_'+i+'_i+1' for i in ['_EXISTS', 'd_HA', '_COS_A', '_COS_H']]
orig_cols += ['S2'+i for i in ['_i-1', '_i', '_i+1']]
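# Sanity check: assembled in this order, the feature vector should have the
# 113 entries of the original Sparta+ input layer (30 per tripeptide residue,
# 20 for the three H-bonds, 3 for S2)
assert len(orig_cols) == 113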
# These are the names of the boolean columns in the Sparta+ features that indicate the existence of chi1/chi2 angles as well as possible hydrogen bonds
exist_cols = [i + 'EXISTS' + k for k in ['_i-1', '_i', '_i+1'] for i in ['CHI1_', 'CHI2_']]
exist_cols += ['O_'+ '_EXISTS' +'_i-1', 'HN_' + '_EXISTS' + '_i+1']
exist_cols += [i + '_EXISTS' + '_i' for i in ['Ha_', 'HN_', 'O_']]
# For completeness, here are the remaining columns
non_exist_cols = orig_cols.copy()
for x in exist_cols:
non_exist_cols.remove(x)
# These columns are multiplied by 10 in the original Sparta+ data
x10cols = [i + j for i in ['PHI_', 'PSI_'] for j in ['SIN_i-1', 'COS_i-1']]
x10cols += [i + j + '_i-1' for i in ['CHI1_', 'CHI2_'] for j in ['SIN', 'COS', 'EXISTS']]
x10cols += [i + j for i in ['PHI_', 'PSI_'] for j in ['SIN_i', 'COS_i']]
x10cols += [i + j + '_i' for i in ['CHI1_', 'CHI2_'] for j in ['SIN', 'COS', 'EXISTS']]
x10cols += [i + j for i in ['PHI_', 'PSI_'] for j in ['SIN_i+1', 'COS_i+1']]
x10cols += [i + j + '_i+1' for i in ['CHI1_', 'CHI2_'] for j in ['SIN', 'COS', 'EXISTS']]
# Make separate lists of cosine, sine, distance columns
phipsicos_cols = [i + 'COS_i-1' for i in ['PHI_', 'PSI_']]
phipsicos_cols += [i + 'COS_i' for i in ['PHI_', 'PSI_']]
phipsicos_cols += [i + 'COS_i+1' for i in ['PHI_', 'PSI_']]
chicos_cols = [i + 'COS' + k for k in ['_i-1', '_i', '_i+1'] for i in ['CHI1_', 'CHI2_']]
hbondcos_cols = ['O_'+i+'_i-1' for i in ['_COS_H', '_COS_A']]
hbondcos_cols += [i+j+'_i' for i in ['Ha_', 'HN_', 'O_'] for j in ['_COS_H', '_COS_A']]
hbondcos_cols += ['HN_'+i+'_i+1' for i in ['_COS_H', '_COS_A']]
cos_cols = phipsicos_cols + chicos_cols + hbondcos_cols
phipsisin_cols = [i + 'SIN_i-1' for i in ['PHI_', 'PSI_']]
phipsisin_cols += [i + 'SIN_i' for i in ['PHI_', 'PSI_']]
phipsisin_cols += [i + 'SIN_i+1' for i in ['PHI_', 'PSI_']]
chisin_cols = [i + 'SIN' + k for k in ['_i-1', '_i', '_i+1'] for i in ['CHI1_', 'CHI2_']]
sin_cols = phipsisin_cols + chisin_cols
# And a list for columns containing distance information
hbondd_cols = ['O_d_HA_i-1']
hbondd_cols += [i+'d_HA_i' for i in ['Ha_', 'HN_', 'O_']]
hbondd_cols += ['HN_d_HA_i+1']
# Need a list of columns that will be subject to noise if we wish to use noise injection
angle_cols = cos_cols + sin_cols
noisy_cols = angle_cols + hbondd_cols + col_s2
# Let's try some nonlinear functions of these columns
# Define columns to be squared
square_cols = cos_cols + hbondd_cols + col_s2
#square_cols_names = [x + '_sq' for x in square_cols]
square_cols_names = [x + '^2.0' for x in square_cols]
cols_pown1 = hbondd_cols
cols_pown2 = hbondd_cols
cols_pown3 = hbondd_cols
col_names_pown1 = [x + '^-1.0' for x in cols_pown1]
col_names_pown2 = [x + '^-2.0' for x in cols_pown2]
col_names_pown3 = [x + '^-3.0' for x in cols_pown3]
# Define the names for the _i+1 and _i-1 columns to allow dropping them for RNNs
im1_cols = ['PSI_'+i for i in ['COS_i-1', 'SIN_i-1']]
ip1_cols = ['PHI_'+i for i in ['COS_i+1', 'SIN_i+1']]
im1_cols += [i+j+'_i-1' for i in ['CHI1_', 'CHI2_'] for j in ['COS', 'SIN', 'EXISTS']]
ip1_cols += [i+j+'_i+1' for i in ['CHI1_', 'CHI2_'] for j in ['COS', 'SIN', 'EXISTS']]
im1_cols += col_hbprev
ip1_cols += col_hbnext
im1_cols += ['S2_i-1']
ip1_cols += ['S2_i+1']
im1_cols_bin = im1_cols.copy()
ip1_cols_bin = ip1_cols.copy()
im1_cols += ['BLOSUM62_NUM_'+list(IUPACData.protein_letters_3to1.keys())[i].upper() + '_i-1' for i in range(20)]
ip1_cols += ['BLOSUM62_NUM_'+list(IUPACData.protein_letters_3to1.keys())[i].upper() + '_i+1' for i in range(20)]
im1_cols_bin += ['BINSEQREP_'+ list(IUPACData.protein_letters_3to1.keys())[i].upper() + '_i-1' for i in range(20)]
ip1_cols_bin += ['BINSEQREP_'+ list(IUPACData.protein_letters_3to1.keys())[i].upper() + '_i+1' for i in range(20)]
# Some other columns
protein_letters=[code.upper() for code in IUPACData.protein_letters_3to1.keys()]
sp_feat_cols=['BLOSUM62_NUM_ALA_i-1', 'BLOSUM62_NUM_CYS_i-1', 'BLOSUM62_NUM_ASP_i-1', 'BLOSUM62_NUM_GLU_i-1', 'BLOSUM62_NUM_PHE_i-1',
'BLOSUM62_NUM_GLY_i-1', 'BLOSUM62_NUM_HIS_i-1', 'BLOSUM62_NUM_ILE_i-1', 'BLOSUM62_NUM_LYS_i-1', 'BLOSUM62_NUM_LEU_i-1', 'BLOSUM62_NUM_MET_i-1',
'BLOSUM62_NUM_ASN_i-1', 'BLOSUM62_NUM_PRO_i-1', 'BLOSUM62_NUM_GLN_i-1', 'BLOSUM62_NUM_ARG_i-1', 'BLOSUM62_NUM_SER_i-1', 'BLOSUM62_NUM_THR_i-1',
'BLOSUM62_NUM_VAL_i-1', 'BLOSUM62_NUM_TRP_i-1', 'BLOSUM62_NUM_TYR_i-1', 'PHI_SIN_i-1', 'PHI_COS_i-1', 'PSI_SIN_i-1', 'PSI_COS_i-1', 'CHI1_SIN_i-1',
'CHI1_COS_i-1', 'CHI1_EXISTS_i-1', 'CHI2_SIN_i-1', 'CHI2_COS_i-1', 'CHI2_EXISTS_i-1', 'BLOSUM62_NUM_ALA_i', 'BLOSUM62_NUM_CYS_i', 'BLOSUM62_NUM_ASP_i',
'BLOSUM62_NUM_GLU_i', 'BLOSUM62_NUM_PHE_i', 'BLOSUM62_NUM_GLY_i', 'BLOSUM62_NUM_HIS_i', 'BLOSUM62_NUM_ILE_i', 'BLOSUM62_NUM_LYS_i', 'BLOSUM62_NUM_LEU_i',
'BLOSUM62_NUM_MET_i', 'BLOSUM62_NUM_ASN_i', 'BLOSUM62_NUM_PRO_i', 'BLOSUM62_NUM_GLN_i', 'BLOSUM62_NUM_ARG_i', 'BLOSUM62_NUM_SER_i', 'BLOSUM62_NUM_THR_i',
'BLOSUM62_NUM_VAL_i', 'BLOSUM62_NUM_TRP_i', 'BLOSUM62_NUM_TYR_i', 'PHI_SIN_i', 'PHI_COS_i', 'PSI_SIN_i', 'PSI_COS_i', 'CHI1_SIN_i', 'CHI1_COS_i',
'CHI1_EXISTS_i', 'CHI2_SIN_i', 'CHI2_COS_i', 'CHI2_EXISTS_i', 'BLOSUM62_NUM_ALA_i+1', 'BLOSUM62_NUM_CYS_i+1', 'BLOSUM62_NUM_ASP_i+1',
'BLOSUM62_NUM_GLU_i+1', 'BLOSUM62_NUM_PHE_i+1', 'BLOSUM62_NUM_GLY_i+1', 'BLOSUM62_NUM_HIS_i+1', 'BLOSUM62_NUM_ILE_i+1', 'BLOSUM62_NUM_LYS_i+1',
'BLOSUM62_NUM_LEU_i+1', 'BLOSUM62_NUM_MET_i+1', 'BLOSUM62_NUM_ASN_i+1', 'BLOSUM62_NUM_PRO_i+1', 'BLOSUM62_NUM_GLN_i+1', 'BLOSUM62_NUM_ARG_i+1',
'BLOSUM62_NUM_SER_i+1', 'BLOSUM62_NUM_THR_i+1', 'BLOSUM62_NUM_VAL_i+1', 'BLOSUM62_NUM_TRP_i+1', 'BLOSUM62_NUM_TYR_i+1', 'PHI_SIN_i+1', 'PHI_COS_i+1',
'PSI_SIN_i+1', 'PSI_COS_i+1', 'CHI1_SIN_i+1', 'CHI1_COS_i+1', 'CHI1_EXISTS_i+1', 'CHI2_SIN_i+1', 'CHI2_COS_i+1', 'CHI2_EXISTS_i+1', 'O__EXISTS_i-1',
'O_d_HA_i-1', 'O__COS_A_i-1', 'O__COS_H_i-1', 'HN__EXISTS_i', 'HN_d_HA_i', 'HN__COS_A_i', 'HN__COS_H_i', 'Ha__EXISTS_i', 'Ha_d_HA_i', 'Ha__COS_A_i',
'Ha__COS_H_i', 'O__EXISTS_i', 'O_d_HA_i', 'O__COS_A_i', 'O__COS_H_i', 'HN__EXISTS_i+1', 'HN_d_HA_i+1', 'HN__COS_A_i+1', 'HN__COS_H_i+1', 'S2_i-1', 'S2_i',
'S2_i+1']
spartap_cols=sp_feat_cols+atom_names+[a+"_RC" for a in atom_names]+["FILE_ID","RESNAME","RES_NUM"]
col_square=["%s_%s_%s"%(a,b,c) for a in ['PHI','PSI'] for b in ['COS','SIN'] for c in ['i-1','i','i+1']]
dropped_cols=["DSSP_%s_%s"%(a,b) for a in ["PHI","PSI"] for b in ['i-1','i','i+1']]+["BMRB_RES_NUM","MATCHED_BMRB","CG","HA2_RING","HA3_RING","RCI_S2","identifier"]
col_lift=[col for col in sp_feat_cols if "BLOSUM" not in col and "_i-1" not in col and "_i+1" not in col]
non_numerical_cols=['3_10_HELIX_SS_i',"A_HELIX_SS_i","BEND_SS_i","B_BRIDGE_SS_i","CHI1_EXISTS_i","CHI2_EXISTS_i","HN__EXISTS_i","Ha__EXISTS_i","NONE_SS_i","O__EXISTS_i","PI_HELIX_SS_i","STRAND_SS_i","TURN_SS_i"]+protein_letters
# These are the names of the columns that are not in Sparta+ but are in the un-augmented features from our extraction
cols_notinsp = dssp_cols + hse_cols + ext_seq_cols
# Now some stuff to read in data
# Read in ShiftX2 data
# Athena
try:
train_sx2 = pd.read_csv('/home/bennett/Documents/DataSets/bmrb_clean/clean_rings/train_shiftx2_clean_rings.csv')
test_sx2 = pd.read_csv('/home/bennett/Documents/DataSets/bmrb_clean/clean_rings/test_shiftx2_clean_rings.csv')
except FileNotFoundError:
pass
# Hermes
try:
train_sx2 = pd.read_csv('/home/kochise/data/shiftx2/clean_rings/train_shiftx2_clean_rings.csv')
test_sx2 = pd.read_csv('/home/kochise/data/shiftx2/clean_rings/test_shiftx2_clean_rings.csv')
except FileNotFoundError:
pass
# Office
try:
# train_sx2 = pd.read_csv('/Users/kcbennett/Documents/data/ShiftX2/bmrb_clean/train_shiftx2_clean_rings.csv')
# test_sx2 = pd.read_csv('/Users/kcbennett/Documents/data/ShiftX2/bmrb_clean/test_shiftx2_clean_rings.csv')
shiftx2_train = pd.read_csv('/Users/kcbennett/Documents/data/ShiftX2/sx2train_dssp.csv')
shiftx2_test = pd.read_csv('/Users/kcbennett/Documents/data/ShiftX2/sx2test_dssp.csv')
except FileNotFoundError:
pass
# Define a function to get targets as differences between the raw shifts and
# the random coil and ring current values
def diff_targets(data, rings=False, coils=True, drop_cols=True):
'''Function that replaces the shifts column with the difference between the raw
shifts and the values in the columns given
data = Feature and target data (Pandas DataFrame)
rings = Subtract ring current columns from shift columns (Bool)
coils = Subtract random coil columns from shift columns (Bool)
drop_cols = Whether or not to drop the corresponding columns after taking the difference (Bool)
'''
df = data.copy()
if rings:
df[atom_names] = df[atom_names].values - df[ring_cols].fillna(0).values
if drop_cols:
df.drop(ring_cols, axis=1, inplace=True)
if coils:
df[atom_names] = df[atom_names].values - df[rcoil_cols].values
if drop_cols:
df.drop(rcoil_cols, axis=1, inplace=True)
return df
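# A minimal usage sketch for diff_targets on synthetic data (the real frames,
# e.g. train_sx2, carry the same atom_names / rcoil_cols / ring_cols layout):
def _demo_diff_targets():
    demo = pd.DataFrame(np.ones((2, len(atom_names))), columns=atom_names)
    for col in rcoil_cols:
        demo[col] = 0.25
    for col in ring_cols:
        demo[col] = np.nan
    # Ring currents are all NaN here, so fillna(0) leaves the shifts untouched
    # by the ring subtraction; the coil subtraction then gives 1.0 - 0.25 = 0.75
    return diff_targets(demo, rings=True, coils=True)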
# Define feature augmentation functions
def feat_pwr(data, columns, pwrs):
'''Function to augment the feature set by adding new columns corresponding to given powers of selected features.
args:
data = Contains the data to be augmented (Pandas DataFrame)
columns = List of columns to be used for the feature augmentation (List of Str)
pwrs = List of exponents to use for feature augmentation (List of Float)
returns:
dat - Augmented data (Pandas DataFrame)
'''
dat = data.copy()
for col in columns:
for power in pwrs:
new_col_name = col + '^' + str(power)
if power < 0:
dat[new_col_name] = 0
dat.loc[dat[col] > 0, new_col_name] = dat.loc[dat[col] > 0, col]**power
else:
dat[new_col_name] = dat[col]**power
return dat
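# A minimal usage sketch for feat_pwr; the column name is illustrative but
# matches the hbondd_cols naming used elsewhere in this module:
def _demo_feat_pwr():
    demo = pd.DataFrame({'O_d_HA_i': [2.0, 0.0, 4.0]})
    aug = feat_pwr(demo, ['O_d_HA_i'], [2.0, -1.0])
    # 'O_d_HA_i^2.0' holds the plain squares; 'O_d_HA_i^-1.0' stays 0 for the
    # zero entry because negative powers are only applied where the value is > 0
    return aug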
def Add_res_spec_feats(dataset,include_onehot=True):
'''
Add residue-specific features to the dataset (current residue only): a one-hot
representation of the residue and its hydrophobicity
'''
if include_onehot:
for code in protein_letters:
dataset[code]=[int(res==code) for res in dataset['RESNAME']]
dataset["HYDROPHOBICITY"]=[hydrophobic_dict[res] for res in dataset['RESNAME']]
# Feature space lifting
def Lift_Space(dataset,participating_cols,increased_dim,w,b):
if w is None:
w = np.random.normal(0, 0.1, (len(participating_cols), increased_dim))
b = np.random.uniform(0, 2*np.pi, increased_dim)
lifted_dat_mat=np.cos(dataset[participating_cols].values.dot(w)+b)
for n in range(increased_dim):
dataset["Lifted_%d"%n]=lifted_dat_mat[:,n]
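# A minimal usage sketch for Lift_Space, which applies a random-Fourier-style
# lifting cos(Xw + b). Passing w=None draws fresh random weights; the frame is
# mutated in place with new Lifted_0 ... Lifted_{increased_dim-1} columns:
def _demo_lift_space():
    demo = pd.DataFrame(np.random.normal(size=(5, 2)), columns=['f0', 'f1'])
    Lift_Space(demo, ['f0', 'f1'], 4, None, None)
    return demo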
def encode_onehot(sequence_feat):
'''
Encode the sequence features into one-hot representation.
sequence_feat: Sequence features with length equal to the total number of examples (N); each
element is a list of residue names of length L (List/DataFrame)
Returns a three-dimensional numpy array that has shape (N,L,20)
'''
RES_CODES=sorted([code.upper() for code in IUPACData.protein_letters_3to1.keys()])
RES_CODES_DICT={code:idx for idx,code in enumerate(RES_CODES)}
onehot=np.zeros((len(sequence_feat),len(sequence_feat[0]),20))
for idx_seq,seq in enumerate(sequence_feat):
for idx_res,amino_acid in enumerate(seq):
if amino_acid in RES_CODES:
onehot[idx_seq][idx_res][RES_CODES_DICT[amino_acid]]=1
return onehot
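# A minimal usage sketch for encode_onehot; names outside the 20 standard
# residues (e.g. padding tokens) are left as all-zero rows:
def _demo_encode_onehot():
    seqs = [['ALA', 'GLY', 'XXX'], ['CYS', 'CYS', 'TRP']]
    onehot = encode_onehot(seqs)
    assert onehot.shape == (2, 3, 20)
    return onehot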
# Adding classification results into features
def Implant_classification(dataset,model,half_window=20):
print("Predicting using classification model...")
input_cols=["RESNAME_i-%d"%n for n in range(half_window,0,-1)]+["RESNAME"]+["RESNAME_i+%d"%n for n in range(1,half_window+1)]
input_classification=encode_onehot(dataset[input_cols].values)
pred=model.predict(input_classification,verbose=1)
class_names=["CLASS_"+str(i+1) for i in range(pred[0].shape[1])]
for col in class_names:
dataset[col]=0
dataset[class_names]=pred[0]
def filter_outlier(data,atom,outlier_path="./"):
'''
Function to filter outliers from training and validation data based on PDBID and RESNUM. The outliers were identified by 10-fold cross-validation with the Sparta+ network, flagging out-of-std predictions
'''
data["identifier"]=data["FILE_ID"].astype(str)+data["RES_NUM"].map(str)
outlier=pd.read_csv(outlier_path+"filtered_"+atom+".csv")
for i in range(len(outlier)):
identifier=outlier.iloc[i]["PDB"]+str(outlier.iloc[i]["RESNUM"])
resname=outlier.iloc[i]["RESNAME"]
filtered=data[data["identifier"]==identifier]
assert len(filtered)<=1
if len(filtered)==1:
# this item needs to be filtered
idx=filtered.iloc[0].name
assert data.loc[idx,"RESNAME"]==resname
data.loc[idx,atom]=np.nan
if((i+1)%100==0):
print("%d/%d"%(i+1,len(outlier)),end="\r")
data.drop("identifier",axis=1,inplace=True)
# Define purifier functions
def dihedral_purifier(data, tol=0.001, drop_cols=True, set_nans=True):
'''Function to purify the given data by eliminating residues for which Biopython and DSSP disagree on the dihedral angles by more than tol in the cosine of Phi_i.
args:
data - Data to be purified (Pandas DataFrame)
tol - Tolerance for dihedral angle Cos values (Float)
drop_cols - Drop columns associated with DSSP PHI/PSI angles (Bool)
set_nans - Replace all shifts with NaN in filtered rows rather than removing said rows (Bool)
returns:
dat - Data with examples removed if Biopython and DSSP disagree by too much (Pandas DataFrame)
'''
dat = data.copy()
phipsidev = np.abs(dat['PHI_COS_i'] - np.cos(np.pi / 180 * dat['DSSP_PHI_i'].astype(np.float64).values))
# Keep rows where Biopython has no Phi angle (cos = sin = 0), where the
# deviation is exactly 1 (e.g. DSSP reports 360.0 for undefined angles), or
# where the two agree to within tol
good_idxs = ((dat['PHI_COS_i'] == 0) & (dat['PHI_SIN_i'] == 0)) | (phipsidev == 1) | (phipsidev <= tol)
if set_nans:
dat.loc[~good_idxs, atom_names] = np.nan
else:
dat = dat.loc[good_idxs]
if drop_cols:
dat = dat.drop(['DSSP_' + i + j for i in ['PHI', 'PSI'] for j in ['_i-1', '_i', '_i+1']], axis=1)
return dat
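# A minimal usage sketch for dihedral_purifier on synthetic data; row 0 agrees
# with DSSP (cos(60 deg) = 0.5) while row 1 disagrees badly, so row 1's shifts
# are set to NaN:
def _demo_dihedral_purifier():
    demo = pd.DataFrame({'PHI_COS_i': [0.5, 0.5], 'PHI_SIN_i': [0.9, 0.9],
                         'DSSP_PHI_i': [60.0, 180.0]})
    for col in atom_names:
        demo[col] = 1.0
    return dihedral_purifier(demo, drop_cols=False)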
def dssp_purifier(data, set_nans=True):
'''Function to remove examples where DSSP fails to classify the secondary structure.
args:
data - Given data to be filtered (Pandas DataFrame)
set_nans - Replace all shifts with NaN in filtered rows rather than removing said rows (Bool)
returns:
dat - Filtered data (Pandas DataFrame)
'''
dat = data.copy()
bad_idxs = dat[(dat[dssp_ssi_cols] == 0).all(axis=1)].index
if set_nans:
dat.loc[bad_idxs, atom_names] = np.nan
else:
dat = dat.drop(bad_idxs)
return dat
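# A minimal usage sketch for dssp_purifier; row 1 has no secondary-structure
# class set, so its shifts are replaced with NaN:
def _demo_dssp_purifier():
    demo = pd.DataFrame(np.zeros((2, len(dssp_ssi_cols))), columns=dssp_ssi_cols)
    demo.loc[0, 'A_HELIX_SS_i'] = 1
    for col in atom_names:
        demo[col] = 1.0
    return dssp_purifier(demo)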
def medianize(data, cols, medians=None):
'''Function to replace zeros by medians for specified columns.
args:
data - Data containing columns with zeros to be replaced (Pandas DataFrame)
cols - Names of columns with zeros that are to be replaced (List)
medians - Explicit list of medians to use for replacement in testing data (List)
returns:
meds - Medians used for replacement (List)
dat - Data with zeros replaced by medians (Pandas DataFrame)
'''
dat = data.copy()
if medians is not None:
meds = medians
else:
meds = [dat.loc[dat[col] != 0, col].median() for col in cols]
med_dict = dict(zip(cols, meds))
for col in cols:
dat.loc[(dat[col] == 0), col] = med_dict[col]
return meds, dat
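# A minimal usage sketch for medianize; the zero in 'O_d_HA_i' is replaced by
# the median of the non-zero entries (2.0), and the returned medians can be
# passed back in to treat test data consistently:
def _demo_medianize():
    demo = pd.DataFrame({'O_d_HA_i': [1.0, 0.0, 2.0, 3.0]})
    meds, fixed = medianize(demo, ['O_d_HA_i'])
    return meds, fixed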
def rc_fix(data, use_null=False):
    '''
    Function to replace zeros in the random coil columns with more reasonable values
    use_null - If True, assign NaN where the random coil value is unknown
    '''
    if use_null:
        checklist = {"N": ["PRO", np.nan], "H": ["PRO", np.nan], "CB": ["GLY", np.nan]}
    else:
        checklist = {"N": ["PRO", 119.5], "H": ["PRO", 8.23], "CB": ["GLY", 36.8]}
    # Only the atoms in the checklist have residues with undefined random coil values
    for atom in checklist:
        zero_indices = data[data["RCOIL_" + atom] == 0].index
        for idx in zero_indices:
            assert data.loc[idx, "RESNAME"] == checklist[atom][0]
            data.loc[idx, "RCOIL_" + atom] = checklist[atom][1]
def ha23ambigfix(data, mode=0):
'''Function to use the HA2/HA3 ring current calculations in a way that resolves ambiguity.
args:
data - Data to fix (Pandas DataFrame)
mode - Method of resolving ambiguity, 0 -> use average, 1 -> use HA2, 2 -> use HA3 (Int)
returns:
dat - Data with ring current ambiguity resolved and extra columns dropped
'''
dat = data.copy()
idx_ha = [i for i in list(dat[dat['HA2_RING'].notnull()].index)]
if mode==0:
for i in idx_ha:
dat.loc[i, 'HA_RC'] = (dat.loc[i, 'HA2_RING'] + dat.loc[i, 'HA3_RING']) / 2
elif mode==1:
for i in idx_ha:
dat.loc[i, 'HA_RC'] = dat.loc[i, 'HA2_RING']
elif mode==2:
for i in idx_ha:
dat.loc[i, 'HA_RC'] = dat.loc[i, 'HA3_RING']
dat = dat.drop(['HA2_RING', 'HA3_RING'], axis=1)
return dat
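# A minimal usage sketch for ha23ambigfix; with mode=0 the HA2/HA3 ring
# currents are averaged into HA_RC and the ambiguous columns are dropped:
def _demo_ha23ambigfix():
    demo = pd.DataFrame({'HA2_RING': [1.0, np.nan], 'HA3_RING': [3.0, np.nan],
                         'HA_RC': [0.0, 5.0]})
    return ha23ambigfix(demo, mode=0)  # HA_RC becomes [2.0, 5.0]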
def check_nan_shifts(data, thr, ends=False):
'''Function to count the non-NaN shifts in each row; for each residue with fewer than thr non-NaN shifts, set all shifts for that residue and its neighbors to NaN so that those residues are excluded from training/evaluation.
args:
data - Data to be cleansed (Pandas DataFrame)
thr - Number of non-NaN shifts required for a residue and neighbors to survive (Int)
ends - Set shifts of atoms on first and last residue of each chain to be NaN as well (Bool)
returns:
dat - Cleaned data (Pandas DataFrame)
'''
dat = data.copy()
    for idx in range(len(dat)):
        snums = dat.loc[idx, atom_names].count()
        file = dat.loc[idx, 'PDB_FILE_NAME']
        chain = dat.loc[idx, 'CHAIN']
        # Look up the neighboring residues up front (where they exist) so the
        # comparisons below are always well-defined
        if idx > 0:
            file_m1 = dat.loc[idx - 1, 'PDB_FILE_NAME']
            chain_m1 = dat.loc[idx - 1, 'CHAIN']
        if idx < (len(dat) - 1):
            file_p1 = dat.loc[idx + 1, 'PDB_FILE_NAME']
            chain_p1 = dat.loc[idx + 1, 'CHAIN']
        if snums < thr:
            dat.loc[idx, atom_names] = np.nan
            if (idx > 0) and (file == file_m1) and (chain == chain_m1):
                dat.loc[idx - 1, atom_names] = np.nan
            if (idx < len(dat) - 1) and (file == file_p1) and (chain == chain_p1):
                dat.loc[idx + 1, atom_names] = np.nan
        if ends:
            if (idx == 0) or (idx == len(dat) - 1):
                dat.loc[idx, atom_names] = np.nan
            elif (file != file_m1) or (chain != chain_m1):
                # First residue of a new chain: blank it and the last residue
                # of the preceding chain
                dat.loc[idx, atom_names] = np.nan
                dat.loc[idx - 1, atom_names] = np.nan
            elif (file != file_p1) or (chain != chain_p1):
                # Last residue of a chain: blank it and the first residue of
                # the following chain
                dat.loc[idx, atom_names] = np.nan
                dat.loc[idx + 1, atom_names] = np.nan
return dat
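# whitener is called by sx2_dprep below but is not defined in this module; this
# is a hedged minimal sketch matching the call sites, assuming it standardizes
# the listed columns to zero mean and unit variance and returns the statistics
# so they can be reused on test data:
def whitener(cols, data, means=None, stds=None, test_data=False):
    dat = data.copy()
    if not test_data:
        means = [dat[col].mean() for col in cols]
        stds = [dat[col].std() for col in cols]
    for col, m, s in zip(cols, means, stds):
        # Guard against zero spread to avoid division by zero
        dat[col] = (dat[col] - m) / s if s else dat[col] - m
    return means, stds, dat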
def raw_dprep(data, ha23mode=0, power_dict={2.0 : square_cols, -1.0 : hbondd_cols, -2.0 : hbondd_cols, -3.0 : hbondd_cols}, diff_rings=False):
'''Function to prepare freshly-extracted data.
args:
data - Data to be prepared (Pandas DataFrame)
ha23mode - Method of resolving ambiguity for HA2/3; see the ha23ambigfix function (Int, 0, 1, or 2)
power_dict - Dictionary mapping powers to lists of column names. Each listed column is raised to the corresponding power and the result is stored as a new column, augmenting the feature set (Dict)
diff_rings - Subtract the ring current contributions from the shifts. Only use this if not doing kfold with the result, since it requires pre-medianizing the ring current contributions to eliminate NaN values, which in principle should be done separately at each fold (Bool)
returns:
dat - Cleaned data ready to be inserted into a kfold_crossval or model-generating function
'''
dat = data.rename(index=str, columns=sx2_rename_map)
dat.index = pd.RangeIndex(start=0, stop=len(dat), step=1)
dat = ha23ambigfix(dat, mode=ha23mode)
try: # If DSSP columns are available, use them to filter a few bad residues based on the criteria in dihedral_purifier and dssp_purifier
dat = dihedral_purifier(dat, drop_cols=True)
dat = dssp_purifier(dat)
except KeyError:
pass
for pwr in power_dict.keys(): # Augment the feature set with a few non-linear transformations
dat = feat_pwr(dat, power_dict[pwr], [pwr])
# It is assumed that random coil columns are contained in the data
if diff_rings:
for col in ring_cols:
null_idxs = [int(i) for i in list(dat[dat[col].isnull()].index)]
dat.loc[null_idxs, col] = dat[col].median()
dat = diff_targets(dat, rings=True, coils=True)
else:
dat = diff_targets(dat, rings=False, coils=True)
dat.index = pd.RangeIndex(start=0, stop=len(dat), step=1)
return dat
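# A typical call, assuming the extracted frame already carries the RCOIL_*
# columns required by diff_targets:
#   prepped = raw_dprep(shiftx2_train, ha23mode=0)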
def sx2_dprep(train_data, test_data, ha23mode=0, med_cols=angle_cols+hbondd_cols, add_squares=square_cols, med_rings=False, diff_rings=False, medians=True, normalize=False):
'''Function to completely prepare test and train data from shiftx2 extraction.
args:
train_data = Training data to prepare (Pandas DataFrame)
test_data = Test data to prepare (Pandas DataFrame)
ha23mode = Mode for resolving HA2/HA3 ring current ambiguity (0, 1, or 2 -- see ha23ambigfix function)
med_cols = Columns whose zeros are replaced by medians (List of Str)
add_squares = Columns to square for feature augmentation, or None to skip (List of Str or None)
med_rings = Fill NaN ring current values with the training-set medians (Bool)
diff_rings = Subtract the ring current contributions from the shifts (Bool)
medians = Whether to apply medianize to med_cols (Bool)
normalize = Whiten the feature columns using training-set means and stds (Bool)
returns:
train_dat - Prepared training data (Pandas DataFrame)
test_dat - Prepared test data (Pandas DataFrame)
'''
train_dat = train_data.rename(index=str, columns=sx2_rename_map)
test_dat = test_data.rename(index=str, columns=sx2_rename_map)
train_dat.index = pd.RangeIndex(start=0, stop=len(train_dat), step=1)
test_dat.index = pd.RangeIndex(start=0, stop=len(test_dat), step=1)
train_dat = dihedral_purifier(train_dat)
test_dat = dihedral_purifier(test_dat)
train_dat = dssp_purifier(train_dat)
test_dat = dssp_purifier(test_dat)
if medians:
train_medians, train_dat = medianize(train_dat, med_cols)
_, test_dat = medianize(test_dat, med_cols, train_medians)
train_dat = ha23ambigfix(train_dat, mode=ha23mode)
test_dat = ha23ambigfix(test_dat, mode=ha23mode)
# Fill in ring columns with medians
if med_rings:
for col in ring_cols:
null_idxs_train = [int(i) for i in list(train_dat[train_dat[col].isnull()].index)]
null_idxs_test = [int(i) for i in list(test_dat[test_dat[col].isnull()].index)]
train_dat.loc[null_idxs_train, col] = train_dat[col].median()
test_dat.loc[null_idxs_test, col] = train_dat[col].median()
# Add square columns
if add_squares is not None:
train_dat = feat_pwr(train_dat, add_squares, [2.0])
test_dat = feat_pwr(test_dat, add_squares, [2.0])
# Difference the raw shifts with random coil (and maybe ring current) contributions
train_dat = diff_targets(train_dat, rings=diff_rings, coils=True)
test_dat = diff_targets(test_dat, rings=diff_rings, coils=True)
# Normalize all columns
if normalize:
sx2_means, sx2_stds, train_dat = whitener(struc_cols + col_blosum + ring_cols + hse_cols + dssp_norm_cols, train_dat)
_, _, test_dat = whitener(struc_cols + col_blosum + ring_cols + hse_cols + dssp_norm_cols, test_dat, means=sx2_means, stds=sx2_stds, test_data=True)
train_dat.index = pd.RangeIndex(start=0, stop=len(train_dat), step=1)
test_dat.index = pd.RangeIndex(start=0, stop=len(test_dat), step=1)
return train_dat, test_dat
def sp_dprep(data, ha23mode=0, spfeats_only=False, med_rings=True, add_squares=square_cols, diff_rings=False):
'''Function to prepare Sparta+ data.'''
dat = data.rename(index=str, columns=sx2_rename_map)
dat.index = pd.RangeIndex(start=0, stop=len(dat), step=1)
dat = dihedral_purifier(dat, drop_cols=not spfeats_only)
dat = dssp_purifier(dat)
if spfeats_only:
dat = dat.drop(dssp_cols + hse_cols + ext_seq_cols, axis=1)
else:
dat = feat_pwr(dat, add_squares, [2.0])
dat = ha23ambigfix(dat, mode=ha23mode)
if med_rings:
for col in ring_cols:
null_idxs = [int(i) for i in list(dat[dat[col].isnull()].index)]
dat.loc[null_idxs, col] = dat[col].median()
dat = diff_targets(dat, rings=diff_rings, coils=True)
dat.index = pd.RangeIndex(start=0, stop=len(dat), step=1)
return dat
def hbond_purifier(data, ang_tol=None, drop_phi=False):
'''Function to filter data by H-bond angles or to drop Phi angles.
args:
data - Data to be filtered (Pandas DataFrame)
ang_tol - Tolerance for cosine angle filtration about 180 degrees (None or Float)
drop_phi - Whether or not to drop the Phi angles (Bool)
returns:
dat - Filtered data (Pandas DataFrame)'''
dat = data.copy()
if ang_tol is not None:
cos_tol = np.cos((180 - ang_tol) * np.pi/180)
hyd_angs = [i + '__COS_H_' + j for i in ['HN', 'Ha', 'O'] for j in ['i-1', 'i', 'i+1']]
for col in hyd_angs:
if col in dat.columns:
dat = dat[(dat[col] < cos_tol) | (dat[col] == 0)]
if drop_phi:
phi_angs = [i + '__COS_A_' + j for i in ['HN', 'Ha', 'O'] for j in ['i-1', 'i', 'i+1']]
for col in phi_angs:
if col in dat.columns:
dat = dat.drop(col, axis=1)
return dat
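# A minimal usage sketch for hbond_purifier; with ang_tol=30, rows whose H-bond
# angle deviates from linear (180 deg) by more than 30 deg are dropped, while
# exact zeros (no H-bond present) are kept:
def _demo_hbond_purifier():
    demo = pd.DataFrame({'HN__COS_H_i': [-0.99, -0.5, 0.0]})
    return hbond_purifier(demo, ang_tol=30)  # Drops the middle row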
# These few lines check the existence of chi angles for residues with alternative chi definitions
def get_chis(data, resname):
    # BLOSUM62 values against (ALA, CYS, ASP, GLU, PHE) uniquely identify each of these residue types
    resname_blos_dict = {'ASN' : [-2, -3, 1, 0, -3], 'HIS' : [-2, -3, 1, 0, -1], 'TRP' : [-3, -2, -4, -3, 1], 'ASP' : [-2, -3, 6, 2, -3], 'LEU' : [-1, -1, -4, -3, 0], 'PHE' : [-2, -2, -3, -3, 6], 'TYR' : [-2, -2, -3, -2, 3], 'VAL' : [0, -1, -3, -2, -1], 'ILE' : [-1, -1, -3, -3, 0]}
    blos = resname_blos_dict[resname]
if resname == 'VAL':
chi_name = 'CHI1_EXISTS_i'
else:
chi_name = 'CHI2_EXISTS_i'
dat = data[(data['BLOSUM62_NUM_ALA_i'] == blos[0]) & (data['BLOSUM62_NUM_CYS_i'] == blos[1]) & (data['BLOSUM62_NUM_ASP_i'] == blos[2]) & (data['BLOSUM62_NUM_GLU_i'] == blos[3]) & (data['BLOSUM62_NUM_PHE_i'] == blos[4])]
return dat[chi_name]
def chi2_exist_nums(data, resname):
'''Returns the total number of residues with the given resname as well as the number of those residues for which chi2 does not exist'''
chis = get_chis(data, resname)
nres = len(chis)
nchis = len(chis[chis < 9])
return (nres, nchis)
bestH_fckwargs = {'activ' : 'prelu', 'lrate' : 0.001, 'mom' : [0.9, 0.999], 'dec' : 1*10**-6, 'epochs' : 500, 'min_epochs' : 50, 'opt_type' : 'adam', 'do' : 0.4, 'reg' : 0*10**-5,
'reg_type' : 'L2', 'tol' : 4, 'early_stop' : None, 'bnorm' : True, 'nest' : True, 'lsuv' : True, 'clip_norm' : 0.0, 'clip_val' : 0.0, 'reject_outliers' : None,
'opt_override' : True, 'noise' : None, 'noise_type': 'angle', 'noise_dist' : 'uniform', 'batch_size' : 64, 'lsuv_batch' : 2048}
H_fckwargs = {'activ' : 'tanh', 'lrate' : 0.0001, 'mom' : [0.9, 0.999], 'dec' : 1*10**-6, 'epochs' : 450, 'min_epochs' : 50, 'opt_type' : 'sgd', 'do' : 0.0, 'reg' : 1*10**-6,
'reg_type' : 'L2', 'tol' : 3, 'pretrain' : 'PQ', 'bnorm' : False, 'nest' : True, 'lsuv' : False, 'clip_norm' : 0.0, 'clip_val' : 0.0, 'reject_outliers' : None,
'opt_override' : False, 'noise' : None, 'noise_type': 'percent', 'noise_dist' : 'normal', 'batch_size' : 64, 'lsuv_batch' : 2048}