-
Notifications
You must be signed in to change notification settings - Fork 0
/
utility.py
92 lines (83 loc) · 4.34 KB
/
utility.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from rdkit import Chem
from rdkit.Chem import rdmolops
import torch
import numpy as np
# Build a molecular graph representation from a SMILES string
def mol_to_graph(smiles):
    """Parse ``smiles`` with RDKit and describe the molecule as a graph.

    Args:
        smiles: SMILES string handed to ``Chem.MolFromSmiles``.

    Returns:
        ``None`` when ``Chem.MolFromSmiles`` returns ``None``; otherwise a
        dict with two entries:

        * ``'atoms'``: one five-element list per atom holding, in order, the
          results of ``GetAtomicNum``, ``GetDegree``, ``GetTotalNumHs``,
          ``GetImplicitValence`` and ``GetIsAromatic``.
        * ``'adjacency_matrix'``: the result of
          ``rdmolops.GetAdjacencyMatrix`` for the molecule.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None

    # One feature row per atom, in the order RDKit iterates the atoms
    per_atom_features = [
        [
            atom.GetAtomicNum(),        # atomic number
            atom.GetDegree(),           # degree of the atom
            atom.GetTotalNumHs(),       # total number of attached hydrogens
            atom.GetImplicitValence(),  # implicit valence
            atom.GetIsAromatic(),       # aromaticity flag
        ]
        for atom in molecule.GetAtoms()
    ]
    return {
        'atoms': per_atom_features,
        'adjacency_matrix': rdmolops.GetAdjacencyMatrix(molecule),
    }
# Convert a protein sequence into its N-gram windows
def sequence_to_ngram(sequence, n=3):
    """Return every contiguous window of length ``n`` taken from ``sequence``.

    Windows are produced left to right with a stride of one, so a sequence of
    length ``L`` yields ``L - n + 1`` windows. A sequence shorter than ``n``
    yields an empty list.

    Args:
        sequence: Sliceable sequence with a length (typically a protein string).
        n: Window length; must be at least 1.

    Returns:
        List of slices of ``sequence``, each of length ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # A non-positive window size would otherwise silently produce empty
        # or misaligned slices instead of real N-grams.
        raise ValueError(f"n must be a positive integer, got {n!r}")
    return [sequence[start:start + n] for start in range(len(sequence) - n + 1)]
# Turn a protein sequence into a list of per-N-gram molecular graphs
def process_sequence(sequence, max_length=1000, ngram_size=3):
    """Build one graph dict per N-gram of a length-normalised sequence.

    The sequence is first cut to ``max_length`` characters when it is longer,
    or right-padded with ``'0'`` up to ``max_length`` otherwise. Each N-gram
    produced by ``sequence_to_ngram`` is then passed to
    ``Chem.MolFromSequence``; N-grams for which that call returns a falsy
    value are skipped.

    Args:
        sequence: Protein sequence string.
        max_length: Length the sequence is truncated or padded to.
        ngram_size: Window length forwarded to ``sequence_to_ngram``.

    Returns:
        List of dicts, each with ``'atoms'`` (one five-element feature list
        per atom) and ``'adjacency_matrix'`` (result of
        ``rdmolops.GetAdjacencyMatrix``).
    """
    # Truncate when too long; otherwise pad on the right with '0'.
    # NOTE(review): '0' is not an amino-acid letter - confirm how
    # Chem.MolFromSequence treats windows that contain padding.
    normalised = (
        sequence[:max_length]
        if len(sequence) > max_length
        else sequence.ljust(max_length, '0')
    )

    graphs = []
    for fragment in sequence_to_ngram(normalised, ngram_size):
        fragment_mol = Chem.MolFromSequence(fragment)  # parse the peptide fragment
        if not fragment_mol:
            continue

        features = [
            [
                atom.GetAtomicNum(),        # atomic number
                atom.GetDegree(),           # degree of the atom
                atom.GetTotalNumHs(),       # total number of attached hydrogens
                atom.GetImplicitValence(),  # implicit valence
                atom.GetIsAromatic(),       # aromaticity flag
            ]
            for atom in fragment_mol.GetAtoms()
        ]
        graphs.append({
            'atoms': features,
            'adjacency_matrix': rdmolops.GetAdjacencyMatrix(fragment_mol),
        })
    return graphs
# Process one (SMILES, protein sequence, affinity) sample into graphs and a label
def process_data(smiles, sequence, affinity, max_length=1000, ngram_size=3):
    """Convert a raw sample into its graph representations and numeric label.

    Args:
        smiles: SMILES string passed to ``mol_to_graph``.
        sequence: Protein sequence passed to ``process_sequence``.
        affinity: Label value; converted with ``float``.
        max_length: Forwarded to ``process_sequence``.
        ngram_size: Forwarded to ``process_sequence``.

    Returns:
        Tuple ``(mol_graph, ngram_graphs, affinity)`` where ``mol_graph`` is
        the result of ``mol_to_graph``, ``ngram_graphs`` the result of
        ``process_sequence`` and ``affinity`` a ``float``.
    """
    # Tuple elements are evaluated left to right: molecule, sequence, label
    return (
        mol_to_graph(smiles),
        process_sequence(sequence, max_length, ngram_size),
        float(affinity),
    )
# Convert a graph dict into float tensors
def tensors_from_graph(graph):
    """Convert the two entries of a graph dict into float tensors.

    Args:
        graph: Mapping with ``'atoms'`` and ``'adjacency_matrix'`` entries,
            each in a form accepted by ``torch.tensor``.

    Returns:
        Tuple ``(atoms, adjacency_matrix)`` of tensors with dtype
        ``torch.float``.
    """
    # Same conversion for both entries; atoms first, then the adjacency matrix
    node_tensor, adjacency_tensor = (
        torch.tensor(graph[key], dtype=torch.float)
        for key in ('atoms', 'adjacency_matrix')
    )
    return node_tensor, adjacency_tensor
# Process a (SMILES, protein sequence, affinity) sample and convert it to tensors
def process_and_convert_to_tensors(smiles, sequence, affinity, max_length=1000, ngram_size=3):
    """Run ``process_data`` on a sample and convert every graph to tensors.

    Args:
        smiles: SMILES string of the molecule.
        sequence: Protein sequence string.
        affinity: Label value, converted to ``float`` by ``process_data``.
        max_length: Forwarded to ``process_data``.
        ngram_size: Forwarded to ``process_data``.

    Returns:
        Tuple ``((mol_atoms, mol_adj), ngram_atoms_adj, affinity)``.
        ``mol_atoms`` and ``mol_adj`` are both ``None`` when the molecular
        graph is falsy. ``ngram_atoms_adj`` is a list of ``(atoms, adj)``
        pairs, one per N-gram graph, with ``(None, None)`` for any falsy
        graph.
    """
    mol_graph, ngram_graphs, affinity = process_data(
        smiles, sequence, affinity, max_length, ngram_size
    )

    # Molecular graph -> tensor pair, or a (None, None) placeholder
    mol_atoms, mol_adj = tensors_from_graph(mol_graph) if mol_graph else (None, None)

    # N-gram graphs -> list of tensor pairs, with the same placeholder rule
    ngram_atoms_adj = [
        tuple(tensors_from_graph(entry)) if entry else (None, None)
        for entry in ngram_graphs
    ]

    # Drop the local references to the intermediate graph structures
    del mol_graph, ngram_graphs
    return (mol_atoms, mol_adj), ngram_atoms_adj, affinity
if __name__ == "__main__":
    # Demo run on a tiny hand-written sample (ethanol plus a short peptide)
    sample_smiles, sample_sequence, sample_affinity = "CCO", "MTVKTEAAKGTL", "11.1"

    mol_graph_tensors, ngram_graphs_tensors, affinity = process_and_convert_to_tensors(
        sample_smiles, sample_sequence, sample_affinity
    )

    # Print each result next to its label, in the order they were returned
    labelled_outputs = (
        ("Molecular Graph Tensors:", mol_graph_tensors),
        ("N-Gram Graphs Tensors:", ngram_graphs_tensors),
        ("Affinity:", affinity),
    )
    for label, value in labelled_outputs:
        print(label, value)