-
Notifications
You must be signed in to change notification settings - Fork 2
/
eval_utils.py
58 lines (49 loc) · 2.1 KB
/
eval_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import numpy as np
from sklearn import metrics
from munkres import Munkres
def cluster_metric(label, pred):
    """
    Print clustering quality scores (ACC, NMI, ARI) as percentages.

    label:  ground-truth labels
    pred:   predicted cluster assignments

    NMI and ARI are computed directly on the raw assignments. Accuracy is
    computed after the predicted clusters have been remapped onto the labels
    by get_y_preds, using the number of distinct values in `label` as the
    cluster count. The scores are only printed; nothing is returned.
    """
    nmi_score = metrics.normalized_mutual_info_score(label, pred)
    ari_score = metrics.adjusted_rand_score(label, pred)

    # Accuracy is only meaningful once cluster ids are matched to label ids.
    n_classes = len(set(label))
    aligned_pred = get_y_preds(label, pred, n_classes)
    accuracy = metrics.accuracy_score(aligned_pred, label)

    report = "[Clustering Result]: ACC = {:.2f}, NMI = {:.2f}, ARI = {:.2f}"
    print(report.format(accuracy * 100, nmi_score * 100, ari_score * 100))
def calculate_cost_matrix(C, n_clusters):
    """
    Build the assignment cost matrix from a confusion matrix.

    C:          2-D count matrix indexed as C[row, column]
    n_clusters: number of rows/columns of C to include in the cost matrix

    returns:    float array of shape (n_clusters, n_clusters) where entry
                [j, i] equals (sum of column j of C) - C[i, j], i.e. the
                number of samples counted in column j that do not fall in
                row i
    """
    costs = np.zeros((n_clusters, n_clusters))
    for column in range(n_clusters):
        # Total count over every row of C for this column.
        column_total = np.sum(C[:, column])
        # Row `column` of the result: cost of pairing this column of C
        # with each candidate row of C.
        costs[column] = [
            column_total - C[row, column] for row in range(n_clusters)
        ]
    return costs
def get_cluster_labels_from_indices(indices):
    """
    Extract the second element of every pair in `indices`.

    indices:    sequence of pairs (such as the (row, column) pairs produced
                by Munkres().compute)

    returns:    float array whose k-th entry is the second element of the
                k-th pair
    """
    labels = np.zeros(len(indices))
    for position, (_, assigned) in enumerate(indices):
        labels[position] = assigned
    return labels
def get_y_preds(y_true, cluster_assignments, n_clusters):
    """
    Relabel cluster assignments so that they line up with the true labels.

    The one-to-one matching between cluster ids and label ids is found by
    running Munkres on a cost matrix derived from the confusion matrix of
    y_true against the cluster assignments.

    y_true:                 true labels
    cluster_assignments:    array of integer cluster ids (e.g. outputted by
                            kmeans); the ids do not have to start at 0
    n_clusters:             number of clusters in the dataset

    returns:    array with one entry per sample holding the label index
                matched to that sample's cluster (float dtype, because the
                mapping array is built with np.zeros)

    NOTE(review): the returned values are positions in the sorted label set
    used by the confusion matrix, so they coincide with the true label
    values only when y_true is encoded as 0..n_clusters-1 -- confirm against
    callers.
    """
    cluster_assignments = np.asarray(cluster_assignments)
    # Shift the cluster ids so they start at 0 BEFORE counting. The mapping
    # computed below is indexed by these ids, so the confusion matrix and the
    # final lookup must both see the same (shifted) values.
    cluster_assignments = cluster_assignments - np.min(cluster_assignments)

    confusion_matrix = metrics.confusion_matrix(
        y_true, cluster_assignments, labels=None
    )
    # Optimal 1:1 assignment of clusters to labels.
    cost_matrix = calculate_cost_matrix(confusion_matrix, n_clusters)
    indices = Munkres().compute(cost_matrix)
    kmeans_to_true_cluster_labels = get_cluster_labels_from_indices(indices)

    # Translate every sample's cluster id into its matched label index.
    y_pred = kmeans_to_true_cluster_labels[cluster_assignments]
    return y_pred