-
Notifications
You must be signed in to change notification settings - Fork 0
/
cluster.py
141 lines (124 loc) · 5.73 KB
/
cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env python3
# Construct the doculect-by-sound correspondence matrix.
# Perform agglomerative clustering.
# NOTE: This is the main program in this repo. It calls all other relevant
# scripts for preprocessing the data, performing graph co-clustering,
# and saving the results.
import pickle

import matplotlib.pyplot as plt
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

from align import align
from bsgc import bsgc_hierarchical
from print_output import print_clusters
def construct_A(all_correspondences, correspondences, doculects,
                min_count=0, binary=False):
    """Build the doculect-by-correspondence count matrix.

    Args:
        all_correspondences: Correspondences eligible for inclusion;
            anything outside this collection is ignored.
        correspondences: Dict mapping each doculect to a dict of
            correspondence -> occurrence count.
        doculects: Doculect names; their order fixes the row order of A.
        min_count: Per-doculect counts below this threshold are dropped
            (both when assigning columns and when filling the matrix).
        binary: If True, the returned A is boolean (presence/absence);
            the raw count matrix is still returned separately.

    Returns:
        Tuple (A, A_original, corres2int, all_corres): the (possibly
        binarized) matrix, the untouched count matrix, the
        correspondence -> column-index mapping, and the correspondences
        in column order.
    """
    # Assign a column index to each correspondence that clears min_count
    # in at least one doculect (first-seen order).
    corres2int = {}
    for per_doculect in correspondences.values():
        for corres, count in per_doculect.items():
            if corres in corres2int:
                continue
            if corres not in all_correspondences:
                continue
            if count >= min_count:
                corres2int[corres] = len(corres2int)

    A = np.zeros((len(doculects), len(corres2int)), dtype=np.int16)
    for row, doculect in enumerate(doculects):
        for corres, count in correspondences[doculect].items():
            if corres in all_correspondences and count >= min_count:
                A[row, corres2int[corres]] = count
    print("Matrix shape: {}".format(A.shape))

    A_original = A.copy()
    if binary:
        A = A.astype(np.bool_)

    if min_count == 0:
        # Nothing was filtered out, so the input ordering is the column order.
        all_corres = all_correspondences
    else:
        # Recover the column order from the index mapping.
        all_corres = [corres for corres, _ in
                      sorted(corres2int.items(), key=lambda kv: kv[1])]
    return A, A_original, corres2int, all_corres
def agglomerative(A, doculects, context):
    """Cluster doculects via UPGMA over cosine distance of tf-idf features.

    Saves a dendrogram PDF plus pickles of the linkage matrix and the
    cosine-similarity matrix under output/, then returns the list of
    clusters (each a list of doculect names), one entry per leaf and per
    merge in the dendrogram.

    Args:
        A: Doculect-by-correspondence count matrix.
        doculects: Row labels for A, in matrix order.
        context: Tag used in the output filenames.
    """
    A = TfidfTransformer().fit_transform(A)
    # Compute the similarity matrix once; it is needed both for the
    # distances below and for the pickle at the end.
    sim = cosine_similarity(A)
    dist = 1 - sim
    # BUG FIX: linkage() treats a 2-D input as raw observation vectors,
    # not as a distance matrix, so the square matrix must be condensed
    # first. checks=False tolerates the tiny floating-point asymmetry
    # cosine_similarity can produce.
    Z = linkage(squareform(dist, checks=False), method='average')
    fig, ax = plt.subplots()
    dendrogram(
        Z,
        labels=doculects,
        orientation='right',
        leaf_font_size=12.)
    fig.savefig('output/dendrogram-{}.pdf'.format(context),
                bbox_inches='tight')
    plt.close(fig)  # release the figure; repeated calls would leak otherwise
    # Append cluster IDs (n_samples .. n_samples + n_merges - 1) as an
    # extra column so each merge row carries the ID of the cluster it
    # creates, matching scipy's implicit numbering.
    n_samples = len(doculects)
    cluster_ids = np.arange(n_samples, n_samples + Z.shape[0]) \
        .reshape(-1, 1)
    Z = np.hstack((Z, cluster_ids))
    with open('output/dendrogram-{}.pickle'.format(context), 'wb') as f:
        pickle.dump(Z, f)
    with open('output/cosinesim-{}.pickle'.format(context), 'wb') as f:
        pickle.dump(sim, f)
    # Expand every merge into the full list of doculects it contains.
    # (Z rows hold float IDs; float keys hash equal to their int values.)
    cluster2docs = {i: [d] for i, d in enumerate(doculects)}
    for row in Z:
        cluster2docs[row[-1]] = cluster2docs[row[0]] + cluster2docs[row[1]]
    clusters_and_doculects = [docs for _, docs in cluster2docs.items()]
    return clusters_and_doculects
if __name__ == "__main__":
    # Align the data once; all four clustering variants below reuse it.
    correspondences, all_correspondences, doculects, corres2lang2word = align(
        no_context=True, context_cv=True, context_sc=True, min_count=3,
        alignment_type='lib', alignment_mode='global', verbose=1)
    # Context-free correspondences are the plain 2-tuples (no environment).
    corres_no_context = [c for c in all_correspondences if len(c) == 2]
    doculect2int = {doc: idx for idx, doc in enumerate(doculects)}

    # --- tf-idf features, with context ---
    print("Constructing features for tfidf-context.")
    feat, feat_raw, corres_idx, corres_order = construct_A(
        all_correspondences, correspondences, doculects)
    print("Creating dendrogram.")
    cluster_lists = agglomerative(feat, doculects, context='context')
    print("Scoring.")
    print_clusters("output/tfidf-context.txt", feat_raw,
                   cluster_lists, doculect2int, corres_idx,
                   corres2lang2word, doculects, corres_order)

    # --- tf-idf features, without context ---
    print("\nConstructing features for tfidf-nocontext.")
    feat, feat_raw, corres_idx, corres_order = construct_A(
        corres_no_context, correspondences, doculects)
    print("Creating dendrogram.")
    cluster_lists = agglomerative(feat, doculects, context='nocontext')
    print("Scoring.")
    print_clusters("output/tfidf-nocontext.txt", feat_raw,
                   cluster_lists, doculect2int, corres_idx,
                   corres2lang2word, doculects, corres_order)

    # --- bipartite spectral graph co-clustering, with context ---
    print("\nConstructing features for bsgc-context.")
    feat, feat_raw, corres_idx, corres_order = construct_A(
        all_correspondences, correspondences, doculects, binary=True)
    print("Clustering.")
    cluster_lists, corres_clusters = bsgc_hierarchical(
        feat, doculects, corres_order)
    print("Scoring.")
    print_clusters("output/bsgc-context.txt", feat_raw,
                   cluster_lists, doculect2int, corres_idx,
                   corres2lang2word, doculects, corres_order,
                   corres_clusters)

    # --- bipartite spectral graph co-clustering, without context ---
    print("\nConstructing features for bsgc-nocontext.")
    feat, feat_raw, corres_idx, corres_order = construct_A(
        corres_no_context, correspondences, doculects, binary=True)
    print("Clustering.")
    cluster_lists, corres_clusters = bsgc_hierarchical(
        feat, doculects, corres_order)
    print("Scoring.")
    print_clusters("output/bsgc-nocontext.txt", feat_raw,
                   cluster_lists, doculect2int, corres_idx,
                   corres2lang2word, doculects, corres_order,
                   corres_clusters)