-
Notifications
You must be signed in to change notification settings - Fork 0
/
clustering.py
67 lines (54 loc) · 1.65 KB
/
clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import pandas as pd
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import sys
#from mpl_toolkits.mplot3d import Axes3D
# Ask which stock to process. The answer is spliced verbatim into the input
# path here and into the output path used when the clustered data is saved.
f = input('Enter a Stock : ')
df = pd.read_csv('sentimentdata/' + f + 'sentiment.csv', index_col=0,
                 encoding='latin-1')

# Keep a handle on the complete frame (still holding 'text' and 'date') so
# the cluster labels can be attached to it and written out later.
dfold = df

# drop() returns a new frame, so `dfold` is left untouched. `axis` is passed
# by keyword because recent pandas releases no longer accept it positionally.
df = df.drop('text', axis=1)
print(df.head())
df = df.drop('date', axis=1)
print(df.head())

# `df_tr` is a second name for the trimmed frame, not a copy: columns added
# through `df_tr` are visible through `df` as well.
df_tr = df
# select proper number of clusters
#
# Disabled elbow-curve experiment, kept for reference. It was previously
# wrapped in a bare triple-quoted string, which Python evaluates as an
# expression rather than treating it as a comment.
# NOTE: the snippet fits and scores on 'followers' (Y) only; X is assigned
# but never used.
#
#   Y = df[['followers']]
#   X = df[['polarity']]
#   Nc = range(1, 20)
#   kmeans = [KMeans(n_clusters=i) for i in Nc]
#   score = [kmeans[i].fit(Y).score(Y) for i in range(len(kmeans))]
#   plt.plot(Nc,score)
#   plt.xlabel('Number of Clusters')
#   plt.ylabel('Score')
#   plt.title('Elbow Curve')
#   plt.show()
#
# elbow plot showed the point of dropoff to be around 5 clusters

# Standardize: z-score the three feature columns before clustering.
clmns = ['followers', 'polarity', 'sentiment_confidence']
df_tr_std = stats.zscore(df_tr[clmns])

# Clustering: 5 clusters with a fixed seed. n_init is pinned explicitly so the
# number of restarts (and therefore the labels) does not depend on which
# scikit-learn release is installed; 10 is the long-standing default.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0).fit(df_tr_std)
labels = kmeans.labels_

# Glue back to original data: label both the trimmed frame (used for the
# summary and the plot) and the full frame (the one written to disk).
df_tr['clusters'] = labels
dfold['clusters'] = labels

# Per-cluster mean of each feature column.
clmns.append('clusters')
print(df_tr[clmns].groupby(['clusters']).mean())
# Scatter plot of polarity against sentiment confidence, coloured by cluster,
# with the regression line switched off (fit_reg=False).
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally and raise a TypeError otherwise.
# NOTE(review): seaborn may override the "marker" entry in scatter_kws with
# its own `markers` argument, so "D" might not take effect. Confirm, and use
# `markers="D"` if diamonds are wanted.
sns.lmplot(x='polarity', y='sentiment_confidence',
           data=df_tr,
           fit_reg=False,
           hue="clusters",
           scatter_kws={"marker": "D",
                        "s": 20})

# Persist the full frame (original columns plus 'clusters') for this stock.
dfold.to_csv('clusterdata/' + f + 'cluster.csv')

# Title and axis labels go on the current axes; these calls replace the
# labels lmplot set from the column names.
plt.title('tweets grouped by polarity and sentiment_confidence')
plt.xlabel('polarity')
plt.ylabel('sentiment_confidences')
plt.show()