-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_dictionary.R
105 lines (76 loc) · 2.25 KB
/
create_dictionary.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# load packages -----------------------------------------------------------
if(!require(e1071)) install.packages("e1071"); require(e1071)
library(data.table)
# define functions --------------------------------------------------------
# Root-mean-square error between two numeric vectors.
#   x, y: numeric vectors (combined elementwise, so the usual R recycling
#         rules apply)
# Returns a single number; NA if any elementwise difference is NA.
rmse <- function(x, y) {
  residual <- x - y
  sqrt(mean(residual^2))
}
# load (small) data -------------------------------------------------------
# principal component scores: the `states` column supplies the row labels,
# columns 2-4 are kept as the three score columns
pcs <- read.csv("pc166.csv")
states <- as.character(pcs$states)
pcs <- pcs[, 2:4]
# flip the sign of the first retained score column
pcs[, 1] <- -pcs[, 1]
rownames(pcs) <- states
#pcs <- apply(pcs,2,rs01)
# vectors for states: column V1 becomes the row label, columns 2-301 are
# kept as the 300 feature columns
svecs <- read.csv("state_vectors.csv", header = FALSE)
rownames(svecs) <- svecs$V1
svecs <- svecs[, 2:301]
# Both tables are later indexed with one shared row mask, so they must have
# the same number of rows.
# NOTE(review): row *order* is assumed to match between the two files;
# this is not checked here -- confirm against the data.
stopifnot(nrow(svecs) == nrow(pcs))
# test method on state words ----------------------------------------------
# svm-regression version
# 5-fold cross-validation: for each fold and each score column, fit a
# radial-kernel SVM regression on the training rows and score it on the
# held-out rows.
set.seed(1)
n_folds <- 5L
n_comp <- ncol(pcs)
# Fold labels 1..n_folds repeated to cover every row, then shuffled.
cvinds <- sample(rep(seq_len(n_folds), length.out = nrow(pcs)))
# rows = folds, columns = score columns of pcs
cperf <- matrix(NA_real_, n_folds, n_comp)  # held-out correlation
eperf <- matrix(NA_real_, n_folds, n_comp)  # held-out RMSE
for (i in seq_len(n_folds)) {
  tsel <- cvinds == i
  pcs.train <- pcs[!tsel, ]
  svecs.train <- svecs[!tsel, ]
  pcs.test <- pcs[tsel, ]
  svecs.test <- svecs[tsel, ]
  # The feature matrices do not depend on the score column, so convert them
  # once per fold rather than once per model.
  x_train <- as.matrix(svecs.train)
  x_test <- as.matrix(svecs.test)
  for (j in seq_len(n_comp)) {
    y <- pcs.train[, j]
    # NOTE(review): the model is fit through the formula `y ~ x`, and
    # predict() appears to look `x` up again in this environment, so `x` is
    # rebound to the test matrix before predicting. Confirm before moving
    # this code into a function.
    x <- x_train
    fit <- svm(y ~ x, kernel = "radial", cost = 4)
    x <- x_test
    preds <- predict(fit, x)
    cperf[i, j] <- cor(preds, pcs.test[, j])
    eperf[i, j] <- rmse(preds, pcs.test[, j])
  }
}
# average performance across folds, one value per score column
colMeans(cperf)
colMeans(eperf)
# fit full model
# One radial-kernel SVM regression per score column of pcs, trained on all
# rows; svmlist[[i]] is the model for column i.
# The feature matrix is the same for every model, so build it once.
x <- as.matrix(svecs)
svmlist <- vector("list", ncol(pcs))
for (i in seq_along(svmlist)) {
  y <- pcs[, i]
  svmlist[[i]] <- svm(y ~ x, kernel = "radial", cost = 4)
}
# cycle through vectors ---------------------------------------------------
# Apply every fitted model to every row of the word-vector table and store
# the predictions in `dict` (one row per token, one column per model).
# The two commented lines below build the fast.Rdata cache loaded here.
#fast <- fread("./crawl-300d-2M.vec/crawl-300d-2M.vec",skip = 1)
#save(fast,file="fast.Rdata")
load("fast.Rdata")
fast <- as.data.frame(fast)
tokens <- as.character(fast$V1)
nvec <- nrow(fast)
n_models <- length(svmlist)
dict <- matrix(NA_real_, nvec, n_models)
# Predict in chunks instead of one row at a time: each predict() call has a
# fixed overhead, and single-row data.frame indexing is slow. Chunking also
# avoids materialising the whole table as a second, matrix copy.
chunk_size <- 1000L
chunk_starts <- if (nvec > 0L) seq.int(1L, nvec, by = chunk_size) else integer(0)
start <- proc.time()
for (lo in chunk_starts) {
  hi <- min(lo + chunk_size - 1L, nvec)
  rows <- lo:hi
  # Plain numeric matrix without dimnames, columns 2-301 of the table.
  # NOTE(review): the models were fit through the formula `y ~ x`, and
  # predict() appears to look `x` up in this environment, so the chunk must
  # be bound to the name `x` at script level -- confirm before refactoring.
  x <- unname(as.matrix(fast[rows, 2:301]))
  storage.mode(x) <- "double"
  for (i in seq_len(n_models)) {
    preds <- predict(svmlist[[i]], x)
    # predict() may drop rows containing NA; fail loudly rather than let a
    # shorter result be misaligned with `rows`.
    stopifnot(length(preds) == length(rows))
    dict[rows, i] <- preds
  }
  # progress: rows done so far and elapsed time
  print(hi)
  print(proc.time() - start)
}
# The .Rdata copy is written before dimnames are attached, so it stores the
# bare matrix together with the separate `tokens` vector.
save(tokens, dict, file = "3daffect.Rdata")
rownames(dict) <- tokens
colnames(dict) <- gsub("\\.", " ", colnames(pcs))
write.csv(dict, "3daffect_dict.csv")
# 2 million tokens with weighted valence vs:
# 407 positive emotion and 501 negative emotion words in LIWC
# 1747 positive and 4086 negative emotion words in qdap