-
Notifications
You must be signed in to change notification settings - Fork 0
/
impute_aux_donald.R
89 lines (70 loc) · 3.31 KB
/
impute_aux_donald.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Start from an empty workspace so objects left over from an earlier session
# cannot leak into this run.
rm(list = ls())

library(missForest)

# Project helper scripts (paths are relative to the working directory).
source("helper/make_dummy.R") # create dummies for categorical variables
source("helper/clean_help.R") # check for constant variables
source("helper/fill_na.R") # fill NA with mean or most frequent category
# NOTE(review): the original comment here was a copy of the fill_na.R one;
# presumably this file holds helpers for saving Python-readable output - confirm.
source("helper/save_py.R")

# Output locations.
data_out <- "data/data_out/"
data_out_py <- "HI-VAE/data_python/" # originally data/HI-VAE/data_python/

###################### Imputation & AUX
# Input: a named list with one entry per variable group.
data_all <- readRDS(file = "data/donald-data_condensed.rds")
# Filled per variable group with the AUX (missingness indicator) columns.
data_aux <- list()
# Main loop: for every variable group in `data_all`
#   1. drop variables rejected by includeVar() (all groups except stalone_VIS00),
#   2. build AUX (missingness indicator) columns and store them in `data_aux`,
#   3. impute standalone groups with fillna(),
#   4. store the processed group back into `data_all`,
#   5. write the CSV inputs for the HI-VAE Python code.
# includeVar() and fillna() come from the sourced helper scripts; their exact
# return types are not visible here (includeVar() is assumed to return a valid
# column selector for `[.data.frame` - confirm).
for (datan in names(data_all)) { # for every variable group
  is_stalone <- grepl("stalone", datan)
  is_stalone_vis00 <- grepl("stalone_VIS00", datan)

  # load data & remove SUBJID
  data <- data_all[[datan]]
  pt <- data$SUBJID
  data$SUBJID <- NULL

  # Earlier variant of this condition, kept for reference:
  # if (!grepl('stalone_VIS6|stalone_VIS12|stalone_VIS24|snp_VIS1', datan)){
  # JS: does this skip all vargroups with only 1 column, as they can't be
  # accessed by [x,y]?
  if (!is_stalone_vis00) {
    # remove bad data
    # drop = FALSE keeps a data.frame even when a single column survives;
    # without it the result is a bare vector and the code below breaks.
    data <- data[, includeVar(data), drop = FALSE]
    # data=data[,rmMiss(data)] #JS: this would remove all data from later
    # visits, as many participants are too young
  }

  ###################### AUX variables
  # make AUX columns and save in separate list (with SUBJID)
  # AUX if A) stalone missing, or B) all/any entries of a vargroup are
  # missing: which one?
  nms <- colnames(data)
  if (is_stalone) {
    # A: one 0/1 factor column per variable (1 = value was missing).
    # Built with lapply() so every column really is a factor; the previous
    # sapply(..., factor) went through a character matrix, which yields
    # character columns under R >= 4.0 and collapses for single-row input.
    dataux <- as.data.frame(
      lapply(as.data.frame(is.na(data)), function(flag) factor(as.numeric(flag)))
    )
    colnames(dataux) <- paste("AUX", nms, sep = "_")
  } else {
    # B: a single 0/1 factor column for the whole group
    # (1 = all entries of the row are missing). all( or any(is.na(x)) ?
    dataux <- data.frame(
      factor(apply(data, 1, function(x) as.numeric(all(is.na(x)))))
    )
    colnames(dataux) <- paste("AUX", datan, sep = "_")
  }

  # update AUX list
  dataux$SUBJID <- pt
  data_aux[[datan]] <- dataux

  ###################### Imputation
  print(datan)
  if (is_stalone) {
    # if standalone data, mean and most frequent class imputation
    data <- fillna(data)
    # Re-code the ID/sex variables as factors. Guarded so that standalone
    # groups lacking these columns do not fail on a zero-length assignment.
    for (factor_col in c("SA_fam_ID_VIS00", "SA_sex_VIS00")) {
      if (factor_col %in% colnames(data)) {
        data[[factor_col]] <- as.factor(data[[factor_col]])
      }
    }
  }

  # if (!grepl('stalone_VIS6|stalone_VIS12|stalone_VIS24|snp_VIS1', datan)){
  if (!is_stalone_vis00) {
    # remove bad data (second pass, after imputation)
    data <- data[, includeVar(data), drop = FALSE]
    # data=data[,rmMiss(data)] #JS: this would remove all data from later
    # visits, as many participants are too young
  }

  # add ppt variable and update data list; `data` itself stays without SUBJID
  # because the CSVs written below must not contain it
  data_with_id <- data
  data_with_id$SUBJID <- pt
  data_all[[datan]] <- data_with_id

  # save out csv's of scaled continous and dummy coded categorical data for
  # autoencoders
  if (!is_stalone) {
    # missing write: (row, col) positions of NA cells, 1-based R indices
    write.table(
      which(is.na(data), arr.ind = TRUE),
      paste0(data_out_py, datan, "_missing.csv"),
      sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE
    )
    # data write: NA is written as "NaN"
    write.table(
      data,
      paste0(data_out_py, datan, ".csv"),
      sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE, na = "NaN"
    )
  }

  # subject IDs and column names are written for every group
  # (target directory was originally data/HI-VAE/python_names/)
  write.table(
    as.character(pt),
    paste0("HI-VAE/python_names/", datan, "_subj.csv"),
    sep = ",", row.names = FALSE, col.names = TRUE, quote = TRUE, na = "NaN"
  )
  write.table(
    colnames(data),
    paste0("HI-VAE/python_names/", datan, "_cols.csv"),
    sep = ",", row.names = FALSE, col.names = TRUE, quote = TRUE, na = "NaN"
  )
}
# Persist both result lists under `data_out`: the processed variable groups
# and the AUX (missingness indicator) tables.
imputed_path <- paste0(data_out, "data_all_imp.rds")
aux_path <- paste0(data_out, "data_aux.rds")
saveRDS(data_all, file = imputed_path)
saveRDS(data_aux, file = aux_path)

# Audible signal that the script has finished.
library(beepr)
beep()