-
Notifications
You must be signed in to change notification settings - Fork 4
/
Model.cpp
414 lines (324 loc) · 11 KB
/
Model.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
#include <iostream>
#include <iomanip>
#include "LocatingArray.h"
#include "Model.h"
using namespace std;
Model::Model(VectorXf *response, int maxTerms, CSMatrix *csMatrix) {
this->response = response;
this->maxTerms = maxTerms;
this->csMatrix = csMatrix;
this->tests = response->getLength();
// allocate memory for the coefficients vector
coefVec = new float[maxTerms];
// allocate memory for the residuals vector
resiVec = new float[tests];
// allocate memory for the model response
modelResponse = new float[tests];
// add the intercept
terms = 1;
hTermIndex = new TermIndex;
hTermIndex->termIndex = 0;
hTermIndex->next = NULL;
// this is a one line way to get the same intercept above
leastSquares();
}
Model::Model(Model *model) {
this->response = model->response;
this->maxTerms = model->maxTerms;
this->csMatrix = model->csMatrix;
this->rSquared = model->rSquared;
this->terms = model->terms;
this->tests = response->getLength();
// allocate memory for the coefficients vector and copy
coefVec = new float[maxTerms];
for (int coef_i = 0; coef_i < maxTerms; coef_i++) {
coefVec[coef_i] = model->coefVec[coef_i];
}
// allocate memory for the residuals vector
resiVec = new float[tests];
for (int resi_i = 0; resi_i < tests; resi_i++) {
resiVec[resi_i] = model->resiVec[resi_i];
}
// allocate memory for the model response
modelResponse = new float[tests];
for (int resp_i = 0; resp_i < tests; resp_i++) {
modelResponse[resp_i] = model->modelResponse[resp_i];
}
// copy term index list
TermIndex **destTermIndex = &hTermIndex;
for (TermIndex *pTermIndex = model->hTermIndex; pTermIndex != NULL;
pTermIndex = pTermIndex->next) {
(*destTermIndex) = new TermIndex;
(*destTermIndex)->termIndex = pTermIndex->termIndex;
(*destTermIndex)->next = NULL;
destTermIndex = &(*destTermIndex)->next;
}
}
void Model::printModelFactors() {
int factor1i, factor2i, level1, level2;
// print out the number of terms
cout << "Model with " << terms << " terms" << endl;
// print out the headers
cout << setw(15) << right << "Coefficient" << " | " << "Term" << endl;
cout << setw(15) << setfill('-') << "" << " | " <<
setw(15) << setfill('-') << "" << setfill(' ') << endl;
// print out each term
int term_i = 0;
for (TermIndex *pTermIndex = hTermIndex; pTermIndex != NULL; pTermIndex = pTermIndex->next) {
// print out the coefficient
int termIndex = pTermIndex->termIndex;
cout << setw(15) << right << coefVec[term_i++] << " | ";
// print out the factor names and level names
cout << csMatrix->getColName(csMatrix->getCol(termIndex)) << endl;
}
// calculate adjusted r-squared (terms - 2 to not include the intercept)
float adjustedRSquared = 1 - (1 - rSquared) * (tests - 1) / (tests - terms - 2);
// print model r-squared
if (rSquared == 1) cout << "Perfect Model!!!" << endl;
cout << "R-Squared: " << rSquared << endl;
cout << "Adjusted R-Squared: " << adjustedRSquared << endl;
}
void Model::leastSquares() {
// used when accessing CS Matrix
CSCol *csCol;
// used when looping through term indices
TermIndex *pTermIndex;
// check dimensions
if (terms > tests) {
cout << "Cols (terms) cannot be more than rows (tests) to perform QR" << endl;
return;
}
// do QR here (column by column)
pTermIndex = hTermIndex;
for (int col_i = 0; col_i < terms; col_i++) {
csCol = csMatrix->getCol(pTermIndex->termIndex);
// assign initial column A[col_i] to work vector
for (int row_i = 0; row_i < tests; row_i++) {
workSpace->workVec[row_i] = csCol->dataP[row_i];
}
// subtract appropriate other vectors
float dotProd;
for (int row_i = 0; row_i < col_i; row_i++) {
// find the dot product of A[:][col_i] and Q[:][row_i]
dotProd = 0;
for (int dotrow_i = 0; dotrow_i < tests; dotrow_i++)
dotProd += csCol->dataP[dotrow_i] * workSpace->dataQ[dotrow_i][row_i];
// assign the dot product to the R matrix
workSpace->dataR[row_i][col_i] = dotProd;
// perform the necessary subtraction
for (int subrow_i = 0; subrow_i < tests; subrow_i++)
workSpace->workVec[subrow_i] -= dotProd * workSpace->dataQ[subrow_i][row_i];
}
// find the dot product of workVec and workVec
dotProd = 0;
for (int dotrow_i = 0; dotrow_i < tests; dotrow_i++)
dotProd += workSpace->workVec[dotrow_i] * workSpace->workVec[dotrow_i];
dotProd = sqrt(dotProd);
// assign the dot product to the R matrix
workSpace->dataR[col_i][col_i] = dotProd;
// assign the normalized column to Q
if (dotProd == 0) {
for (int row_i = 0; row_i < tests; row_i++)
workSpace->dataQ[row_i][col_i] = 0;
} else {
for (int row_i = 0; row_i < tests; row_i++)
workSpace->dataQ[row_i][col_i] = workSpace->workVec[row_i] / dotProd;
}
pTermIndex = pTermIndex->next;
}
/*
cout << "Q matrix" << endl;
for (int row_i = 0; row_i < tests; row_i++) {
for (int col_i = 0; col_i < terms; col_i++) {
cout << workSpace->dataQ[row_i][col_i] << "\t";
}
cout << endl;
}
cout << "R matrix" << endl;
for (int row_i = 0; row_i < terms; row_i++) {
for (int col_i = 0; col_i < terms; col_i++) {
cout << workSpace->dataR[row_i][col_i] << "\t";
}
cout << endl;
}
*/
/* m >> n
** A is (m by n)
** Q is (m by n)
** R is (n by n)
** b is (m by 1)
** Q-transpose is (n by m)
**
*/
// We now have QRx = b
// perform workVec = Q-transpose * b
// first zero out the work vector up until n (or cols)
for (int row_i = 0; row_i < terms; row_i++)
workSpace->workVec[row_i] = 0;
// we go column 1st because Q is transposed
float *responseData = response->getData();
for (int col_i = 0; col_i < tests; col_i++) {
for (int row_i = 0; row_i < terms; row_i++) {
// row and col are swapped because of the transpose
workSpace->workVec[row_i] += workSpace->dataQ[col_i][row_i] * responseData[col_i];
}
}
// We now have Rx = workVec but R is upper triangular (n by n)
float rowSolution;
for (int row_i = terms - 1; row_i >= 0; row_i--) {
// initialize row solution
rowSolution = workSpace->workVec[row_i];
// subtract other parts of LHS of equation
for (int col_i = terms - 1; col_i > row_i; col_i--) {
rowSolution -= workSpace->dataR[row_i][col_i] * coefVec[col_i];
}
// divide for final row solution
if (workSpace->dataR[row_i][row_i] == 0) {
coefVec[row_i] = 0;
} else {
coefVec[row_i] = rowSolution / workSpace->dataR[row_i][row_i];
}
}
// coefVec holds the least squares coefficients
//cout << "coefficient vector" << endl;
//for (int term_i = 0; term_i < terms; term_i++) {
// cout << coefVec[term_i] << " ";
//}
//cout << endl;
// calculate residuals and r-squared
float SSres = 0;
for (int row_i = 0; row_i < tests; row_i++) {
modelResponse[row_i] = 0;
}
// find model response
pTermIndex = hTermIndex;
for (int term_i = 0; term_i < terms; term_i++) {
csCol = csMatrix->getCol(pTermIndex->termIndex);
for (int row_i = 0; row_i < tests; row_i++) {
modelResponse[row_i] += csCol->dataP[row_i] * coefVec[term_i];
}
pTermIndex = pTermIndex->next;
}
// find residuals and SSres
for (int row_i = 0; row_i < tests; row_i++) {
resiVec[row_i] = responseData[row_i] - modelResponse[row_i];
// add the square of residual to SSres
SSres += resiVec[row_i] * resiVec[row_i];
}
// find r-squared
rSquared = 1 - (SSres / response->getSStot());
}
// get the residuals vector for this model
float *Model::getResiVec() {
return resiVec;
}
float Model::getRSquared() {
return rSquared;
}
// check if term is part of the model
bool Model::termExists(int col_i) {
for (TermIndex *pTermIndex = hTermIndex; pTermIndex != NULL; pTermIndex = pTermIndex->next) {
if (pTermIndex->termIndex == col_i) {
// term was found!
return true;
}
}
// term never found
return false;
}
// add a term (col_i of csMatrix) to the model. Returns true if added successfully
bool Model::addTerm(int col_i) {
TermIndex **pTermIndex = &hTermIndex;
for (; *pTermIndex != NULL; pTermIndex = &(*pTermIndex)->next) {
if ((*pTermIndex)->termIndex == col_i) {
// already exists so cannot insert
return false;
} else if ((*pTermIndex)->termIndex > col_i) {
// insert within the list
TermIndex *termIndex = new TermIndex;
termIndex->termIndex = col_i;
termIndex->next = (*pTermIndex);
(*pTermIndex) = termIndex;
terms++;
return true;
}
}
// create a new term index at the end
TermIndex *termIndex = new TermIndex;
termIndex->termIndex = col_i;
termIndex->next = NULL;
(*pTermIndex) = termIndex;
terms++;
return true;
}
// remove a term from the model
bool Model::removeTerm(int col_i) {
// loop through the term index list
for (TermIndex **pTermIndex = &hTermIndex; *pTermIndex != NULL; pTermIndex = &(*pTermIndex)->next) {
// check if it equals the term to be removed (col_i)
if ((*pTermIndex)->termIndex == col_i) {
// take the current term index out of the list
TermIndex *removed = (*pTermIndex);
(*pTermIndex) = removed->next;
// decrease the number of terms
terms--;
// free its memory
delete removed;
return true;
}
}
// term to remove was not found
return false;
}
int Model::getTerms() {
return terms;
}
// static
void Model::setupWorkSpace(int rows, int cols) {
// allocate initial memory
workSpace = new WorkSpace;
// allocate memory for Q
workSpace->dataQ = new float*[rows];
for (int row_i = 0; row_i < rows; row_i++)
workSpace->dataQ[row_i] = new float[cols];
// allocate memory for R
workSpace->dataR = new float*[cols];
for (int row_i = 0; row_i < cols; row_i++)
workSpace->dataR[row_i] = new float[cols];
// allocate memory for work vector
workSpace->workVec = new float[rows];
}
void Model::countOccurrences(Occurrence *occurrence) {
// count occurrences for each term
int term_i = 0;
for (TermIndex *pTermIndex = hTermIndex; pTermIndex != NULL; pTermIndex = pTermIndex->next) {
int termIndex = pTermIndex->termIndex;
float magnitude = coefVec[term_i++];
csMatrix->countOccurrences(csMatrix->getCol(termIndex), occurrence, 0, magnitude);
}
}
bool Model::isDuplicate(Model *model) {
// check if all terms match
TermIndex *p1TermIndex = hTermIndex;
TermIndex *p2TermIndex = model->hTermIndex;
while (!(p1TermIndex == NULL && p2TermIndex == NULL)) {
if (p1TermIndex->termIndex != p2TermIndex->termIndex) return false;
p1TermIndex = p1TermIndex->next;
p2TermIndex = p2TermIndex->next;
}
return true;
}
Model::~Model() {
// delete term index list
while (hTermIndex != NULL) {
TermIndex *removed = hTermIndex;
hTermIndex = hTermIndex->next;
delete removed;
}
// delete coefficients vector
delete[] coefVec;
// delete residuals vector
delete[] resiVec;
// delete model response
delete[] modelResponse;
}