forked from fizx/libbow-osx
-
Notifications
You must be signed in to change notification settings - Fork 0
/
svm_base.c
2512 lines (2175 loc) · 70.3 KB
/
svm_base.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* Copyright (C) 1999 Greg Schohn - [email protected] */
/* "main" file for all of the svm related code - any svm stuff should
* pass through some function here */
#include <bow/svm.h>
#if !HAVE_SQRTF
#define sqrtf sqrt
#endif
#define BARREL_GET_MAX_NSV(barrel) (*((int *) &((GET_CDOC_ARRAY_EL(barrel,0))->normalizer)))
#define BARREL_GET_NCLASSES(barrel) (*((int *) &((GET_CDOC_ARRAY_EL(barrel,0))->prior)))
#define BARREL_GET_NMETA_DOCS(barrel) (*((int *) &((GET_CDOC_ARRAY_EL(barrel,1))->normalizer)))
#define KERNEL_TYPE 14001
#define WEIGHT_TYPE 14002
#define COST_TYPE 14003
#define EA_TYPE 14004
#define BSIZE_TYPE 14005
#define VOTE_TYPE 14006
#define CACHE_SIZE_ARG 14007
#define QUICK_SCORE 14008
#define DF_COUNTS_ARG 14009
#define REMOVE_MISCLASS_TYPE 14010
#define TF_TRANSFORM_TYPE 14011
#define USE_SMO_ARG 14012
#define CNAME_ARG 14013
#define LNAME_ARG 14014
#define DO_ACTIVE_LEARNING 14015
#define ACTIVE_LEARNING_CHUNK_SIZE_ARG 14016
#define AL_TEST_IN_TRAIN_ARG 14017
#define AL_BASELINE 14018
#define START_AT_ARG 14019
#define RANDOM_SEED_ARG 14020
#define SUPPRESS_SCORE_MAT_ARG 14021
#define INITIAL_AL_TSET_ARG 14022
#define TRANSDUCE_CLASS_ARG 14023
#define TRANS_CSTAR_ARG 14024
#define TRANS_NPOS_ARG 14025
#define SVM_BASENAME_ARG 14026
#define AL_WITH_TRANS_ARG 14027
#define TRANS_IGNORE_BIAS_ARG 14028
#define TRANS_HYP_REFRESH_ARG 14029
#define TRANS_SMART_VALS_ARG 14030
#define AGAINST_ALL 0
#define PAIRWISE 1
#define REMOVE_BOUND 1
#define REMOVE_WRONG 2
static int weight_type=RAW; /* 0=raw_freq, 1=tfidf, 2=infogain */
static int tf_transform_type=RAW; /* 0=raw, 1=log, 2?... */
static int vote_type=0;
static int cache_size=4000037;
static int quick_scoring=1;
static int do_active_learning=0;
static int test_in_train=0;
static int suppress_score_mat=0;
static int al_pick_random=0;
static int model_starting_no=0;
/* here's a C hack - it uses the actual of the enum to do the shift
* make sure when passing arguments, you know what the actuals are */
static int transduce_class=(1 << bow_doc_unlabeled);
static int transduce_class_overriding=0; /* gets set to 1 when args are
* passed to override */
static char *svml_basename=NULL;
FILE *svml_test_file=NULL;
#ifdef HAVE_LOQO
int svm_use_smo=0;
#else
int svm_use_smo=1;
#endif
double svm_epsilon_a=1E-12; /* for alpha's & there bounds */
double svm_epsilon_crit=INIT_KKT; /* for critical KT points */
double svm_C=1000.0; /* maximum cost */
int svm_bsize=4; /* sizeof working set */
int svm_kernel_type=0; /* 0=linear */
int svm_remove_misclassified=0;
int svm_weight_style;
int svm_nkc_calls;
int svm_trans_npos;
int svm_trans_nobias=0;
int svm_trans_hyp_refresh=40;
int svm_trans_smart_vals=1;
double svm_trans_cstar=200;
int svm_init_al_tset=8;
int svm_al_qsize;
int svm_al_do_trans=0;
int svm_random_seed=0; /* for al - gets filled in with time */
int svm_verbosity=0;
/* for tfidf scoring - they could (should?) be made into options... */
static int df_transform=LOG;
static int df_counts=bow_tfidf_occurrences;
/* these are dangerous optimizations for svm_score... - but they save a lot of time... */
/* dangerous because they waste a lot of memory (about the size of the original barrel)
* & if the vpc barrel gets played with, then its all wrong & there's no totally
* error proof way to do that without checking all of the barrel, which i don't do. */
/* Cached copy of the most recently scored model set, kept so repeated
 * scoring calls can skip recomputing per-model document weights.  As
 * the warning above explains, this duplicates barrel state and is only
 * valid while the vpc barrel is left untouched. */
struct model_bucket {
  bow_wv **docs;
  float **oweights; /* original weights (after norm & tf scaling)
                       note - this only matters when tf_transform is set &
                       some weight_per_model scheme is used */
  /* note - these are regular vectors instead of wv's to save time
   * (O(# qwv features) instead of O((# qwv features) + (# of features)) */
  union {
    float **sub_model; /* weights for submodels */
    float *barrel;     /* weights for the whole barrel */
  } word_weights;
  double *bvect;       /* one value per model - presumably the bias terms; confirm against the scoring code */
  int **indices;       /* per-model index arrays (lengths in `sizes`) */
  int *sizes;          /* length of each array */
  double **weights;    /* per-model weight arrays */
  double **W;          /* per-model dense vectors - presumably hyperplanes for the linear kernel; confirm */
  int **yvect;         /* per-model label vectors */
  bow_barrel *barrel;
  int ndocs;           /* number of cached documents */
  int nmodels;         /* number of cached models */
};

/* The single global cache instance; all-NULL/zero until first filled. */
static struct model_bucket model_cache = {NULL, NULL, {NULL}, NULL, NULL, NULL,
                                          NULL, NULL, NULL, 0, 0};
double dprod(bow_wv *wv1, bow_wv *wv2);
double kernel_poly(bow_wv *wv1, bow_wv *wv2);
double kernel_rbf(bow_wv *wv1, bow_wv *wv2);
double kernel_sig(bow_wv *wv1, bow_wv *wv2);
/* by default use the dot product as the kernel */
static double (*kernel)(bow_wv *, bow_wv *) = dprod;
/* Command-line options specific to SVMs */
/* argp option table: each entry is {long-name, key, arg-name, flags, doc}.
 * The keys are the 14xxx constants defined above; svm_parse_opt()
 * interprets them. */
static struct argp_option svm_options[] = {
  {0,0,0,0,
   "Support Vector Machine options, --method=svm:", 50},
  {"svm-active-learning-baseline", AL_BASELINE, "", 0,
   "Incrementally add documents to the training set at random."},
  {"svm-test-in-train", AL_TEST_IN_TRAIN_ARG, 0, 0,
   "do active learning testing inside of the training... a hack "
   "around making code 10 times more complicated."},
  {"svm-al-transduce", AL_WITH_TRANS_ARG, 0, 0,
   "do transduction over the unlabeled data during active learning."},
  {"svm-bsize", BSIZE_TYPE, "", 0,
   "maximum size to construct the subproblems."},
  {"svm-cache-size", CACHE_SIZE_ARG, "", 0,
   "Number of kernel evaluations to cache."},
  {"svm-cost", COST_TYPE, "", 0,
   "cost to bound the lagrange multipliers by (default 1000)."},
  {"svm-df-counts", DF_COUNTS_ARG, "", 0,
   "Set df_counts (0=occurrences, 1=words)."},
  {"svm-active-learning", DO_ACTIVE_LEARNING, "", 0,
   "Use active learning to query the labels & incrementally (by arg_size) build the barrels."},
  {"svm-epsilon_a", EA_TYPE, "", 0,
   "tolerance for the bounds of the lagrange multipliers (default 0.0001)."},
  {"svm-kernel", KERNEL_TYPE, "", 0,
   "type of kernel to use (0=linear, 1=polynomial, 2=gassian, 3=sigmoid, 4=fisher kernel)."},
  {"svm-al_init_tsetsize", INITIAL_AL_TSET_ARG, "", 0,
   "Number of random documents to start with in active learning."},
  {"svm-quick-scoring", QUICK_SCORE, 0, 0,
   "Turn quick scoring on."},
  {"svm-rseed", RANDOM_SEED_ARG, "", 0,
   "what random seed should be used in the test-in-train splits"},
  {"svm-remove-misclassified", REMOVE_MISCLASS_TYPE, "", 0,
   "Remove all of the misclassified examples and retrain (default none (0), 1=bound, 2=wrong."},
  {"svm-start-at", START_AT_ARG, "", 0,
   "which model should be the first generated."},
  {"svm-suppress-score-matrix", SUPPRESS_SCORE_MAT_ARG, 0, 0,
   "Do not print the scores of each test document at each AL iteration."},
  {"svml-basename", SVM_BASENAME_ARG, "", OPTION_HIDDEN, ""},
  {"svm-tf-transform", TF_TRANSFORM_TYPE, "", 0,
   "0=raw, 1=log..."},
  {"svm-transduce-class", TRANSDUCE_CLASS_ARG, "", 0,
   "override default class(es) (int) to do transduction with "
   "(default bow_doc_unlabeled)."},
  {"svm-trans-cost", TRANS_CSTAR_ARG, "", 0,
   "value to assign to C* (default 200)."},
  {"svm-trans-hyp-refresh", TRANS_HYP_REFRESH_ARG, "", 0,
   "how often the hyperplane should be recomputed during transduction. "
   "Only applies to SMO. (default 40)"},
  {"svm-trans-nobias", TRANS_IGNORE_BIAS_ARG, 0, 0,
   "Do not use a bias when marking unlabeled documents. Use a "
   "threshold of 0 to determine labels instead of some threshold to"
   "mark a certain number of documents for each class."},
  {"svm-trans-npos", TRANS_NPOS_ARG, "", 0,
   "number of unlabeled documents to label as positive "
   "(default: proportional to number of labeled positive docs)."},
  {"svm-trans-smart-vals", TRANS_SMART_VALS_ARG, "", 0,
   "use previous problem's as a starting point for the next. (default true)"},
  {"svm-use-smo", USE_SMO_ARG, "", 0,
#ifdef HAVE_LOQO
   "default 0 (don't use SMO)"
#else
   "default 1 (use SMO) - PR_LOQO not compiled"
#endif
  },
  {"svm-vote", VOTE_TYPE, "", 0,
   "Type of voting to use (0=singular, 1=pairwise; default 0)."},
  {"svm-weight", WEIGHT_TYPE, "", 0,
   "type of function to use to set the weights of the documents' words "
   "(0=raw_frequency, 1=tfidf, 2=infogain."},
  {0, 0}
};
/* Hyper-parameters for the non-linear kernels.  Exactly one member is
 * meaningful at a time, selected by svm_kernel_type (see the KERNEL_TYPE
 * case of svm_parse_opt, which also fills in the defaults). */
union kern_param {
  struct {
    double const_co; /* additive constant inside the power */
    double lin_co;   /* multiplier on the dot product */
    double degree;   /* polynomial degree */
  } poly ;
  struct {
    double gamma;    /* RBF width parameter */
  } rbf;
  struct {
    double const_co; /* additive constant inside tanh */
    double lin_co;   /* multiplier on the dot product */
  } sig;
};

/* Global kernel-parameter instance used by the kernel functions below. */
union kern_param kparm;
/* argp parser callback for all svm-specific command-line options.
 * Updates the file-scope configuration globals.  Returns 0 on success
 * and ARGP_ERR_UNKNOWN for unrecognized keys or rejected values. */
error_t svm_parse_opt (int key, char *arg, struct argp_state *state) {
  switch (key) {
  case START_AT_ARG:
    model_starting_no = atoi(arg);
    break;
  case KERNEL_TYPE:
    svm_kernel_type = atoi (arg);
    /* NOTE(review): negative values are not rejected here - confirm
     * that is intentional */
    if (svm_kernel_type > 4) {
      fprintf(stderr, "Invalid value for -k, value must be between 0, 1, 2, 3, or 4.\n");
      return ARGP_ERR_UNKNOWN;
    }
    /* install the kernel function pointer and its default parameters */
    switch (svm_kernel_type) {
    case 0:
      kernel = dprod;
      break;
    case 1:
      kparm.poly.const_co = 1.0;
      kparm.poly.lin_co = 1.0;
      kparm.poly.degree = 4.0;
      kernel = kernel_poly;
      break;
    case 2:
      kparm.rbf.gamma = 1.0;
      kernel = kernel_rbf;
      break;
    case 3:
      kparm.sig.lin_co = 1.0;
      kparm.sig.const_co = 0.0;
      kernel = kernel_sig;
      break;
    case 4:
      kernel = svm_kernel_fisher;
      break;
    }
    break;
  case AL_TEST_IN_TRAIN_ARG:
    test_in_train = 1;
    break;
  case AL_WITH_TRANS_ARG:
    svm_al_do_trans = 1;
    break;
  case BSIZE_TYPE:
    svm_bsize = atoi(arg);
    if (svm_bsize < 2) {
      fprintf(stderr, "Invalid value for -b, value must be at least 2.\n");
      return ARGP_ERR_UNKNOWN;
    }
    /* round the working-set size up to the next multiple of 4 */
    svm_bsize = ((svm_bsize+3)/4)*4;
    break;
  case CACHE_SIZE_ARG:
    cache_size = atoi(arg);
    if (cache_size < 2) {
      fprintf(stderr, "Invalid value for --cache_size, value must be greater than 1\n");
      return ARGP_ERR_UNKNOWN;
    }
    break;
  case COST_TYPE:
    svm_C = atof(arg);
    break;
  case DF_COUNTS_ARG:
    /* KEY is reused here as a scratch variable for the parsed value */
    key = atoi(arg);
    if (key == 0) {
      df_counts = bow_tfidf_occurrences;
    } else if (key == 1) {
      df_counts = bow_tfidf_words;
    } else {
      return ARGP_ERR_UNKNOWN;
    }
    break;
  case EA_TYPE:
    svm_epsilon_a = atof(arg);
    break;
  case AL_BASELINE:
    test_in_train = 1;
    al_pick_random = 1;
    /* fallthrough - the baseline is active learning with random picks */
  case DO_ACTIVE_LEARNING:
    do_active_learning = 1;
    svm_al_qsize = atoi(arg);
    if (svm_al_qsize < 0) {
      fprintf(stderr, "Bogus AL-query size\n");
      return ARGP_ERR_UNKNOWN;
    }
    break;
  case INITIAL_AL_TSET_ARG:
    svm_init_al_tset = atoi(arg);
    break;
  case REMOVE_MISCLASS_TYPE:
    svm_remove_misclassified = atoi(arg);
    break;
  case RANDOM_SEED_ARG:
    svm_random_seed = atoi(arg);
    break;
  case QUICK_SCORE:
    quick_scoring = 1;
    break;
  case SUPPRESS_SCORE_MAT_ARG:
    suppress_score_mat = 1;
    break;
  case SVM_BASENAME_ARG:
    /* keeps a pointer into argv - argp argument strings stay live */
    svml_basename = arg;
    break;
  case TF_TRANSFORM_TYPE:
    tf_transform_type = atoi(arg);
    if ((tf_transform_type < 0) || (tf_transform_type > 1)) {
      fprintf(stderr, "Invalid value for tf_transform_type, value must be 0 or 1\n");
      return ARGP_ERR_UNKNOWN;
    }
    break;
  case TRANSDUCE_CLASS_ARG:
    {
      int a;
      a = atoi(arg);
      if (a == bow_doc_train) {
        fprintf(stderr,"Cannot do transduction on training set, ignoring \"%s\" option\n",arg);
      } else {
        /* NOTE(review): this test looks inverted - transduce_class
         * starts non-zero (1 << bow_doc_unlabeled), so this branch can
         * never fire on the first override and the default bit is never
         * cleared.  Presumably `!transduce_class_overriding` was meant;
         * confirm before changing. */
        if (!transduce_class) {
          transduce_class_overriding = 1;
          transduce_class = 0;
        }
        /* < 0 turns transduction off */
        if (a > 0) {
          transduce_class |= (1 << a);
        }
      }
    }
    break;
  case TRANS_HYP_REFRESH_ARG:
    svm_trans_hyp_refresh = atoi(arg);
    if (svm_trans_hyp_refresh < 1) {
      /* NOTE(review): warns but does not reject the bad value (no
       * return here) - confirm that is intentional */
      fprintf(stderr, "svm_trans_hyp_refresh (hyperplane refresh rate)"
              " must be greater than 0\n");
    }
    break;
  case TRANS_IGNORE_BIAS_ARG:
    svm_trans_nobias = 1;
    break;
  case TRANS_NPOS_ARG:
    svm_trans_npos = atoi(arg);
    if (svm_trans_npos < 1) {
      fprintf(stderr, "svm_trans_npos should be greater than 0.\n");
      return ARGP_ERR_UNKNOWN;
    }
    break;
  case TRANS_CSTAR_ARG:
    svm_trans_cstar = atof(arg);
    break;
  case TRANS_SMART_VALS_ARG:
    svm_trans_smart_vals = atoi(arg);
    break;
  case USE_SMO_ARG:
    svm_use_smo = atoi(arg);
    /* the epsilon is used is 2x as big as it would be in the loqo method */
    if (svm_use_smo == 1) {
      svm_epsilon_crit /= 2;
    }
#ifndef HAVE_LOQO
    if (svm_use_smo != 1) {
      fprintf(stderr,"Cannot switch from SMO, no other solvers were built,\n"
              "rebuild libbow with pr_loqo to use another algorithm.\n");
    }
#endif
    break;
  case VOTE_TYPE:
    vote_type = atoi(arg);
    if ((vote_type < 0) || (vote_type > 1)) {
      fprintf(stderr, "Invalid value for --vote, value must be 0 for linear or 1 for pairwise.\n");
      return ARGP_ERR_UNKNOWN;
    }
    break;
  case WEIGHT_TYPE:
    weight_type = atoi(arg);
    if ((weight_type < 0) || (weight_type > 3)) {
      fprintf(stderr, "Invalid value for -w, value must be 0, 1, 2, or 3.\n");
      return ARGP_ERR_UNKNOWN;
    }
    break;
  default:
    return ARGP_ERR_UNKNOWN;
  }
  return 0;
}
/* Glue structures that register the option table and parser above
 * with the program's top-level argp parser. */
static const struct argp svm_argp = { svm_options, svm_parse_opt };
static struct argp_child svm_argp_child = {
  &svm_argp, /* This child's argp structure */
  0,         /* flags for child */
  0,         /* optional header in help message */
  0          /* arbitrary group number for ordering */
};
/* Randomly shuffle DOCS and YVECT in tandem, recording the shuffle in
 * PERMUTE_TABLE so it can later be undone by svm_unpermute_data().
 * After the call, permute_table[i] is the original index of the
 * element now sitting in slot i. */
void svm_permute_data(int *permute_table, bow_wv **docs, int *yvect, int ndocs) {
  int slot, target;

  /* start with the identity permutation */
  for (slot = 0; slot < ndocs; slot++) {
    permute_table[slot] = slot;
  }

  /* swap each slot with a randomly chosen one, dragging the labels
   * and the bookkeeping table along */
  for (slot = 0; slot < ndocs; slot++) {
    bow_wv *doc_tmp;
    int int_tmp;

    target = random() % ndocs;

    doc_tmp = docs[target];
    docs[target] = docs[slot];
    docs[slot] = doc_tmp;

    int_tmp = yvect[target];
    yvect[target] = yvect[slot];
    yvect[slot] = int_tmp;

    int_tmp = permute_table[target];
    permute_table[target] = permute_table[slot];
    permute_table[slot] = int_tmp;
  }
}
/* Invert a shuffle made by svm_permute_data, restoring DOCS and YVECT
 * to their original order.  permute_table[i] holds the original index
 * of the element currently in slot i; the loop chases each permutation
 * cycle in place, swapping until slot i holds its own element before
 * advancing.  Runs in O(ndocs) swaps total. */
void svm_unpermute_data(int *permute_table, bow_wv **docs, int *yvect, int ndocs) {
  int i, j;

  for (i=0; i<ndocs; ) {
    bow_wv *d;
    int y;

    j = permute_table[i];
    if (j == i) {
      /* slot i already holds its original element - move on */
      i++;
      continue;
    }
    /* send the element at slot i home to slot j; after this swap,
     * permute_table[j] == j, so each swap settles one element */
    d = docs[j];
    docs[j] = docs[i];
    docs[i] = d;
    y = yvect[j];
    yvect[j] = yvect[i];
    yvect[i] = y;
    y = permute_table[j];
    permute_table[j] = permute_table[i];
    permute_table[i] = y;
  }
}
/* Sparse dot product of two word vectors.  Both entry lists are sorted
 * by word index, so this is a single merge pass; only indices present
 * in both vectors contribute.  (The weights here are the raw
 * frequency-derived weights.) */
double dprod(bow_wv *wv1, bow_wv *wv2) {
  bow_we *e1 = wv1->entry;
  bow_we *e2 = wv2->entry;
  int a = 0, b = 0;
  double acc = 0.0;

  while (a < wv1->num_entries && b < wv2->num_entries) {
    if (e1[a].wi < e2[b].wi) {
      a++;
    } else if (e1[a].wi > e2[b].wi) {
      b++;
    } else {
      acc += e1[a].weight * e2[b].weight;
      a++;
      b++;
    }
  }
  return acc;
}
/* Dot product between a sparse word vector and a dense weight vector W
 * (indexed by word index). */
double dprod_sd(bow_wv *wv, double *W) {
  bow_we *e = wv->entry;
  double acc = 0.0;
  int k;

  for (k = 0; k < wv->num_entries; k++) {
    acc += e[k].weight * W[e[k].wi];
  }
  return acc;
}
/* Squared Euclidean distance ||wv1 - wv2||^2 between two sparse word
 * vectors; used by kernel_rbf.  Kept as a separate merge-pass function
 * because the kernel is the biggest bottleneck.
 *
 * Fix vs. original: words present in only ONE of the two vectors now
 * contribute weight^2 to the distance.  Previously such entries were
 * skipped entirely, so the function understated the true distance
 * (only overlapping indices were compared). */
double ddprod(bow_wv *wv1, bow_wv *wv2) {
  double tmp;
  double sum;
  bow_we *v1, *v2;
  int i1, i2;

  i1 = i2 = 0;
  sum = 0.0;
  v1 = wv1->entry;
  v2 = wv2->entry;
  while ((i1 < wv1->num_entries) && (i2 < wv2->num_entries)) {
    if (v1[i1].wi > v2[i2].wi) {
      /* word only in wv2: (0 - w2)^2 */
      sum += v2[i2].weight * v2[i2].weight;
      i2++;
    } else if (v1[i1].wi < v2[i2].wi) {
      /* word only in wv1: (w1 - 0)^2 */
      sum += v1[i1].weight * v1[i1].weight;
      i1++;
    } else {
      tmp = (v1[i1].weight) - (v2[i2].weight);
      sum += tmp*tmp;
      i1++;
      i2++;
    }
  }
  /* whichever vector still has entries left, those words are absent
   * from the other vector */
  for (; i1 < wv1->num_entries; i1++) {
    sum += v1[i1].weight * v1[i1].weight;
  }
  for (; i2 < wv2->num_entries; i2++) {
    sum += v2[i2].weight * v2[i2].weight;
  }
  return(sum);
}
/* End of command-line options specific to SVMs */
/* Polynomial kernel: (lin_co * <x,y> + const_co) ^ degree. */
double kernel_poly(bow_wv *wv1, bow_wv *wv2) {
  double base = kparm.poly.lin_co * dprod(wv1, wv2) + kparm.poly.const_co;
  return pow(base, kparm.poly.degree);
}
/* Gaussian (RBF) kernel: exp(-gamma * ||x - y||^2), with the squared
 * distance supplied by ddprod(). */
double kernel_rbf(bow_wv *wv1, bow_wv *wv2) {
  double dist2 = ddprod(wv1, wv2);
  return exp(-kparm.rbf.gamma * dist2);
}
/* Sigmoid kernel: tanh(lin_co * <x,y> + const_co). */
double kernel_sig(bow_wv *wv1, bow_wv *wv2) {
  double scaled = kparm.sig.lin_co * dprod(wv1, wv2) + kparm.sig.const_co;
  return tanh(scaled);
}
/* ---- kernel-evaluation cache ---- */
static int rlength; /* row width mixed into the hash; set by kcache_init */
typedef struct _kc_el {
  bow_wv *i, *j;    /* operand pair, canonically ordered (i < j by address) */
  double val;       /* cached kernel value K(i,j) */
  unsigned int age; /* clock tick of last hit; 0 marks an empty slot */
} kc_el;
static kc_el *harray;        /* the open-addressed table, cache_size slots */
static unsigned int max_age; /* current clock tick, advanced by kcache_age */
/* Allocate and reset the kernel cache; NWIDE becomes the row width used
 * by the hash function.  If the table does not fit in memory the size
 * is repeatedly halved until allocation succeeds.
 *
 * Fixes vs. original: the retry used recursion and `return (void
 * expression)`, which is not portable C; and cache_size had no floor,
 * so repeated failures could drive it to 0 (later causing a modulo by
 * zero in svm_kernel_cache).  Now iterative, with a hard floor. */
void kcache_init(int nwide) {
  int i;

  max_age = 1;
  svm_nkc_calls = 0;
  rlength = nwide;

  while ((harray = (kc_el *) malloc(sizeof(kc_el)*cache_size)) == NULL) {
    cache_size = cache_size/2;
    if (cache_size < 2) {
      fprintf(stderr, "Could not allocate even a minimal kernel cache.\n");
      exit(1);
    }
    fprintf(stderr, "Could not allocate space for the kernel cache.\n"
            "Shrinking size to %d and trying again.\n", cache_size);
  }
  for (i=0; i<cache_size; i++) {
    /* ~0 is a sentinel address no real wv pointer can equal */
    harray[i].i = (bow_wv *) ~0;
    harray[i].age = 0;
  }
}
/* Release the kernel cache.  Nulling the pointer makes an accidental
 * second call harmless (free(NULL) is a no-op) and turns any stale use
 * into an immediate null dereference rather than a use-after-free. */
void kcache_clear() {
  free(harray);
  harray = NULL;
}
/* Advance the cache clock by one tick; entries last touched at older
 * ticks become the preferred eviction victims. */
void kcache_age() {
  max_age += 1;
}
#define NHASHES 3
static int sub_nkcc=0; /* this makes nkc_calls = actual calls / 100 */
/* Return K(wv1,wv2), consulting the open-addressed cache first.  Up to
 * NHASHES probe slots are tried; on a miss the kernel is evaluated and
 * stored into the first empty probed slot, or failing that the
 * least-recently-used probed slot.
 *
 * Fixes vs. original: pointers were hashed through (unsigned int),
 * which truncates addresses on LP64 platforms and degrades the hash;
 * they now go through unsigned long.  min_from is also initialized so
 * it can never be read uninitialized. */
double svm_kernel_cache(bow_wv *wv1, bow_wv *wv2) {
  unsigned long h_index;
  unsigned long min_from;
  unsigned int min_age;
  int k;
  double d;

  /* cheap call counter: svm_nkc_calls = actual calls / 100 */
  if (!((sub_nkcc++) % 100)) {
    svm_nkc_calls ++;
  }
  min_age = ~((unsigned int) 0);
  min_from = 0;
  /* all of the kernels are symmetric - canonicalize argument order */
  if (wv1 > wv2) {
    bow_wv *tmp;
    tmp = wv2;
    wv2 = wv1;
    wv1 = tmp;
  }
  for (k = h_index = 0; k < NHASHES; k++) {
    h_index = (((unsigned long) wv1)*rlength + ((unsigned long) wv2)
               + h_index + 19) % cache_size;
    if ((harray[h_index].i == wv1) && (harray[h_index].j == wv2)) {
      /* hit - refresh its age */
      harray[h_index].age = max_age;
      return (harray[h_index].val);
    } else {
      if (harray[h_index].age > 0) {
        /* occupied slot - remember the oldest one seen */
        if (min_age > harray[h_index].age) {
          min_age = harray[h_index].age;
          min_from = h_index;
        }
        continue;
      } else {
        /* empty slot - claim it */
        min_from = h_index;
        break;
      }
    }
  }
  d = kernel(wv1,wv2);
  harray[min_from].i = wv1;
  harray[min_from].j = wv2;
  harray[min_from].val = d;
  harray[min_from].age = max_age;
  return (d);
}
/* Like svm_kernel_cache, but a pure lookup: on a miss the kernel is
 * evaluated and returned WITHOUT being inserted (useful if the items
 * are getting deleted from a set).
 *
 * Fix vs. original: hash through unsigned long instead of unsigned int,
 * which truncated pointers on LP64 platforms.  A hash mismatch with the
 * caching variant only costs a recomputation, never a wrong value. */
double svm_kernel_cache_lookup(bow_wv *wv1, bow_wv *wv2) {
  unsigned long h_index;
  int k;

  /* all of the kernels are symmetric - canonicalize argument order */
  if (wv1 > wv2) {
    bow_wv *tmp;
    tmp = wv2;
    wv2 = wv1;
    wv1 = tmp;
  }
  for (k = h_index = 0; k < NHASHES; k++) {
    h_index = (((unsigned long) wv1)*rlength + ((unsigned long) wv2)
               + h_index + 19) % cache_size;
    if ((harray[h_index].i == wv1) && (harray[h_index].j == wv2)) {
      return (harray[h_index].val);
    }
  }
  return (kernel(wv1,wv2));
}
/* random qsort helpers */
/* qsort comparator: ascending order on the .d field of struct di. */
int di_cmp(const void *v1, const void *v2) {
  double a = ((struct di *) v1)->d;
  double b = ((struct di *) v2)->d;

  if (a < b)
    return -1;
  if (a > b)
    return 1;
  return 0;
}
/* qsort comparator: ascending ints. */
int i_cmp(const void *v1, const void *v2) {
  int a = *((int *) v1);
  int b = *((int *) v2);

  if (a < b)
    return -1;
  if (a > b)
    return 1;
  return 0;
}
/* qsort comparator: ascending doubles (NaN compares "equal" to
 * everything, exactly as the original branch order did). */
int d_cmp(const void *v1, const void *v2) {
  double a = *((double *) v1);
  double b = *((double *) v2);

  if (a < b)
    return -1;
  if (a > b)
    return 1;
  return 0;
}
int s_cmp(const void *v1, const void *v2) {
bow_score *s1, *s2;
s1 = ((bow_score *) v1);
s2 = ((bow_score *) v2);
if (s1->weight < s2->weight) {
return (1);
} else if (s1->weight > s2->weight) {
return (-1);
} else {
if (s1->di < s2->di) {
return (-1);
} else if (s1->di > s2->di) {
return (1);
} else {
return 0;
}
}
}
/* useful alternative to qsort or radix sort */
/* stick the top n values in the first n slots of arr */
/* Partial selection sort: places the N SMALLEST .d values, in
 * ascending order, into arr[0..n-1].  "top" thus means lowest-d here -
 * NOTE(review): callers presumably store negated or distance-like
 * scores; confirm before reusing elsewhere.  Arrays shorter than N are
 * left untouched.  O(n * len). */
void get_top_n(struct di *arr, int len, int n) {
  double mind, tmpd;
  int minfrom, tmpi;
  int i,j;

  if (len < n) {
    return;
  }
  for (i=0; i<n && i<len; i++) {
    /* find the minimum .d in the unsorted tail arr[i..len-1] */
    mind = arr[i].d;
    minfrom = i;
    for (j=i+1; j<len; j++) {
      if (arr[j].d < mind) {
        mind = arr[j].d;
        minfrom = j;
      }
    }
    /* swap it into slot i */
    tmpi = arr[minfrom].i;
    tmpd = arr[minfrom].d;
    arr[minfrom].d = arr[i].d;
    arr[minfrom].i = arr[i].i;
    arr[i].d = tmpd;
    arr[i].i = tmpi;
  }
  return;
}
/* takes in docs, creates an idf vector & then weights the document */
/* sets doc weights by using counts & normalizer */
/* Build an idf-style weight vector (one float per word index) from the
 * NTRAIN documents in DOCS.  Words with document frequency below 3 get
 * weight 0.  Returns a malloc()'ed array owned by the caller.
 *
 * Fixes vs. original: the allocation is now checked, and the final
 * normalization is skipped when idf_sum == 0 (no word passed the
 * df >= 3 cutoff), which previously divided by zero. */
static float *tfidf(bow_wv **docs, int ntrain) {
  float idf_sum;       /* sum of all the idf values */
  int max_wi;          /* the highest "word index" */
  float *new_idf_vect;
  int i, j;

  bow_verbosify (bow_progress, "Setting weights over words: ");
  max_wi = bow_num_words();
  new_idf_vect = (float *) malloc(sizeof(float)*max_wi);
  if (new_idf_vect == NULL)
    bow_error ("Could not allocate the idf vector.");
  for (i=0; i<max_wi; i++) {
    new_idf_vect[i] = 0.0;
  }
  idf_sum = 0.0;
  /* First calculate document frequencies. */
  for (i=0; i<ntrain; i++) {
    for (j=0; j<docs[i]->num_entries; j++) {
      if (df_counts == bow_tfidf_occurrences) {
        /* Make DV be the number of documents in which word WI occurs
           at least once.  (We can't just set it to DV->LENGTH because
           we have to check to make sure each document is part of the
           model. */
        new_idf_vect[docs[i]->entry[j].wi] ++;
      } else if (df_counts == bow_tfidf_words) {
        /* Make DV be the total number of times word WI appears
           in any document. */
        new_idf_vect[docs[i]->entry[j].wi] += docs[i]->entry[j].count;
      } else {
        bow_error ("Bad TFIDF parameter df_counts.");
      }
    }
  }
  for (i=0; i<max_wi; i++) {
    /* Set IDF from DF. */
    /* following what Thorsten alledgedly does */
    if (new_idf_vect[i] >= 3.0) {
      if (df_transform == LOG)
        new_idf_vect[i] = log2f (ntrain / new_idf_vect[i]);
      else if (df_transform == SQRT)
        new_idf_vect[i] = sqrtf (ntrain / new_idf_vect[i]);
      else if (df_transform == RAW)
        new_idf_vect[i] = ntrain / new_idf_vect[i];
      else {
        new_idf_vect[i] = 0; /* to avoid gcc warning */
        bow_error ("Bad TFIDF parameter df_transform.");
      }
      idf_sum += new_idf_vect[i];
    } else {
      /* rare words (df < 3) carry no weight */
      new_idf_vect[i] = 0.0;
    }
  }
  /* "normalize" the idf values; guard the degenerate all-zero case */
  if (idf_sum > 0.0) {
    for (i=0; i<max_wi; i++) {
      new_idf_vect[i] = max_wi*new_idf_vect[i]/idf_sum;
    }
  }
  bow_verbosify (bow_progress, "\n");
  return new_idf_vect;
}
/* next 2 fn's stolen from info-gain.c */
/* Return the entropy given counts for each type of element. */
static double entropy(float e1, float e2) {
double total = 0; /* How many elements we have in total */
double entropy = 0.0;
double fraction;
total = e1 + e2;
/* If we have no elements, then the entropy is zero. */
if (total == 0) {
return 0.0;
}
entropy = 0.0;
/* Now calculate the entropy */
fraction = e1 / total;
if (fraction != 0.0) {
entropy = -1 * fraction * log2f (fraction);
}
fraction = e2 / total;
if (fraction != 0.0) {
entropy -= fraction * log2f (fraction);
}
return entropy;
}
/* Return a malloc()'ed array containing an information-gain score for
   each word index, computed over the labeled (yvect[i] != 0) documents.
   Labels are expected to be -1/+1; unlabeled docs are ignored.

   Fixes vs. original: the fc[] tally arrays hold ints but were
   allocated with sizeof(double) (silently doubling the allocation);
   the mallocs are now checked; and the final normalization is skipped
   when sum == 0, which previously divided by zero. */
float *infogain(bow_wv **docs, int *yvect, int ndocs) {
  int grand_totals[2];         /* Totals for each class. */
  double total_entropy;        /* The entropy of the total collection. */
  double with_word_entropy;    /* The entropy of the set of docs with
                                  the word in question. */
  double without_word_entropy; /* The entropy of the set of docs without
                                  the word in question. */
  float grand_total = 0;
  float with_word_total = 0;
  float without_word_total = 0;
  int i, j;
  float *ret;
  double sum;
  int *fc[2];                  /* tallies for all the words in class 1 & 2 */
  int num_words;

  bow_verbosify (bow_progress, "Calculating info gain... words :: ");
  num_words = bow_num_words();
  ret = bow_malloc (num_words*sizeof (float));
  fc[0] = (int *) malloc(num_words*sizeof(int));
  fc[1] = (int *) malloc(num_words*sizeof(int));
  if (!fc[0] || !fc[1])
    bow_error ("Could not allocate info-gain tally arrays.");
  memset(fc[0], 0, num_words*sizeof(int));
  memset(fc[1], 0, num_words*sizeof(int));
  /* First set all the arrays to zero */
  for(i = 0; i < 2; i++) {
    grand_totals[i] = 0;
  }
  /* Now set up the grand totals. */
  for (i = 0; i<ndocs; i++) {
    if (yvect[i]) { /* if it is unlabeled, ignore it */
      /* map the -1/+1 label onto slots 0/1 */
      grand_totals[(yvect[i]+1)/2] ++;
      /* this is only done incase some type of occurrence cnt should ever happen */
      grand_total ++;
    }
  }
  /* Calculate the total entropy */
  total_entropy = entropy (grand_totals[0], grand_totals[1]);
  sum = 0.0;
  /* the fc[...] are like the with_word totals */
  for (i=0; i<ndocs; i++) {
    if (yvect[i]) {
      int y = (yvect[i]+1)/2;
      for (j=0; j<docs[i]->num_entries; j++) {
        fc[y][docs[i]->entry[j].wi] ++;
      }
    }
  }
  for (i=0; i<num_words; i++) {
    with_word_total = fc[0][i] + fc[1][i];
    without_word_total = grand_total - with_word_total;
    with_word_entropy = entropy((float)fc[0][i],(float)fc[1][i]);
    without_word_entropy = entropy((float)(grand_totals[0] - fc[0][i]),
                                   (float)(grand_totals[1] - fc[1][i]));
    /* information gain = H(collection) - weighted child entropies */
    ret[i]=(float) (total_entropy -
                    (((double)with_word_total/(double)grand_total)*with_word_entropy) -
                    (((double)without_word_total/(double)grand_total)*without_word_entropy));
    assert (ret[i] >= -1e-7);
    sum += ret[i];
  }
  free(fc[0]);
  free(fc[1]);
  /* "normalize" in similar fashion to tfidf; guard sum == 0 (no word
   * carries any information) */
  if (sum > 0.0) {
    for (i=0; i<num_words; i++) {
      ret[i] = num_words*ret[i]/sum;
    }
  }
  bow_verbosify (bow_progress, "\n");
  return ret;
}
/* this sets the already transformed weights THEN does the normalizing... */
/* Set the per-word weights of every document according to weight_type
 * (RAW / TFIDF / INFOGAIN), or delegate entirely to the fisher-kernel
 * weighting when that kernel is selected.  For TFIDF/INFOGAIN the
 * computed word-weight vector is returned through *WEIGHT_VECT (caller
 * owns it); YVECT is only consulted by the infogain scheme. */
static void svm_set_barrel_weights(bow_wv **docs, int *yvect, int ndocs, float **weight_vect) {
  int i,j;

  /* the weights have yet to be set & since that's what we're using... */
  if (svm_kernel_type == FISHER) {
    svm_set_fisher_barrel_weights(docs, ndocs);
    return;
  } else if (weight_type == RAW) {
    /* raw frequencies: just fold each document's normalizer into its
     * entry weights */
    for (i=0; i<ndocs; i++) {
      for (j=0; j<docs[i]->num_entries; j++) {
        docs[i]->entry[j].weight *= docs[i]->normalizer;
      }
    }
    return;
  } else if (weight_type == TFIDF) {
    *weight_vect = tfidf(docs, ndocs);
  } else if (weight_type == INFOGAIN) {
    *weight_vect = infogain(docs, yvect, ndocs);
  }
  /* Now loop through all the documents, setting their weights */
  for (i=0; i<ndocs; i++) {
    double sum = 0.0;
    for (j=0; j<docs[i]->num_entries; j++) {
      /* scale each entry by the document normalizer and the global
       * per-word weight */
      docs[i]->entry[j].weight *=
        docs[i]->normalizer * (*weight_vect)[docs[i]->entry[j].wi];
      sum += docs[i]->entry[j].weight;
    }
    /* re-normalize only if the document retained any weight at all */
    if (sum >0.0) {
      bow_wv_normalize_weights_by_summing(docs[i]);
      for (j=0; j<docs[i]->num_entries; j++) {
        docs[i]->entry[j].weight *= docs[i]->normalizer;
      }
    }
  }
}
/* similar to barrel weights above, but this only works on 1 wv at a time */