forked from fizx/libbow-osx
-
Notifications
You must be signed in to change notification settings - Fork 0
/
normalize.c
203 lines (160 loc) · 5.65 KB
/
normalize.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
/* Functions for normalizing weights in a bow_barrel */
/* Copyright (C) 1997, 1998 Andrew McCallum
Written by: Andrew Kachites McCallum <[email protected]>
This file is part of the Bag-Of-Words Library, `libbow'.
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public License
as published by the Free Software Foundation, version 2.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */
#include <bow/libbow.h>
#if !HAVE_SQRTF
#define sqrtf sqrt
#endif
/* Running-total callback for Euclidean-length normalization: fold the
   square of the new weight into the accumulated total. */
static float
_accumulate_for_vector_length (float running_total, float new_weight)
{
  float squared = new_weight * new_weight;
  return running_total + squared;
}
static float
_finalize_for_vector_length (float total)
{
return (1.0 / sqrtf (total));
}
/* Running-total callback for sum-to-one normalization: plain
   addition of the new weight onto the accumulated total. */
static float
_accumulate_for_summing (float running_total, float new_weight)
{
  float updated = running_total + new_weight;
  return updated;
}
/* Finishing callback for sum-to-one normalization: given the
   accumulated sum of weights, return the factor that makes the
   vector's entries sum to one. */
static float
_finalize_for_summing (float sum)
{
  float factor = 1.0 / sum;
  return factor;
}
/* Calculate the normalizing factor by which each weight should be
   multiplied, and store it in each cdoc->normalizer.

   ACCUMULATOR folds one entry's weight into a per-document running
   total; FINALIZER converts the finished total into the
   multiplicative normalizing factor.  Only documents whose type is
   bow_doc_train are given a normalizer; all others are skipped.

   FIX: the original tested `current_di == heap->entry[0].current_di`
   BEFORE `heap->length > 0` in both do/while loops below, reading
   entry[0] of a heap that may have just been emptied by
   bow_dv_heap_update().  The operand order is swapped so the
   short-circuit guards the entry[0] access. */
static void
_bow_barrel_normalize_weights (bow_barrel *barrel,
			       float (*accumulator)(float, float),
			       float (*finalizer)(float))
{
  int current_di;		/* the index of the document for which
				   we are currently normalizing the
				   "word vector". */
  float norm_total;		/* the accumulated total for the vector */
  float weight;			/* the weight of a single wi/di entry */
  bow_dv_heap *heap;		/* a heap of "document vectors" */
  bow_cdoc *cdoc;		/* The document we're working on */

  assert (barrel);
  heap = bow_make_dv_heap_from_wi2dvf (barrel->wi2dvf);

  bow_verbosify (bow_progress, "Normalizing weights: ");

  /* Keep going until the heap is empty */
  while (heap->length > 0)
    {
      /* Set the current document we're working on */
      current_di = heap->entry[0].current_di;
      /* A NaN is the only value that compares unequal to itself. */
      assert (heap->entry[0].dv->idf == heap->entry[0].dv->idf);

      if (current_di % 10 == 0)
	bow_verbosify (bow_progress, "\b\b\b\b\b\b%6d", current_di);

      /* Here we should check if this di is part of some training set and
	 move on if it isn't. */

      /* Get the document */
      cdoc = bow_cdocs_di2doc (barrel->cdocs, current_di);

      /* If it's not a model document, then move on to next one */
      if (cdoc->type != bow_doc_train)
	{
	  /* Drain every remaining entry for this di.  Check the heap
	     length *before* reading entry[0]; see FIX note above. */
	  do
	    {
	      bow_dv_heap_update (heap);
	    }
	  while ((heap->length > 0)
		 && (current_di == heap->entry[0].current_di));
	  /* Try again */
	  continue;
	}

      /* Reset the running total */
      norm_total = 0.0;

      /* Loop over all words in this document, summing up the score */
      do
	{
	  weight = heap->entry[0].dv->entry[heap->entry[0].index].weight;
	  norm_total = (*accumulator)(norm_total, weight);
	  /* Update the heap, we are done with this di, move it to its
	     new position */
	  bow_dv_heap_update (heap);
	}
      while ((heap->length > 0)
	     && (current_di == heap->entry[0].current_di));

      /* xxx Why isn't this always true? -am */
      /* assert (norm_total != 0); */

      /* Do final processing of, and store the result. */
      cdoc->normalizer = (*finalizer)(norm_total);
    }

  /* xxx We could actually re-set the weights using the normalizer now
     and avoid storing the normalizer.  This would be easier than
     figuring out the normalizer, because we don't have to use the heap
     again, we can just loop through all the WI's and DVI's. */
  bow_free (heap);
  bow_verbosify (bow_progress, "\n");
}
/* Give every class (or document) weight-vector a normalizer that
   scales it to unit Euclidean length. */
void
bow_barrel_normalize_weights_by_vector_length (bow_barrel *barrel)
{
  _bow_barrel_normalize_weights (barrel, _accumulate_for_vector_length,
				 _finalize_for_vector_length);
}
/* Give every class (or document) weight-vector a normalizer that
   makes its elements sum to 1. */
void
bow_barrel_normalize_weights_by_summing (bow_barrel *barrel)
{
  _bow_barrel_normalize_weights (barrel, _accumulate_for_summing,
				 _finalize_for_summing);
}
/* Set WV's NORMALIZER field so that multiplying every weight in the
   vector by it yields a vector of Euclidean length one.  An empty
   vector gets a normalizer of zero; a vector whose weights are all
   zero is a caller error. */
void
bow_wv_normalize_weights_by_vector_length (bow_wv *wv)
{
  float sum_of_squares = 0.0f;
  int i;

  if (wv->num_entries == 0)
    {
      wv->normalizer = 0;
      return;
    }
  for (i = 0; i < wv->num_entries; i++)
    {
      float w = wv->entry[i].weight;
      sum_of_squares += w * w;
    }
  if (sum_of_squares == 0)
    bow_error ("You forgot to set the weights before normalizing the WV.");
  wv->normalizer = 1.0 / sqrtf (sum_of_squares);
}
/* Set WV's NORMALIZER field so that multiplying every weight in the
   vector by it makes the vector's entries sum to one.  An empty
   vector gets a normalizer of zero; a vector whose weights are all
   zero is a caller error. */
void
bow_wv_normalize_weights_by_summing (bow_wv *wv)
{
  float weight_sum = 0.0f;
  int i;

  if (wv->num_entries == 0)
    {
      wv->normalizer = 0;
      return;
    }
  for (i = 0; i < wv->num_entries; i++)
    weight_sum += wv->entry[i].weight;
  if (weight_sum == 0)
    bow_error ("You forgot to set the weights before normalizing the WV.");
  wv->normalizer = 1.0 / weight_sum;
}