"""
This tutorial introduces logistic regression using Theano and conjugate
gradient descent.
Logistic regression is a probabilistic, linear classifier. It is parametrized
by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is
done by projecting data points onto a set of hyperplanes, the distance to
which is used to determine a class membership probability.
Mathematically, this can be written as:
.. math::
P(Y=i|x, W,b) &= softmax_i(W x + b) \\
&= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}}
The output of the model or prediction is then done by taking the argmax of
the vector whose i'th element is P(Y=i|x).
.. math::
y_{pred} = argmax_i P(Y=i|x,W,b)
This tutorial presents a conjugate gradient optimization method that is
suitable for smaller datasets.
References:
- textbooks: "Pattern Recognition and Machine Learning" -
Christopher M. Bishop, section 4.3.2
"""
from __future__ import print_function, division
__docformat__ = 'restructuredtext en'
import os
import sys
import timeit
import numpy
import theano
import theano.tensor as T
from logistic_sgd import load_data
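
# Illustrative sketch (not part of the original tutorial): the softmax and
# argmax prediction from the module docstring, computed with plain NumPy for
# a single input vector. The helper name `_softmax_example` and the random
# toy values are hypothetical and serve only as an illustration.
def _softmax_example():
    rng = numpy.random.RandomState(0)
    W = rng.randn(3, 4)              # weight matrix of shape (n_in, n_out)
    b = numpy.zeros(4)               # bias vector of shape (n_out,)
    x = rng.randn(3)                 # one datapoint of dimension n_in
    scores = numpy.dot(x, W) + b     # W x + b
    scores = scores - scores.max()   # shift scores for numerical stability
    p = numpy.exp(scores) / numpy.exp(scores).sum()   # P(Y=i|x, W, b)
    return p.argmax()                # y_pred = argmax_i P(Y=i|x, W, b)
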
class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
        """Initialize the parameters of the logistic regression

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoint lies

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the target lies
        """

        # initialize theta = (W, b) with 0s; W gets the shape (n_in, n_out),
        # while b is a vector of n_out elements, making theta a vector of
        # n_in*n_out + n_out elements
        self.theta = theano.shared(
            value=numpy.zeros(
                n_in * n_out + n_out,
                dtype=theano.config.floatX
            ),
            name='theta',
            borrow=True
        )
        # W is represented by the first n_in*n_out elements of theta
        self.W = self.theta[0:n_in * n_out].reshape((n_in, n_out))
        # b is the rest (last n_out elements)
        self.b = self.theta[n_in * n_out:n_in * n_out + n_out]

        # compute the vector of class-membership probabilities in symbolic form
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)

        # compute the prediction as the class whose probability is maximal,
        # in symbolic form
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

        # keep track of the model input
        self.input = input

    def negative_log_likelihood(self, y):
        r"""Return the mean negative log-likelihood of the predictions of
        this model under a given target distribution.

        .. math::

            \ell(\theta=\{W,b\}, \mathcal{D}) =
                \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
                \log P(Y=y^{(i)}|x^{(i)}, W, b)

        This method returns :math:`-\ell(\theta, \mathcal{D})`.

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example the
                  correct label
        """
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

    def errors(self, y):
        """Return a float representing the fraction of errors in the
        minibatch, i.e. the number of errors over the total number of
        examples in the minibatch.

        :type y: theano.tensor.TensorType
        :param y: corresponds to a vector that gives for each example
                  the correct label
        """

        # check if y has the same dimension as y_pred
        if y.ndim != self.y_pred.ndim:
            raise TypeError(
                'y should have the same shape as self.y_pred',
                ('y', y.type, 'y_pred', self.y_pred.type)
            )
        # check if y is of the correct datatype
        if y.dtype.startswith('int'):
            # the T.neq operator returns a vector of 0s and 1s, where 1
            # represents a mistake in prediction
            return T.mean(T.neq(self.y_pred, y))
        else:
            raise NotImplementedError()
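
# Illustrative sketch (not part of the original tutorial): scipy's
# general-purpose optimizers expect the parameters as a single flat 1-D
# vector, which is why the class above stores W and b packed together in
# `theta`. The hypothetical helper below shows how such a flat NumPy vector
# maps back to a weight matrix and a bias vector.
def _unpack_theta_example(theta_value, n_in, n_out):
    W = theta_value[:n_in * n_out].reshape((n_in, n_out))  # first n_in*n_out entries
    b = theta_value[n_in * n_out:]                          # remaining n_out entries
    return W, b
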
def cg_optimization_mnist(n_epochs=50, mnist_pkl_gz='mnist.pkl.gz'):
    """Demonstrate conjugate gradient optimization of a log-linear model

    This is demonstrated on MNIST.

    :type n_epochs: int
    :param n_epochs: number of epochs to run the optimizer

    :type mnist_pkl_gz: string
    :param mnist_pkl_gz: the path of the MNIST training file from
                         http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
    """
    #############
    # LOAD DATA #
    #############
    datasets = load_data(mnist_pkl_gz)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    batch_size = 600    # size of the minibatch

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    n_in = 28 * 28  # number of input units
    n_out = 10      # number of output units
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    minibatch_offset = T.lscalar()  # offset to the start of a [mini]batch
    x = T.matrix()   # the data is presented as rasterized images
    y = T.ivector()  # the labels are presented as a 1D vector of [int] labels

    # construct the logistic regression class
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model, in symbolic form
    cost = classifier.negative_log_likelihood(y).mean()
    # compile a theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        [minibatch_offset],
        classifier.errors(y),
        givens={
            x: test_set_x[minibatch_offset:minibatch_offset + batch_size],
            y: test_set_y[minibatch_offset:minibatch_offset + batch_size]
        },
        name="test"
    )

    validate_model = theano.function(
        [minibatch_offset],
        classifier.errors(y),
        givens={
            x: valid_set_x[minibatch_offset:minibatch_offset + batch_size],
            y: valid_set_y[minibatch_offset:minibatch_offset + batch_size]
        },
        name="validate"
    )

    # compile a theano function that returns the cost of a minibatch
    batch_cost = theano.function(
        [minibatch_offset],
        cost,
        givens={
            x: train_set_x[minibatch_offset:minibatch_offset + batch_size],
            y: train_set_y[minibatch_offset:minibatch_offset + batch_size]
        },
        name="batch_cost"
    )

    # compile a theano function that returns the gradient of the minibatch
    # cost with respect to theta
    batch_grad = theano.function(
        [minibatch_offset],
        T.grad(cost, classifier.theta),
        givens={
            x: train_set_x[minibatch_offset:minibatch_offset + batch_size],
            y: train_set_y[minibatch_offset:minibatch_offset + batch_size]
        },
        name="batch_grad"
    )
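
    # scipy's conjugate gradient routine optimizes a full-batch objective,
    # so the two wrappers below load a candidate theta into the shared
    # variable and then average batch_cost / batch_grad over every training
    # minibatch.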
    # creates a function that computes the average cost on the training set
    def train_fn(theta_value):
        classifier.theta.set_value(theta_value, borrow=True)
        train_losses = [batch_cost(i * batch_size)
                        for i in range(n_train_batches)]
        return numpy.mean(train_losses)

    # creates a function that computes the average gradient of the cost with
    # respect to theta
    def train_fn_grad(theta_value):
        classifier.theta.set_value(theta_value, borrow=True)
        grad = batch_grad(0)
        for i in range(1, n_train_batches):
            grad += batch_grad(i * batch_size)
        return grad / n_train_batches
    validation_scores = [numpy.inf, 0]

    # creates the validation function
    def callback(theta_value):
        classifier.theta.set_value(theta_value, borrow=True)
        # compute the validation loss
        validation_losses = [validate_model(i * batch_size)
                             for i in range(n_valid_batches)]
        this_validation_loss = numpy.mean(validation_losses)
        print('validation error %f %%' % (this_validation_loss * 100.,))

        # check if it is better than the best validation score obtained so far
        if this_validation_loss < validation_scores[0]:
            # if so, replace the old one, and compute the score on the
            # testing dataset
            validation_scores[0] = this_validation_loss
            test_losses = [test_model(i * batch_size)
                           for i in range(n_test_batches)]
            validation_scores[1] = numpy.mean(test_losses)
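
    # scipy.optimize.fmin_cg calls `callback` once per iteration with the
    # current parameter vector, so validation error (and, on improvement,
    # test error) is tracked as the optimization proceeds.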
    ###############
    # TRAIN MODEL #
    ###############

    # using the scipy conjugate gradient optimizer
    import scipy.optimize
    print("Optimizing using scipy.optimize.fmin_cg...")
    start_time = timeit.default_timer()
    best_w_b = scipy.optimize.fmin_cg(
        f=train_fn,
        x0=numpy.zeros((n_in + 1) * n_out, dtype=x.dtype),
        fprime=train_fn_grad,
        callback=callback,
        disp=0,
        maxiter=n_epochs
    )
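    # fmin_cg returns the flat parameter vector reached at termination; the
    # best validation score and its test score were recorded by `callback`.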
    end_time = timeit.default_timer()
    print(('Optimization complete with best validation score of %f %%, with '
           'test performance %f %%'
           ) % (validation_scores[0] * 100., validation_scores[1] * 100.))
    print('The code for file ' + os.path.split(__file__)[1] +
          ' ran for %.1fs' % (end_time - start_time), file=sys.stderr)

if __name__ == '__main__':
    cg_optimization_mnist()