From 64e0d5aecb592e91e462c761697d69d28ad06b1b Mon Sep 17 00:00:00 2001 From: Jarred Barber Date: Tue, 15 Dec 2015 19:49:26 -0500 Subject: [PATCH 1/4] Initial commit. --- gurls/gurls.m | 14 +++++++-- gurls/gurls_defopt.m | 7 +++++ gurls/preproc/preproc_kernel_dispatch.m | 21 +++++++++++++ gurls/preproc/preproc_kpca_run.m | 11 +++++++ gurls/preproc/preproc_kpca_train.m | 41 +++++++++++++++++++++++++ 5 files changed, 91 insertions(+), 3 deletions(-) create mode 100644 gurls/preproc/preproc_kernel_dispatch.m create mode 100644 gurls/preproc/preproc_kpca_run.m create mode 100644 gurls/preproc/preproc_kpca_train.m diff --git a/gurls/gurls.m b/gurls/gurls.m index 7976ffd..516f0cc 100644 --- a/gurls/gurls.m +++ b/gurls/gurls.m @@ -120,9 +120,17 @@ fName = [reg{1} '_' reg{2}]; fun = str2func(fName); tic; - opt.newprop(reg{1}, fun(X,y,opt)); - opt.time{jobid} = setfield(opt.time{jobid},reg{1}, toc); + % This is hacky, but I can't figure out how to dynamically get the + % number of output arguments for a function. + if strcmp(reg{1},'preproc') + % Preprocessing code can change the data + [subopt,X,y] = fun(X,y,opt); + else + subopt = fun(X,y,opt); + end + opt.newprop(reg{1}, subopt); + opt.time{jobid} = setfield(opt.time{jobid},reg{1}, toc); if opt.verbose fprintf('\tdone\n'); end @@ -165,7 +173,7 @@ if opt.verbose fprintf('\tsaving..\n'); end - otherwise + case DEL if isprop(opt, reg{1}) opt.(reg{1}) = []; if opt.verbose diff --git a/gurls/gurls_defopt.m b/gurls/gurls_defopt.m index 97c80d5..00617e6 100644 --- a/gurls/gurls_defopt.m +++ b/gurls/gurls_defopt.m @@ -72,6 +72,13 @@ opt.newprop( 'nystrom', struct()); opt.nystrom.m = 100; + %% Preprocessing / Dimensionality Reduction options + opt.newprop( 'preproc', struct() ); + opt.preproc.kernel = struct; + opt.preproc.kernel.kernel = 'linear'; + opt.preproc.center_data = true; + opt.preproc.n_dims = 5; + %% opt base opt.newprop( 'jobid', 1); opt.newprop( 'seq', {}); diff --git a/gurls/preproc/preproc_kernel_dispatch.m b/gurls/preproc/preproc_kernel_dispatch.m new file mode 100644 index 0000000..cc87b64 --- /dev/null +++ b/gurls/preproc/preproc_kernel_dispatch.m @@ -0,0 +1,21 @@ +function [ K ] = preproc_kernel_dispatch( kernel_opts, X1, X2 ) + + % There doesn't seem to be any good way of fitting this in the GURLS + % framework. It looks like whoever implemented predkernel_traintest.m + % had to re-implement all of the kernels, so I guess I'll do it here. + size(X1) + size(X2) + switch kernel_opts.kernel + case 'linear' + K = X1*X2'; + case 'rbf' + D = square_distance(X1',X2'); + K = exp(-0.5*D/kernel_opts.sigma^2); + case 'poly' + K = X1*X2'; + K = (K + kernel_opts.c).^kernel_opts.order; + otherwise + error('Only linear, RBF, and polynomial kernels are supported for preprocessing.'); + end +end + diff --git a/gurls/preproc/preproc_kpca_run.m b/gurls/preproc/preproc_kpca_run.m new file mode 100644 index 0000000..8595c76 --- /dev/null +++ b/gurls/preproc/preproc_kpca_run.m @@ -0,0 +1,11 @@ +function [preproc,X,y] = preproc_kpca_run(X,y,opt) + preproc = opt.preproc; + + if strcmp(preproc.kernel.kernel,'linear') + % special case: this makes PCA (with no 'K') faster + X = X*preproc.V; + else + K = preproc_kernel_dispatch(preproc.kernel,preproc.X,X); + X = K*preproc.V; + end +end \ No newline at end of file diff --git a/gurls/preproc/preproc_kpca_train.m b/gurls/preproc/preproc_kpca_train.m new file mode 100644 index 0000000..22846e5 --- /dev/null +++ b/gurls/preproc/preproc_kpca_train.m @@ -0,0 +1,41 @@ +function [preproc,X,y] = preproc_kpca_train(X,y,opt) + [N,d] = size(X); + preproc = opt.preproc; + + if strcmp(preproc.kernel.kernel,'linear') + if preproc.center_data + mu = mean(X,1); % average along samples + X2 = bsxfun(@minus,X,mu); + K = X2*X2'; + else + K = X'*X; + end + preproc.V = do_eig(K); + else + % Kernel matrix + K = preproc_kernel_dispatch(preproc.kernel,X,X); + + % Kernel centering + if preproc.center_data + ONE = ones(N,N)/N; + K = K - ONE*K - K*ONE + ONE*K*ONE; + end + K = K/trace(K); % Normalize eigenvalues + + + preproc.V = do_eig(K); + preproc.X = X; % Required for computing the kernel on other data + end + + function V=do_eig(K) + % Compute top eigvals. Give hints to MATLABs solver to speed + % things up a bit. + eig_opts = struct; + eig_opts.isreal = true; + eig_opts.issym = true; + if preproc.n_dims > size(K,1) + preproc.n_dims = size(K,1) + end + [V,~] = eigs(K,preproc.n_dims,'LM',eig_opts); + end +end \ No newline at end of file From 3c69aff67b5611f0c9f2c51700b61fde169eaf32 Mon Sep 17 00:00:00 2001 From: Jarred Barber Date: Wed, 16 Dec 2015 19:09:48 -0500 Subject: [PATCH 2/4] Updates --- gurls/gurls.m | 2 +- gurls/preproc/preproc_kernel_dispatch.m | 2 -- gurls/preproc/preproc_kpca_run.m | 2 +- gurls/preproc/preproc_kpca_train.m | 13 +++++++------ 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/gurls/gurls.m b/gurls/gurls.m index 516f0cc..07e1dd6 100644 --- a/gurls/gurls.m +++ b/gurls/gurls.m @@ -32,7 +32,7 @@ % ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE % POSSIBILITY OF SUCH DAMAGE. -function [opt] = gurls(X, y, opt, jobid) +function [opt,X,y] = gurls(X, y, opt, jobid) diff --git a/gurls/preproc/preproc_kernel_dispatch.m b/gurls/preproc/preproc_kernel_dispatch.m index cc87b64..37ef89e 100644 --- a/gurls/preproc/preproc_kernel_dispatch.m +++ b/gurls/preproc/preproc_kernel_dispatch.m @@ -3,8 +3,6 @@ % There doesn't seem to be any good way of fitting this in the GURLS % framework. It looks like whoever implemented predkernel_traintest.m % had to re-implement all of the kernels, so I guess I'll do it here. - size(X1) - size(X2) switch kernel_opts.kernel case 'linear' K = X1*X2'; diff --git a/gurls/preproc/preproc_kpca_run.m b/gurls/preproc/preproc_kpca_run.m index 8595c76..7aa3cd6 100644 --- a/gurls/preproc/preproc_kpca_run.m +++ b/gurls/preproc/preproc_kpca_run.m @@ -6,6 +6,6 @@ X = X*preproc.V; else K = preproc_kernel_dispatch(preproc.kernel,preproc.X,X); - X = K*preproc.V; + X = K'*preproc.V; end end \ No newline at end of file diff --git a/gurls/preproc/preproc_kpca_train.m b/gurls/preproc/preproc_kpca_train.m index 22846e5..8ffd0b8 100644 --- a/gurls/preproc/preproc_kpca_train.m +++ b/gurls/preproc/preproc_kpca_train.m @@ -3,16 +3,19 @@ preproc = opt.preproc; if strcmp(preproc.kernel.kernel,'linear') + % For the linear kernel, we just do PCA on the data. + % This is a lot faster and worth having a special case. if preproc.center_data mu = mean(X,1); % average along samples X2 = bsxfun(@minus,X,mu); - K = X2*X2'; + K = X2'*X2; else K = X'*X; end preproc.V = do_eig(K); else - % Kernel matrix + % KPCA case. + % Get kernel matrix K = preproc_kernel_dispatch(preproc.kernel,X,X); % Kernel centering @@ -20,9 +23,7 @@ ONE = ones(N,N)/N; K = K - ONE*K - K*ONE + ONE*K*ONE; end - K = K/trace(K); % Normalize eigenvalues - - + preproc.V = do_eig(K); preproc.X = X; % Required for computing the kernel on other data end @@ -36,6 +37,6 @@ if preproc.n_dims > size(K,1) preproc.n_dims = size(K,1) end - [V,~] = eigs(K,preproc.n_dims,'LM',eig_opts); + [V,~] = eigs(K/trace(K),preproc.n_dims,'LM',eig_opts); end end \ No newline at end of file From c609863fb53955c184c636e0ca168b9b21d9ecca Mon Sep 17 00:00:00 2001 From: Jarred Barber Date: Wed, 16 Dec 2015 19:10:06 -0500 Subject: [PATCH 3/4] Initial crossvaligdation commit --- gurls/preproc/preproc_crossvalidate.m | 73 +++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 gurls/preproc/preproc_crossvalidate.m diff --git a/gurls/preproc/preproc_crossvalidate.m b/gurls/preproc/preproc_crossvalidate.m new file mode 100644 index 0000000..51afe7a --- /dev/null +++ b/gurls/preproc/preproc_crossvalidate.m @@ -0,0 +1,73 @@ +function [ gurls_chain ] = preproc_crossvalidate( X, y, gurls_chain, param_data) +%PREPROC_CROSSVALIDATE Crossvalidation for preprocessing. This is also +% probably pretty useful for other things since it is architected in a +% general way. + +% Inputs: +% X: The data. Nxd +% y: The labels/output. Nx1 +% gurls_chain: a GURLS `opt` structure. This needs to be fairly specific: +% (1) it needs to have a `train' process and a `test' process (processes +% 1 and 2, respectively) +% (2) it needs to have a perf: structure +% param_data: Information for the parameter selection +% + +% Right now I have random subsample cross validation implemented +n_trials = param_data.n_trials; +pct_train = param_data.pct_train; + +% Current implementation only supports one parameter. +param_seq = regexp(param_data.param_name,'\.','split'); +param_vals = param_data.param_vals; + +n_params = length(param_data.param_vals); +perf_data = zeros(n_trials,n_params); + + +[N,d] = size(X); +for k_trial=1:n_trials + ix = randperm(N)'; + ix_split = fix(N*pct_train); + ix_tr = ix(1:ix_split); + ix_te = ix((ix_split+1):end); + fprintf(' [cross validation trial %d/%d]\n', k_trial,n_trials); + for k_param=1:n_params + % Set the parameter + opt = deep_copy(gurls_chain); + opt = set_param(opt,param_seq,param_vals(k_param)); + gurls(X(ix_tr,:),y(ix_tr),opt,1); + gurls(X(ix_te,:),y(ix_te),opt,2); + perf_data(k_trial,k_param) = opt.perf.(param_data.perf_field); + end +end +mean(perf_data) +% Find maximum averaged over trials +switch param_data.max_or_min + case 'max' + [~,arg_opt] = max(mean(perf_data,1)); + case 'min' + [~,arg_opt] = min(mean(perf_data,1)); +end +best_values = param_vals(arg_opt); +gurls_chain = set_param(gurls_chain,param_seq,best_values); +return; + function C = deep_copy(A) + % This isn't great, but it's the only way to reliably deep copy + % handle objects in MATLAB (that I know of). + tmp = [tempname '.mat']; + save(tmp,'A','-v7.3'); + load(tmp); + C = A; + delete(tmp); + end + + function opt = set_param(opt,seq,val) + if isempty(seq) + opt = val; + else + opt = setfield(opt,seq{1},set_param(opt.(seq{1}),{seq{2:end}},val)); + end + end +end + From b02c817a791b14ee3383b0de19747a536aeff188 Mon Sep 17 00:00:00 2001 From: Jarred Barber Date: Wed, 16 Dec 2015 21:18:53 -0500 Subject: [PATCH 4/4] Checking in example script --- gurls/demo/kpca_example_script.m | 95 ++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 gurls/demo/kpca_example_script.m diff --git a/gurls/demo/kpca_example_script.m b/gurls/demo/kpca_example_script.m new file mode 100644 index 0000000..59f02c1 --- /dev/null +++ b/gurls/demo/kpca_example_script.m @@ -0,0 +1,95 @@ +%%% +%%% Example script for KPCA +%%% + +%% +% +% Step 1: Make a data set. This is similar to the 'circles' data set that +% is used in the Wikipedia article for KPCA. I like it, so I will use it +% here. +% +Ncirc = 5; +Nsamp = 100; % Per circle +% Make circles of data +X = zeros(0,2); + + +for k=1:Ncirc + z = k^2*(0.5+0.2*rand(Nsamp,1)).*exp(2*pi*1i*rand(Nsamp,1)); + X = vertcat(X,[real(z) imag(z)]); +end +y = sqrt(sum(X.^2,2)); + +% (Linearlly) embed the data in a higher dimensional space. +U = orth(randn(50,2)); % Projector +X = X*U'; + +% Center & normalize +X = bsxfun(@rdivide,X,std(X)); +y = y - mean(y); +y = y / std(y); + + +% Split data into testing and training sets +split = randperm(size(X,1)); +M = floor(length(split)/2); +Xtr = X(split(1:M),:); +ytr = y(split(1:M)); +Xte = X(split((M+1):end),:); +yte = y(split((M+1):end)); + +%% Unsupervised: Just do PCA and KPCA + +% Setup options. This chain has no RLS in it. +opt = gurls_defopt('test'); +opt.preproc.kernel.kernel = 'linear'; +opt.preproc.n_dims = 2; +opt.seq = {'preproc:kpca_train','preproc:kpca_run'}; +opt.process = {[2,2]}; +[~,X_PCA] = gurls(X,[],opt,1); + +opt.preproc.kernel.kernel = 'poly'; +opt.preproc.kernel.c = 1; +opt.preproc.kernel.order = 2; +[~,X_KPCA] = gurls(X,[],opt,1); + +figure(1); +subplot(1,2,1); +scatter(X_PCA(:,1),X_PCA(:,2),10,y); +axis equal; +title 'Data after PCA'; +subplot(1,2,2); +scatter(X_KPCA(:,1),X_KPCA(:,2),10,y); +axis equal; +title 'Data after KPCA (poly kernel)'; +%% Supervised: Regression model w/ dimensionality selection via crossvalidation +opt = gurls_defopt('test'); +opt.preproc.kernel.kernel = 'poly'; +opt.preproc.kernel.c = 1; +opt.preproc.kernel.order = 2; +opt.preproc.n_dims = 232; % set to whatever, this is set by cross-validation. +opt.kernel.type = 'linear'; +opt.verbose = false; +opt.seq = {'preproc:kpca_train','preproc:kpca_run','rls:primal','pred:primal','perf:rmse'}; +opt.process = {[2,2,2,0,0],[3,2,2,2,2]}; +opt.newprop('paramsel',struct()); +opt.paramsel.lambdas = 1E-5; +cv_params = struct; +cv_params.n_trials = 10; +cv_params.pct_train = 0.75; +cv_params.param_name = 'preproc.n_dims'; +cv_params.param_vals = 1:5;%[0 1 2 3 4 5]; +cv_params.perf_field = 'rmse'; +cv_params.max_or_min = 'min'; + +preproc_crossvalidate(Xtr,Xte,opt,cv_params); + +gurls(Xtr, ytr, opt, 1); +[~,X_out]=gurls(Xte, yte, opt, 2); + +figure(2); clf; +scatter(yte,opt.pred); +xlabel 'Truth value'; +ylabel 'Model prediction'; +title 'Linear regression after PCA'; +