Jarred/kpca #27

Open · wants to merge 4 commits into base: master
95 changes: 95 additions & 0 deletions gurls/demo/kpca_example_script.m
@@ -0,0 +1,95 @@
%%%
%%% Example script for KPCA
%%%

%%
%
% Step 1: Make a data set. This is similar to the 'circles' data set that
% is used in the Wikipedia article for KPCA. I like it, so I will use it
% here.
%
Ncirc = 5;
Nsamp = 100; % Per circle
% Make circles of data
X = zeros(0,2);


for k=1:Ncirc
z = k^2*(0.5+0.2*rand(Nsamp,1)).*exp(2*pi*1i*rand(Nsamp,1));
X = vertcat(X,[real(z) imag(z)]);
end
y = sqrt(sum(X.^2,2));

% (Linearly) embed the data in a higher-dimensional space.
U = orth(randn(50,2)); % Orthonormal embedding matrix (50x2)
X = X*U';

% Normalize X; center & normalize y
X = bsxfun(@rdivide,X,std(X));
y = y - mean(y);
y = y / std(y);


% Split data into testing and training sets
split = randperm(size(X,1));
M = floor(length(split)/2);
Xtr = X(split(1:M),:);
ytr = y(split(1:M));
Xte = X(split((M+1):end),:);
yte = y(split((M+1):end));

%% Unsupervised: Just do PCA and KPCA

% Set up options. This chain has no RLS in it.
opt = gurls_defopt('test');
opt.preproc.kernel.kernel = 'linear';
opt.preproc.n_dims = 2;
opt.seq = {'preproc:kpca_train','preproc:kpca_run'};
opt.process = {[2,2]};
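% (Process codes per the task constants in gurls.m: 0 = ignore, 1 = compute,
% 2 = compute and save, 3 = load from file, 4 = delete. [2,2] runs and
% stores both KPCA tasks for this job.)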
[~,X_PCA] = gurls(X,[],opt,1);

opt.preproc.kernel.kernel = 'poly';
opt.preproc.kernel.c = 1;
opt.preproc.kernel.order = 2;
[~,X_KPCA] = gurls(X,[],opt,1);

figure(1);
subplot(1,2,1);
scatter(X_PCA(:,1),X_PCA(:,2),10,y);
axis equal;
title 'Data after PCA';
subplot(1,2,2);
scatter(X_KPCA(:,1),X_KPCA(:,2),10,y);
axis equal;
title 'Data after KPCA (poly kernel)';
%% Supervised: Regression model w/ dimensionality selection via crossvalidation
opt = gurls_defopt('test');
opt.preproc.kernel.kernel = 'poly';
opt.preproc.kernel.c = 1;
opt.preproc.kernel.order = 2;
opt.preproc.n_dims = 232; % placeholder; overwritten by cross-validation below.
opt.kernel.type = 'linear';
opt.verbose = false;
opt.seq = {'preproc:kpca_train','preproc:kpca_run','rls:primal','pred:primal','perf:rmse'};
opt.process = {[2,2,2,0,0],[3,2,2,2,2]};
opt.newprop('paramsel',struct());
opt.paramsel.lambdas = 1E-5;
cv_params = struct;
cv_params.n_trials = 10;
cv_params.pct_train = 0.75;
cv_params.param_name = 'preproc.n_dims';
cv_params.param_vals = 1:5; % candidate values for preproc.n_dims
cv_params.perf_field = 'rmse';
cv_params.max_or_min = 'min';

preproc_crossvalidate(Xtr,ytr,opt,cv_params); % opt is a handle, so the selected value is set in place

gurls(Xtr, ytr, opt, 1);
[~,X_out]=gurls(Xte, yte, opt, 2);

figure(2); clf;
scatter(yte,opt.pred);
xlabel 'Truth value';
ylabel 'Model prediction';
title 'Linear regression after KPCA (poly kernel)';

16 changes: 12 additions & 4 deletions gurls/gurls.m
@@ -32,7 +32,7 @@
% ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
% POSSIBILITY OF SUCH DAMAGE.

function [opt] = gurls(X, y, opt, jobid)
function [opt,X,y] = gurls(X, y, opt, jobid)



@@ -120,9 +120,17 @@
fName = [reg{1} '_' reg{2}];
fun = str2func(fName);
tic;
opt.newprop(reg{1}, fun(X,y,opt));
opt.time{jobid} = setfield(opt.time{jobid},reg{1}, toc);

% This is hacky, but I can't figure out how to dynamically get the
% number of output arguments for a function.
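% (MATLAB's nargout(fun) should report the declared output count for an
% M-file task; the explicit special case is kept here for clarity.)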
if strcmp(reg{1},'preproc')
% Preprocessing code can change the data
[subopt,X,y] = fun(X,y,opt);
else
subopt = fun(X,y,opt);
end
opt.newprop(reg{1}, subopt);
opt.time{jobid} = setfield(opt.time{jobid},reg{1}, toc);
if opt.verbose
fprintf('\tdone\n');
end
@@ -165,7 +173,7 @@
if opt.verbose
fprintf('\tsaving..\n');
end
otherwise
case DEL
if isprop(opt, reg{1})
opt.(reg{1}) = [];
if opt.verbose
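
With this change, any task in the `preproc` category is expected to return its sub-options struct plus the (possibly modified) data. A minimal sketch of a conforming task, with a hypothetical name:

function [preproc, X, y] = preproc_identity(X, y, opt)
% No-op preprocessing task illustrating the new three-output contract:
% return the sub-options struct plus the (possibly transformed) X and y.
preproc = opt.preproc;
end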
7 changes: 7 additions & 0 deletions gurls/gurls_defopt.m
@@ -72,6 +72,13 @@
opt.newprop( 'nystrom', struct());
opt.nystrom.m = 100;

%% Preprocessing / Dimensionality Reduction options
opt.newprop( 'preproc', struct() );
opt.preproc.kernel = struct;
opt.preproc.kernel.kernel = 'linear';
opt.preproc.center_data = true;
opt.preproc.n_dims = 5;

%% opt base
opt.newprop( 'jobid', 1);
opt.newprop( 'seq', {});
73 changes: 73 additions & 0 deletions gurls/preproc/preproc_crossvalidate.m
@@ -0,0 +1,73 @@
function [ gurls_chain ] = preproc_crossvalidate( X, y, gurls_chain, param_data)
%PREPROC_CROSSVALIDATE Cross-validation for preprocessing parameters. This
% is written generically, so it is likely useful beyond preprocessing as
% well.

% Inputs:
% X: The data. Nxd
% y: The labels/output. Nx1
% gurls_chain: a GURLS `opt` structure. This needs to be fairly specific:
% (1) it needs to have a `train' process and a `test' process (processes
% 1 and 2, respectively)
% (2) it needs to have a perf: structure
% param_data: Information for the parameter selection
%
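% Fields of param_data used below (see the demo script for an example):
%   .n_trials    number of random train/test splits
%   .pct_train   fraction of the samples used for training in each split
%   .param_name  dotted path into opt, e.g. 'preproc.n_dims'
%   .param_vals  vector of candidate parameter values
%   .perf_field  field of opt.perf to score, e.g. 'rmse'
%   .max_or_min  'max' or 'min': whether larger or smaller perf is better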

% Right now only random-subsample cross-validation is implemented.
n_trials = param_data.n_trials;
pct_train = param_data.pct_train;

% Current implementation only supports one parameter.
param_seq = regexp(param_data.param_name,'\.','split');
param_vals = param_data.param_vals;

n_params = length(param_data.param_vals);
perf_data = zeros(n_trials,n_params);


[N,d] = size(X);
for k_trial=1:n_trials
ix = randperm(N)';
ix_split = fix(N*pct_train);
ix_tr = ix(1:ix_split);
ix_te = ix((ix_split+1):end);
fprintf(' [cross validation trial %d/%d]\n', k_trial,n_trials);
for k_param=1:n_params
% Set the parameter
opt = deep_copy(gurls_chain);
opt = set_param(opt,param_seq,param_vals(k_param));
gurls(X(ix_tr,:),y(ix_tr),opt,1);
gurls(X(ix_te,:),y(ix_te),opt,2);
perf_data(k_trial,k_param) = opt.perf.(param_data.perf_field);
end
end
% Find the best parameter value, averaged over trials
switch param_data.max_or_min
case 'max'
[~,arg_opt] = max(mean(perf_data,1));
case 'min'
[~,arg_opt] = min(mean(perf_data,1));
end
best_values = param_vals(arg_opt);
gurls_chain = set_param(gurls_chain,param_seq,best_values);
return;
function C = deep_copy(A)
% This isn't great, but it's the only way to reliably deep copy
% handle objects in MATLAB (that I know of).
tmp = [tempname '.mat'];
save(tmp,'A','-v7.3');
S = load(tmp); % load into a struct (robust inside a nested function)
C = S.A;
delete(tmp);
end

function opt = set_param(opt,seq,val)
if isempty(seq)
opt = val;
else
opt = setfield(opt,seq{1},set_param(opt.(seq{1}),{seq{2:end}},val));
end
end
end

19 changes: 19 additions & 0 deletions gurls/preproc/preproc_kernel_dispatch.m
@@ -0,0 +1,19 @@
function [ K ] = preproc_kernel_dispatch( kernel_opts, X1, X2 )

% There doesn't seem to be any good way of fitting this in the GURLS
% framework. It looks like whoever implemented predkernel_traintest.m
% had to re-implement all of the kernels, so I guess I'll do it here.
switch kernel_opts.kernel
case 'linear'
K = X1*X2';
case 'rbf'
D = square_distance(X1',X2');
K = exp(-0.5*D/kernel_opts.sigma^2);
case 'poly'
K = X1*X2';
K = (K + kernel_opts.c).^kernel_opts.order;
otherwise
error('Only linear, RBF, and polynomial kernels are supported for preprocessing.');
end
end
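
A minimal sketch of exercising the dispatcher directly (toy data; the RBF branch additionally relies on the square_distance utility shipped with GURLS):

kopt = struct('kernel','poly','c',1,'order',2);
X1 = randn(5,3); X2 = randn(4,3);
K = preproc_kernel_dispatch(kopt, X1, X2); % 5x4 Gram matrix
% Sanity check against the definition k(x,z) = (x*z' + c)^order
assert(abs(K(2,3) - (X1(2,:)*X2(3,:)' + kopt.c)^kopt.order) < 1e-12);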

11 changes: 11 additions & 0 deletions gurls/preproc/preproc_kpca_run.m
@@ -0,0 +1,11 @@
function [preproc,X,y] = preproc_kpca_run(X,y,opt)
preproc = opt.preproc;

if strcmp(preproc.kernel.kernel,'linear')
% special case: this makes PCA (with no 'K') faster
X = X*preproc.V;
else
K = preproc_kernel_dispatch(preproc.kernel,preproc.X,X);
X = K'*preproc.V;
end
end
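
For reference, a minimal sketch of calling the train/run pair directly, outside a full GURLS chain (Xtr and Xte are assumed to be N-by-d and M-by-d data matrices):

opt = gurls_defopt('kpca_direct');
opt.preproc.kernel.kernel = 'rbf';
opt.preproc.kernel.sigma = 1;
opt.preproc.n_dims = 2;
opt.preproc = preproc_kpca_train(Xtr, [], opt); % learn V (and store Xtr)
[~, Ztr] = preproc_kpca_run(Xtr, [], opt); % embed the training data
[~, Zte] = preproc_kpca_run(Xte, [], opt); % embed unseen data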
42 changes: 42 additions & 0 deletions gurls/preproc/preproc_kpca_train.m
@@ -0,0 +1,42 @@
function [preproc,X,y] = preproc_kpca_train(X,y,opt)
[N,d] = size(X);
preproc = opt.preproc;

if strcmp(preproc.kernel.kernel,'linear')
% For the linear kernel, we just do PCA on the data.
% This is a lot faster and worth having a special case.
if preproc.center_data
mu = mean(X,1); % average along samples
X2 = bsxfun(@minus,X,mu);
K = X2'*X2;
else
K = X'*X;
end
preproc.V = do_eig(K);
else
% KPCA case.
% Get kernel matrix
K = preproc_kernel_dispatch(preproc.kernel,X,X);

% Kernel centering
if preproc.center_data
ONE = ones(N,N)/N;
K = K - ONE*K - K*ONE + ONE*K*ONE;
end

preproc.V = do_eig(K);
preproc.X = X; % Required for computing the kernel on other data
end

function V=do_eig(K)
% Compute the top eigenvectors. Give hints to MATLAB's solver to
% speed things up a bit.
eig_opts = struct;
eig_opts.isreal = true;
eig_opts.issym = true;
if preproc.n_dims > size(K,1)
preproc.n_dims = size(K,1);
end
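% Dividing by trace(K) only rescales the eigenvalues; the returned
% eigenvectors (and hence the embedding) are unchanged.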
[V,~] = eigs(K/trace(K),preproc.n_dims,'LM',eig_opts);
end
end