Jarred/kpca #27

Open · wants to merge 4 commits into base: master
95 changes: 95 additions & 0 deletions gurls/demo/kpca_example_script.m
@@ -0,0 +1,95 @@
%%%
%%% Example script for KPCA
%%%

%%
%
% Step 1: Make a data set. This is similar to the 'circles' data set that
% is used in the Wikipedia article for KPCA. I like it, so I will use it
% here.
%
Ncirc = 5;
Nsamp = 100; % Per circle
% Make circles of data
X = zeros(0,2);


for k=1:Ncirc
z = k^2*(0.5+0.2*rand(Nsamp,1)).*exp(2*pi*1i*rand(Nsamp,1));
X = vertcat(X,[real(z) imag(z)]);
end
y = sqrt(sum(X.^2,2));

% (Linearly) embed the data in a higher-dimensional space.
U = orth(randn(50,2)); % Orthonormal embedding matrix (50x2)
X = X*U';

% Normalize X; center & normalize y
X = bsxfun(@rdivide,X,std(X));
y = y - mean(y);
y = y / std(y);


% Split data into testing and training sets
split = randperm(size(X,1));
M = floor(length(split)/2);
Xtr = X(split(1:M),:);
ytr = y(split(1:M));
Xte = X(split((M+1):end),:);
yte = y(split((M+1):end));

%% Unsupervised: Just do PCA and KPCA

% Set up options. This chain has no RLS in it.
opt = gurls_defopt('test');
opt.preproc.kernel.kernel = 'linear';
opt.preproc.n_dims = 2;
opt.seq = {'preproc:kpca_train','preproc:kpca_run'};
opt.process = {[2,2]};
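% (Process codes per the task constants in gurls.m: 0 = ignore, 1 = compute,
% 2 = compute and save, 3 = load from file, 4 = delete. [2,2] runs and
% stores both KPCA tasks for this job.)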
[~,X_PCA] = gurls(X,[],opt,1);

opt.preproc.kernel.kernel = 'poly';
opt.preproc.kernel.c = 1;
opt.preproc.kernel.order = 2;
[~,X_KPCA] = gurls(X,[],opt,1);

figure(1);
subplot(1,2,1);
scatter(X_PCA(:,1),X_PCA(:,2),10,y);
axis equal;
title 'Data after PCA';
subplot(1,2,2);
scatter(X_KPCA(:,1),X_KPCA(:,2),10,y);
axis equal;
title 'Data after KPCA (poly kernel)';
%% Supervised: Regression model w/ dimensionality selection via crossvalidation
opt = gurls_defopt('test');
opt.preproc.kernel.kernel = 'poly';
opt.preproc.kernel.c = 1;
opt.preproc.kernel.order = 2;
opt.preproc.n_dims = 232; % placeholder; overwritten by cross-validation below.
opt.kernel.type = 'linear';
opt.verbose = false;
opt.seq = {'preproc:kpca_train','preproc:kpca_run','rls:primal','pred:primal','perf:rmse'};
opt.process = {[2,2,2,0,0],[3,2,2,2,2]};
opt.newprop('paramsel',struct());
opt.paramsel.lambdas = 1E-5;
cv_params = struct;
cv_params.n_trials = 10;
cv_params.pct_train = 0.75;
cv_params.param_name = 'preproc.n_dims';
cv_params.param_vals = 1:5; % candidate values for preproc.n_dims
cv_params.perf_field = 'rmse';
cv_params.max_or_min = 'min';

preproc_crossvalidate(Xtr,ytr,opt,cv_params); % opt is a handle, so the selected value is set in place

gurls(Xtr, ytr, opt, 1);
[~,X_out]=gurls(Xte, yte, opt, 2);

figure(2); clf;
scatter(yte,opt.pred);
xlabel 'Truth value';
ylabel 'Model prediction';
title 'Linear regression after KPCA (poly kernel)';

16 changes: 12 additions & 4 deletions gurls/gurls.m
@@ -32,7 +32,7 @@
% ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
% POSSIBILITY OF SUCH DAMAGE.

function [opt] = gurls(X, y, opt, jobid)
function [opt,X,y] = gurls(X, y, opt, jobid)



@@ -120,9 +120,17 @@
fName = [reg{1} '_' reg{2}];
fun = str2func(fName);
tic;
opt.newprop(reg{1}, fun(X,y,opt));
opt.time{jobid} = setfield(opt.time{jobid},reg{1}, toc);

% This is hacky, but I can't figure out how to dynamically get the
% number of output arguments for a function.
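% (MATLAB's nargout(fun) should report the declared output count for an
% M-file task; the explicit special case is kept here for clarity.)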
if strcmp(reg{1},'preproc')
% Preprocessing code can change the data
[subopt,X,y] = fun(X,y,opt);
else
subopt = fun(X,y,opt);
end
opt.newprop(reg{1}, subopt);
opt.time{jobid} = setfield(opt.time{jobid},reg{1}, toc);
if opt.verbose
fprintf('\tdone\n');
end
@@ -165,7 +173,7 @@
if opt.verbose
fprintf('\tsaving..\n');
end
otherwise
case DEL
if isprop(opt, reg{1})
opt.(reg{1}) = [];
if opt.verbose
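
With this change, any task in the `preproc` category is expected to return its sub-options struct plus the (possibly modified) data. A minimal sketch of a conforming task, with a hypothetical name:

function [preproc, X, y] = preproc_identity(X, y, opt)
% No-op preprocessing task illustrating the new three-output contract:
% return the sub-options struct plus the (possibly transformed) X and y.
preproc = opt.preproc;
end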
7 changes: 7 additions & 0 deletions gurls/gurls_defopt.m
@@ -72,6 +72,13 @@
opt.newprop( 'nystrom', struct());
opt.nystrom.m = 100;

%% Preprocessing / Dimensionality Reduction options
opt.newprop( 'preproc', struct() );
opt.preproc.kernel = struct;
opt.preproc.kernel.kernel = 'linear';
opt.preproc.center_data = true;
opt.preproc.n_dims = 5;

%% opt base
opt.newprop( 'jobid', 1);
opt.newprop( 'seq', {});
73 changes: 73 additions & 0 deletions gurls/preproc/preproc_crossvalidate.m
@@ -0,0 +1,73 @@
function [ gurls_chain ] = preproc_crossvalidate( X, y, gurls_chain, param_data)
%PREPROC_CROSSVALIDATE Cross-validation for preprocessing parameters. This
% is written generically, so it is likely useful beyond preprocessing as
% well.

% Inputs:
% X: The data. Nxd
% y: The labels/output. Nx1
% gurls_chain: a GURLS `opt` structure. This needs to be fairly specific:
% (1) it needs to have a `train' process and a `test' process (processes
% 1 and 2, respectively)
% (2) it needs to have a perf: structure
% param_data: Information for the parameter selection
%
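% Fields of param_data used below (see the demo script for an example):
%   .n_trials    number of random train/test splits
%   .pct_train   fraction of the samples used for training in each split
%   .param_name  dotted path into opt, e.g. 'preproc.n_dims'
%   .param_vals  vector of candidate parameter values
%   .perf_field  field of opt.perf to score, e.g. 'rmse'
%   .max_or_min  'max' or 'min': whether larger or smaller perf is better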

% Right now only random-subsample cross-validation is implemented.
n_trials = param_data.n_trials;
pct_train = param_data.pct_train;

% Current implementation only supports one parameter.
param_seq = regexp(param_data.param_name,'\.','split');
param_vals = param_data.param_vals;

n_params = length(param_data.param_vals);
perf_data = zeros(n_trials,n_params);


[N,d] = size(X);
for k_trial=1:n_trials
ix = randperm(N)';
ix_split = fix(N*pct_train);
ix_tr = ix(1:ix_split);
ix_te = ix((ix_split+1):end);
fprintf(' [cross validation trial %d/%d]\n', k_trial,n_trials);
for k_param=1:n_params
% Set the parameter
opt = deep_copy(gurls_chain);
opt = set_param(opt,param_seq,param_vals(k_param));
gurls(X(ix_tr,:),y(ix_tr),opt,1);
gurls(X(ix_te,:),y(ix_te),opt,2);
perf_data(k_trial,k_param) = opt.perf.(param_data.perf_field);
end
end
% Find the best parameter value, averaged over trials
switch param_data.max_or_min
case 'max'
[~,arg_opt] = max(mean(perf_data,1));
case 'min'
[~,arg_opt] = min(mean(perf_data,1));
end
best_values = param_vals(arg_opt);
gurls_chain = set_param(gurls_chain,param_seq,best_values);
return;
function C = deep_copy(A)
% This isn't great, but it's the only way to reliably deep copy
% handle objects in MATLAB (that I know of).
tmp = [tempname '.mat'];
save(tmp,'A','-v7.3');
S = load(tmp); % load into a struct (robust inside a nested function)
C = S.A;
delete(tmp);
end

function opt = set_param(opt,seq,val)
if isempty(seq)
opt = val;
else
opt = setfield(opt,seq{1},set_param(opt.(seq{1}),{seq{2:end}},val));
end
end
end

19 changes: 19 additions & 0 deletions gurls/preproc/preproc_kernel_dispatch.m
@@ -0,0 +1,19 @@
function [ K ] = preproc_kernel_dispatch( kernel_opts, X1, X2 )

% There doesn't seem to be any good way of fitting this in the GURLS
% framework. It looks like whoever implemented predkernel_traintest.m
% had to re-implement all of the kernels, so I guess I'll do it here.
switch kernel_opts.kernel
case 'linear'
K = X1*X2';
case 'rbf'
D = square_distance(X1',X2');
K = exp(-0.5*D/kernel_opts.sigma^2);
case 'poly'
K = X1*X2';
K = (K + kernel_opts.c).^kernel_opts.order;
otherwise
error('Only linear, RBF, and polynomial kernels are supported for preprocessing.');
end
end
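
A minimal sketch of exercising the dispatcher directly (toy data; the RBF branch additionally relies on the square_distance utility shipped with GURLS):

kopt = struct('kernel','poly','c',1,'order',2);
X1 = randn(5,3); X2 = randn(4,3);
K = preproc_kernel_dispatch(kopt, X1, X2); % 5x4 Gram matrix
% Sanity check against the definition k(x,z) = (x*z' + c)^order
assert(abs(K(2,3) - (X1(2,:)*X2(3,:)' + kopt.c)^kopt.order) < 1e-12);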

11 changes: 11 additions & 0 deletions gurls/preproc/preproc_kpca_run.m
@@ -0,0 +1,11 @@
function [preproc,X,y] = preproc_kpca_run(X,y,opt)
preproc = opt.preproc;

if strcmp(preproc.kernel.kernel,'linear')
% special case: this makes PCA (with no 'K') faster
X = X*preproc.V;
else
K = preproc_kernel_dispatch(preproc.kernel,preproc.X,X);
X = K'*preproc.V;
end
end
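
For reference, a minimal sketch of calling the train/run pair directly, outside a full GURLS chain (Xtr and Xte are assumed to be N-by-d and M-by-d data matrices):

opt = gurls_defopt('kpca_direct');
opt.preproc.kernel.kernel = 'rbf';
opt.preproc.kernel.sigma = 1;
opt.preproc.n_dims = 2;
opt.preproc = preproc_kpca_train(Xtr, [], opt); % learn V (and store Xtr)
[~, Ztr] = preproc_kpca_run(Xtr, [], opt); % embed the training data
[~, Zte] = preproc_kpca_run(Xte, [], opt); % embed unseen data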
42 changes: 42 additions & 0 deletions gurls/preproc/preproc_kpca_train.m
@@ -0,0 +1,42 @@
function [preproc,X,y] = preproc_kpca_train(X,y,opt)
[N,d] = size(X);
preproc = opt.preproc;

if strcmp(preproc.kernel.kernel,'linear')
% For the linear kernel, we just do PCA on the data.
% This is a lot faster and worth having a special case.
if preproc.center_data
mu = mean(X,1); % average along samples
X2 = bsxfun(@minus,X,mu);
K = X2'*X2;
else
K = X'*X;
end
preproc.V = do_eig(K);
else
% KPCA case.
% Get kernel matrix
K = preproc_kernel_dispatch(preproc.kernel,X,X);

% Kernel centering
if preproc.center_data
ONE = ones(N,N)/N;
K = K - ONE*K - K*ONE + ONE*K*ONE;
end

preproc.V = do_eig(K);
preproc.X = X; % Required for computing the kernel on other data
end

function V=do_eig(K)
% Compute the top eigenvectors. Give hints to MATLAB's solver to
% speed things up a bit.
eig_opts = struct;
eig_opts.isreal = true;
eig_opts.issym = true;
if preproc.n_dims > size(K,1)
preproc.n_dims = size(K,1);
end
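% Dividing by trace(K) only rescales the eigenvalues; the returned
% eigenvectors (and hence the embedding) are unchanged.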
[V,~] = eigs(K/trace(K),preproc.n_dims,'LM',eig_opts);
end
end