You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

765 lines
22 KiB

% kmeans_tests.m
%
% David Rowe June 2017
%
%
% Trying a few variations on the kmeans algorithm for quantisation of
% spectral envelope
1;
%----------------------------------------------------------------------
%
% User defined search functions
%
%----------------------------------------------------------------------
%----------------------------------------------------------------------
% standard mean squared error search
function [idx contrib errors test_ g mg sl] = vq_search_mse(vq, data)
[nVec nCols] = size(vq);
nRows = rows(data);
error = zeros(1,nVec);
errors = zeros(1, nRows);
idx = zeros(1, nRows);
contrib = zeros(nRows, nCols);
test_ = zeros(nVec, nCols);
for f=1:nRows
target = data(f,:);
for i=1:nVec
diff = target - vq(i,:);
error(i) = diff * diff';
end
[mn min_ind] = min(error);
errors(f) = mn; idx(f) = min_ind; contrib(f,:) = vq(min_ind,:);
test_(f,:) = vq(min_ind,:);
end
g = 0; mg = 1; sl = 0; % dummys for this function
endfunction
%----------------------------------------------------------------------
% abs() search with a linear gain term
function [idx contrib errors test_ g mg sl] = vq_search_gain(vq, data)
[nVec nCols] = size(vq);
nRows = rows(data);
error = zeros(1,nVec);
g = zeros(nRows, nVec);
diff = zeros(nVec, nCols);
errors = zeros(1, nRows);
idx = zeros(1, nRows);
contrib = zeros(nRows, nCols);
test_ = zeros(nVec, nCols);
for f=1:nRows
target = data(f,:);
for i=1:nVec
% work out gain for best match
g(f, i) = (sum(target) - sum(vq(i,:)))/nCols;
diff(i,:) = target - vq(i,:) - g(f, i);
% abs in dB is MSE in linear
error(i) = mean(abs(diff(i,:)));
%printf("f: %d i: %d g: %f error: %f\n", f, i, g(f, i), error(i));
end
[mn min_ind] = min(error);
idx(f) = min_ind;
errors(f) = mn;
contrib(f,:) = test_(f,:) = vq(min_ind,:) + g(f,min_ind);
end
mg = 1; sl = 0;
endfunction
%----------------------------------------------------------------------
% abs() search with a linear plus ampl scaling term
function [idx contrib errors test_ g mg sl] = vq_search_mag(vq, data)
[nVec nCols] = size(vq);
nRows = rows(data);
g = mg = zeros(nRows, nVec);
diff = zeros(nVec, nCols);
errors = zeros(1, nRows);
idx = error = zeros(1, nVec);
contrib = zeros(nRows, nCols);
test_ = zeros(nVec, nCols);
weights = ones(1,nCols);
%weights(1) = 0;
for f=1:nRows
target = data(f,:);
%target = 2*vq(1,:)+1;
for i=1:nVec
% work out gain and amp scaling for best match
A = [sum(vq(i,:)) nCols; vq(i,:)*vq(i,:)' sum(vq(i,:))];
c = [sum(target) target*vq(i,:)']';
b = inv(A)*c;
g(f,i) = b(2); mg(f,i) = b(1);
diff(i,:) = target - (mg(f,i)*vq(i,:) + g(f,i));
diff(i,:) .* weights;
% abs in dB is MSE in linear
error(i) = mean(abs(diff(i,:)));
%printf("f: %d i: %d mg: %f g: %f error: %f\n", f, i, mg(f,i), g(f,i), error(i));
end
[mn min_ind] = min(error);
errors(f) = mn;
idx(f) = min_ind;
contrib(f,:) = test_(f,:) = mg(f,min_ind) * vq(min_ind,:) + g(f,min_ind);
end
sl = 0;
endfunction
%----------------------------------------------------------------------
% abs() search with a linear, ampl scaling, and slope term
function [idx contrib errors test_ g mg sl] = vq_search_slope(vq, data)
[nVec nCols] = size(vq);
nRows = rows(data);
g = mg = sl = zeros(nRows, nVec);
diff = zeros(nVec, nCols);
errors = zeros(1, nRows);
idx = error = zeros(1, nVec);
contrib = zeros(nRows, nCols);
test_ = zeros(nVec, nCols);
weights = ones(1,nCols);
for f=1:nRows
target = data(f,:);
%target = 2*vq(1,:)+1;
for i=1:nVec
% work out gain, amp and slope for best match, 3 unknowns, 3 equations
A = [sum(vq(i,:)) nCols sum(1:nCols); ...
vq(i,:)*vq(i,:)' sum(vq(i,:)) (1:nCols)*vq(i,:)'; ...
(1:nCols)*vq(i,:)' sum(1:nCols) (1:nCols)*(1:nCols)'];
c = [sum(target) target*vq(i,:)' target*(1:nCols)']';
b = inv(A)*c;
mg(f,i) = b(1); g(f,i) = b(2); sl(f,i) = b(3);
diff(i,:) = target - (mg(f,i)*vq(i,:) + g(f,i) + sl(f,i)*(1:nCols));
diff(i,:) .* weights;
% abs in dB is MSE in linear
error(i) = mean(abs(diff(i,:)));
%printf("f: %d i: %d mg: %f g: %f sl: %f error: %f\n", f, i, mg(f,i), g(f,i), sl(f,i), error(i));
end
[mn min_ind] = min(error);
errors(f) = mn;
idx(f) = min_ind;
contrib(f,:) = test_(f,:) = mg(f,min_ind)*vq(min_ind,:) + g(f,min_ind) + sl(f,min_ind)*(1:nCols);
end
endfunction
%----------------------------------------------------------------------
%
% Functions to support simulation of different VQ training and testing
%
%----------------------------------------------------------------------
% evaluate database test using vq, with selectable search function. Can be operated in
% GUI mode to analyse in fine detail or batch mode to evaluate lots of data.
function sd_per_frame = run_test(vq, test, nVec, search_func, gui_en = 0)
% Test VQ using test data -----------------------
% Each test must output:
% errors: - error for best vector
% idx...: - index of best vector
% test_.: - best approximation of target test database
[nRows nCols] = size(test);
[idx contrib errors test_ g mg sl] = feval(search_func, vq, test);
% sd over time
sd_per_frame = zeros(nRows,1);
for i=1:nRows
sd_per_frame(i) = mean(abs(test(i,:) - test_(i,:)));
end
printf("average sd: %3.2f\n", mean(sd_per_frame));
%printf("%18s nVec: %d SD: %3.2f dB\n", search_func, nVec, mean(sd_per_frame));
% Optional GUI with U to anlayse VQ perf ---------------------------------------------------
if gui_en
gui_mode = "sort"; % "sort" or "fbf"
sort_order = "worst"; % "worst" or "best"
f = 1;
% plots sd over time and histogram
figure(1); clf; subplot(211); plot(sd_per_frame); title('SD');
subplot(212); [yy xx] = hist(sd_per_frame); plot(xx,yy,'+-');
end
while gui_en
% display m frames, printing some stats, plotting vector to give visual idea of match
figure(2); clf;
% try this in 'ascend' or 'descend' mode to best or worst frames
if strcmp(gui_mode, "sort")
if strcmp(sort_order, "best")
[errors_dec frame_dec] = sort(errors, "ascend");
end
if strcmp(sort_order, "worst")
[errors_dec frame_dec] = sort(errors, "descend");
end
m = 4;
else
m = 1;
errors_dec = errors(f); frame_dec(1) = f;
end
for i=1:m
% build up VQ legend
af = frame_dec(i); aind = idx(af);
ag = 0; amg = 1; asl = 0;
if strcmp(search_func, "vq_search_gain")
ag = g(af,aind);
l3 = sprintf("g: %3.2f", ag);
end
if strcmp(search_func, "vq_search_mag")
ag = g(af,aind);
amg = mg(af,aind);
l3 = sprintf("g: %3.2f mg: %3.2f", ag, amg);
end
if strcmp(search_func, "vq_search_slope")
ag = g(af,aind);
amg = mg(af,aind);
asl = sl(af,aind);
l3 = sprintf("g: %3.2f mg: %3.2f sl: %3.2f", ag, amg, asl);
end
% plot target
subplot(sqrt(m), sqrt(m),i);
l1 = sprintf("b-;fr %d sd: %3.2f;", af, errors_dec(m));
plot(test(af,:), l1);
% plot vq vector and modified version
hold on;
l2 = sprintf("g-+;ind %d;", aind);
plot(vq(aind, :), l2);
l3 = sprintf("g-o;%s;",l3);
plot(amg*vq(aind, :) + ag + asl*(1:nCols), l3);
hold off;
axis([1 nCols -10 40]);
end
% interactive menu ------------------------------------------
if strcmp(gui_mode, "sort")
printf("\rmenu: m-mode[%s] o-order[%s] q-quit", gui_mode, sort_order);
end
if strcmp(gui_mode, "fbf")
printf("\rmenu: m-mode[%s] frame: %d n-next b-back q-quit", gui_mode, f);
end
fflush(stdout);
k = kbhit();
if k == 'm'
if strcmp(gui_mode, "sort")
gui_mode = "fbf"; f = frame_dec(1);
else
gui_mode = "sort";
end;
end
if k == 'o'
if strcmp(sort_order, "worst")
sort_order = "best";
else
sort_order = "worst";
end;
end
if k == 'n', f = f + 1; endif;
if k == 'b', f = f - 1; endif;
if k == 'q', gui_en = 0; printf("\n"); endif;
end
endfunction
function search_func = get_search_func(short_name)
if strcmp(short_name, "mse")
search_func = 'vq_search_mse';
end
if strcmp(short_name, "gain")
search_func = 'vq_search_gain';
end
if strcmp(short_name, "mag")
search_func = 'vq_search_mag';
end
if strcmp(short_name, "slope")
search_func = 'vq_search_slope';
end
end
%----------------------------------------------------------------------
% Train up a VQ and run one or mode tests
function [sd des] = train_vq_and_run_tests(sim_in)
Nvec = sim_in.Nvec;
train_func = get_search_func(sim_in.train_func_short);
printf(" Nvec %d kmeans start\n", Nvec);
[idx vq] = feval(sim_in.kmeans,
sim_in.trainvec, Nvec,
"start", "sample",
"emptyaction", "singleton",
"search_func", train_func);
printf(" Nvec %d kmeans end\n", Nvec);
sd = []; des = []; tests = sim_in.tests;
for t=1:length(cellstr(tests))
test_func_short = char(cellstr(tests)(t));
test_func = get_search_func(test_func_short);
asd = run_test(vq, sim_in.testvec, Nvec, test_func);
sd = [sd asd];
ades.Nvec = Nvec;
ades.train_func_short = sim_in.train_func_short;
ades.test_func_short = test_func_short;
ades_str = sprintf("Nvec: %3d train: %-5s test: %-5s",
ades.Nvec, ades.train_func_short, ades.test_func_short);
des = [des ades];
printf(" %s SD: %3.2f\n", ades_str, mean(asd));
end
endfunction
% Search for test that matchs train/test and any Nvec and constructs points for error bar plot
function [y x leg] = search_tests(sd, desc, train, test)
nTests = length(desc)
x = []; y = []; leg = [];
for i=1:nTests
if strcmp(desc(i).train_func_short, train) && strcmp(desc(i).test_func_short, test)
printf("i: %2d Nvec: %3d train: %5s test: %5s\n", i, desc(i).Nvec, desc(i).train_func_short, desc(i).test_func_short);
x = [x; log2(desc(i).Nvec)];
y = [y; mean(sd(:,i)) std(sd(:,i))];
end
end
endfunction
function plot_sd_results(fg, title_str, sd, desc, train_list, test_list)
figure(fg); clf;
nlines = 0; inc = -0.1;
for i=1:length(cellstr(train_list))
train_func_short = char(cellstr(train_list)(i));
for j=1:length(cellstr(test_list))
test_func_short = char(cellstr(test_list)(j));
[y x] = search_tests(sd, desc, train_func_short, test_func_short);
leg = sprintf("o-%d;train: %5s test: %5s;", nlines, train_func_short, test_func_short);
if nlines, hold on; end;
x += inc; inc += 0.1; % separate x coords a bit to make errors bars legible
errorbar(x, y(:,1), y(:,2), leg);
nlines++;
end
if nlines>1, hold off; end;
end
xlabel('VQ size (bits)')
ylabel('mean SD (dB)');
title(title_str);
endfunction
% Search for test that matches all fields for histogram plots
function testNum = search_tests_Nvec(sd, desc, train, test, Nvec)
nTests = length(desc);
testNum = 0;
for i=1:nTests
if strcmp(desc(i).train_func_short, train) && strcmp(desc(i).test_func_short, test) && desc(i).Nvec == Nvec
printf("i: %2d Nvec: %3d train: %5s test: %5s\n", i, desc(i).Nvec, desc(i).train_func_short, desc(i).test_func_short);
testNum = i;
end
end
endfunction
%----------------------------------------------------------------------
%
% Plot histograms of SDs for comparison. Each col of sd_per_frame
% has the results of one test, number of cols % is number of tests. leg
% is a col vector with one legend string for each test.
function compare_hist(fg, atitle, sd, desc)
figure(fg); clf;
[nRows nCols] = size(sd);
for c=1:nCols
[yy, xx] = hist(sd(:, c));
if c == 2, hold on; end;
leg = sprintf("o-%d;train: %5s test: %5s;", c-1, desc(c).train_func_short, desc(c).test_func_short);
plot(xx, yy, leg);
end
if nCols > 1, hold off; end;
title(atitle)
end
%----------------------------------------------------------------------
% Run a bunch of long tests in parallel and plot results
%
% This test tries a bunch of training and test search combinations on
% the first 1kHz to see if there is any advantage in using the higher
% order search techniques in the VQ training. So far it appears
% not.... However the higher order search routines do give great
% results compared to a basic MSE (or oder) search.
function [sd desc] = long_tests_1_10(quick_check=0)
num_cores = 4;
K = 10;
load surf_train_120_hpf150; load surf_all_hpf150;
if quick_check
NtrainVec = 1000;
NtestVec = 100;
else
NtrainVec = length(surf_train_120_hpf150);
NtestVec = length(surf_all_hpf150);
end
trainvec = surf_train_120_hpf150(1:NtrainVec,1:K);
testvec = surf_all_hpf150(1:NtestVec,1:K);
% build up a big array of tests to run ------------------------
sim_in.trainvec = trainvec; sim_in.testvec = testvec;
sim_in.kmeans = "kmeans2"; % slow but supports different search functions
sim_in.Nvec = 64;
sim_in.train_func_short = "mse";
sim_in.tests = ["mse"; "gain"; "mag"; "slope"];
% Test1: mse training, 64, 128, 256
sim_in_vec(1:3) = sim_in;
sim_in_vec(2).Nvec = 128; sim_in_vec(3).Nvec = 256;
test_list = sim_in_vec;
% Test2: gain training, 64, 128, 256
for i=1:3, sim_in_vec(i).train_func_short = "gain"; end;
test_list = [test_list sim_in_vec];
% Test3: mag training, 64, 128, 256
for i=1:3, sim_in_vec(i).train_func_short = "mag"; end;
test_list = [test_list sim_in_vec];
% Test4: slope training, 64, 128, 256
for i=1:3, sim_in_vec(i).train_func_short = "slope"; end;
test_list = [test_list sim_in_vec];
% run test list in parallel
[sd desc] = pararrayfun(num_cores, @train_vq_and_run_tests, test_list);
%train_vq_and_run_tests(test_list(1));
% Plot results -----------------------------------------------
fg = 1;
plot_sd_results(fg++, "MSE Training", sd, desc, "mse", ["mse"; "gain"; "mag"; "slope"]);
plot_sd_results(fg++, "Gain Training", sd, desc, "gain", ["mse"; "gain"; "mag"; "slope"]);
plot_sd_results(fg++, "Mag Training", sd, desc, "mag", ["mse"; "gain"; "mag"; "slope"]);
plot_sd_results(fg++, "Slope Training", sd, desc, "slope", ["mse"; "gain"; "mag"; "slope"]);
plot_sd_results(fg++, "Slope Searching", sd, desc, ["mse"; "gain"; "mag"; "slope"], "slope");
% histogram of results from Nvec=64, all training methods, slope search
testnum = search_tests_Nvec(sd, desc, "mse", "slope", 64);
hist_sd = sd(:,testnum); hist_desc = desc(testnum);
testnum = search_tests_Nvec(sd, desc, "gain", "slope", 64);
hist_sd = [hist_sd sd(:,testnum)]; hist_desc = [hist_desc desc(testnum)];
testnum = search_tests_Nvec(sd, desc, "mag", "slope", 64);
hist_sd = [hist_sd sd(:,testnum)]; hist_desc = [hist_desc desc(testnum)];
testnum = search_tests_Nvec(sd, desc, "slope", "slope", 64);
hist_sd = [hist_sd sd(:,testnum)]; hist_desc = [hist_desc desc(testnum)];
compare_hist(fg, "Histogram of SDs Nvec=64", hist_sd, hist_desc);
endfunction
%----------------------------------------------------------------------
% This test tries some test combinations on the 1000 to 3000 Hz range.
%
% Based on our results with long_test_1_10() we just use MSE training
function [sd desc] = long_tests_11_40(quick_check=0)
num_cores = 4;
K = 20; st = 11; en = 40;
load surf_train_120_hpf150; load surf_all_hpf150;
if quick_check
NtrainVec = 1000;
NtestVec = 100;
else
NtrainVec = length(surf_train_120_hpf150);
NtestVec = length(surf_all_hpf150);
end
trainvec = surf_train_120_hpf150(1:NtrainVec,st:en);
testvec = surf_all_hpf150(1:NtestVec,st:en);
% build up an array of tests to run ------------------------
sim_in.kmeans = "kmeans";
sim_in.trainvec = trainvec; sim_in.testvec = testvec;
sim_in.Nvec = 64;
sim_in.train_func_short = "mse";
sim_in.tests = ["slope"];
% Test1: mse training, 64, 128, 256, 512
sim_in_vec(1:4) = sim_in;
sim_in_vec(2).Nvec = 128; sim_in_vec(3).Nvec = 256; sim_in_vec(4).Nvec = 512;
test_list = sim_in_vec;
% run test list in parallel
[sd desc] = pararrayfun(num_cores, @train_vq_and_run_tests, test_list);
%[sd desc] = train_vq_and_run_tests(sim_in_vec(4));
% Plot results -----------------------------------------------
fg = 2;
plot_sd_results(fg++, "MSE Training 10..30", sd, desc, "mse", "slope");
#{
% histogram of results from Nvec=64, all training methods, slope search
testnum = search_tests_Nvec(sd, desc, "mse", "slope", 64);
hist_sd = sd(:,testnum); hist_desc = desc(testnum);
testnum = search_tests_Nvec(sd, desc, "gain", "slope", 64);
hist_sd = [hist_sd sd(:,testnum)]; hist_desc = [hist_desc desc(testnum)];
testnum = search_tests_Nvec(sd, desc, "mag", "slope", 64);
hist_sd = [hist_sd sd(:,testnum)]; hist_desc = [hist_desc desc(testnum)];
testnum = search_tests_Nvec(sd, desc, "slope", "slope", 64);
hist_sd = [hist_sd sd(:,testnum)]; hist_desc = [hist_desc desc(testnum)];
compare_hist(fg, "Histogram of SDs Nvec=64", hist_sd, hist_desc);
#}
endfunction
function vq = detailed_test(Nvec=64, st=1, en=10, quick_check=1, test_func = "vq_search_slope")
K = en-st+1;
load surf_train_120_hpf150;
load surf_all_hpf150;
if quick_check
NtrainVec = 1000;
NtestVec = 100;
else
NtrainVec = length(surf_train_120_hpf150);
NtestVec = length(surf_all_hpf150);
NtestVec = 100;
end
trainvec = surf_train_120_hpf150(1:NtrainVec,st:en);
testvec = surf_all_hpf150(1:NtestVec,st:en);
[idx vq] = kmeans(trainvec, Nvec,
"start", "sample",
"emptyaction", "singleton");
run_test(vq, testvec, Nvec, test_func, gui_en=1);
endfunction
% ------------------------------------------------------------------
%
% Following test_* functions have some contrived examples to test VQ
% training with each training method. However the longtest_1_10
% results suggest these training methds don't provide any improvement
% over standard MSE kmeans.
function test_training_mse
K = 3; NtrainVec = 10; Nvec = 2;
trainvec = ones(NtrainVec,K);
trainvec(2:2:NtrainVec,:) = -1;
[idx vq] = kmeans2(trainvec, Nvec,
"start", "first",
"emptyaction", "singleton",
"search_func", "vq_search_mse");
ok = find(vq == [1 1 1]) && (find(vq == [-1 -1 -1]));
printf("ok: %d\n", ok);
endfunction
function test_training_gain
K = 3; NtrainVec = 10; Nvec = 2;
% Vectors that are the same, but offset from each other via a linear
% term. Training algorithm should map these all to the "same"
% vector.
trainvec = ones(NtrainVec,K);
for v=2:NtrainVec/2
trainvec(v,:) += v*ones(1,1:K);
end
% Second set of "identical" vectors except for gain offset
for v=NtrainVec/2+1:NtrainVec
trainvec(v,:) = [1 0 -1] + (v-NtrainVec/2-1)*ones(1,1:K);
end
[idx vq] = kmeans2(trainvec, Nvec,
"start", "sample",
"emptyaction", "singleton",
"search_func", "vq_search_gain");
% check we get a vq table of two vectors that are linear offset [1 1 1] and [1 0 -1]
tol = 0.001; ok = 0;
for i=1:Nvec
diff = vq(i,:) - [1 1 1];
if std(diff) < tol, ok++; end;
diff = vq(i,:) - [1 0 -1];
if std(diff) < tol, ok++; end;
end
if ok == 2, printf("gain: OK\n"); end;
endfunction
function test_training_mag
K = 3; NtrainVec = 10; Nvec = 2;
% Given a vector x, create a set of training data y = m*x + c, with x
% modified by a magnitude and linear term. Each vector has a
% different mag and linear term.
trainvec = zeros(NtrainVec,K);
for v=1:2:NtrainVec
trainvec(v,:) = v*[1 2 3] + 2*v;
end
% another set of "identical" vectors, mapped by different magnitude and linear terms,
% alternated with the frist set so we can use the "start:first" to populate the VQ
for v=2:2:NtrainVec
trainvec(v,:) = cos(v)*[2 -1 2] -2*v;
end
trainvec
[idx vq] = kmeans2(trainvec, Nvec,
"start", "first",
"search_func", "vq_search_mag");
vq
% todo: how to auto test? Need to solve same euqations?
#}
tol = 0.001; ok = 0;
for i=1:Nvec
diff = vq(i,:) - [1 1 1];
if std(diff) < tol, ok++; end;
diff = vq(i,:) - [1 0 -1];
if std(diff) < tol, ok++; end;
end
if ok == 2, printf("gain: OK\n"); end;
#}
endfunction
function test_training_slope
K = 40; NtrainVec = 5; Nvec = 1;
% Given a vector x, create a set of training data y = m*x + c + sl, where:
% x is a vector
% c is a constant offset
% m is a magnitude multiplier on all elements of x
% sl is a linear slope vector
load surf_all;
prototype = surf_all(73,:);
trainvec = zeros(NtrainVec,K);
for v=1:NtrainVec
trainvec(v,:) = 2*prototype + 1 + v*(1:K);
end
figure(1); clf; plot(trainvec');
[idx vq] = kmeans2(trainvec, Nvec,
"start", "first",
"search_func", "vq_search_slope");
[idx contrib errors test_ g mg sl] = vq_search_slope(prototype, trainvec)
% todo: how to auto test? Need to solve same euqations?
#{
tol = 0.001; ok = 0;
for i=1:Nvec
diff = vq(i,:) - [1 1 1];
if std(diff) < tol, ok++; end;
diff = vq(i,:) - [1 0 -1];
if std(diff) < tol, ok++; end;
end
if ok == 2, printf("gain: OK\n"); end;
#}
endfunction
% ---------------------------------------------------------
format; more off;
rand('seed',1); % kmeans using rand for initial population,
% we want same results on every run
% choose one of these to run
detailed_test(64, 1, 20, quick_check=1);
%[sd desc] = long_tests_1_10(quick_check=1);
%[sd desc] = long_tests_11_40(quick_check=0);
%test_training_slope