-
Notifications
You must be signed in to change notification settings - Fork 47
Expand file tree
/
Copy pathconsistentILRMA.m
More file actions
255 lines (237 loc) · 11.3 KB
/
consistentILRMA.m
File metadata and controls
255 lines (237 loc) · 11.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
function [estSig, cost] = consistentILRMA(mixSig, nSrc, sampFreq, nBases, fftSize, shiftSize, windowType, nIter, refMic, applyWhitening, drawConv)
% Blind source separation with independent low-rank matrix analysis (ILRMA)
% using spectrogram consistency
%
% Coded by D. Kitamura (d-kitamura@ieee.org)
%
% Copyright 2020 Daichi Kitamura
%
% These programs are distributed only for academic research at
% universities and research institutions.
% It is not allowed to use or modify these programs for commercial or
% industrial purpose without our permission.
% When you use or modify these programs and write research articles,
% cite the following references:
%
% # Original paper
% D. Kitamura and K. Yatabe, "Consistent independent low-rank matrix
% analysis for determined blind source separation," EURASIP J. Adv. Signal
% Process., vol. 2020, no. 46, p. 35, November 2020.
%
% See also:
% http://d-kitamura.net
% http://d-kitamura.net/demo-ILRMA_en.html
%
% [syntax]
%   [estSig,cost] = consistentILRMA(mixSig,nSrc,sampFreq,nBases,fftSize,shiftSize,windowType,nIter,refMic,applyWhitening,drawConv)
%
% [inputs]
%         mixSig: observed mixture (sigLen x nCh)
%           nSrc: number of sources in the mixture (scalar, 2 <= nSrc <= nCh)
%       sampFreq: sampling frequency [Hz] of mixSig (scalar)
%         nBases: number of bases for each source in NMF model (scalar, default: 4)
%        fftSize: window length [points] in STFT (scalar, default: next higher power of 2 that exceeds 0.256*sampFreq)
%      shiftSize: shift length [points] in STFT (scalar, default: fftSize/2)
%     windowType: window function used in STFT (name of window function, default: 'hamming')
%          nIter: number of iterations in the parameter update in ILRMA (scalar, default: 100)
%         refMic: reference microphone for applying back projection (default: 1)
% applyWhitening: apply whitening to the observed multichannel spectrograms or not (true or false, default: false)
%       drawConv: plot cost function values in each iteration or not (true or false, default: false)
%
% [outputs]
%         estSig: estimated signals obtained by inverse STFT of the
%                 scale-fixed separated spectrograms (one spectrogram per
%                 source; presumably sigLen x nSrc -- confirm against ISTFT)
%           cost: convergence behavior of cost function in ILRMA (nIter+1 x 1);
%                 the values are computed only when drawConv is true,
%                 otherwise this vector stays all zeros
%

% Arguments check and set default values
arguments
    mixSig (:,:) double
    nSrc (1,1) double {mustBeInteger(nSrc)}
    sampFreq (1,1) double
    nBases (1,1) double {mustBeInteger(nBases)} = 4
    fftSize (1,1) double {mustBeInteger(fftSize)} = 2^nextpow2(0.256*sampFreq)
    shiftSize (1,1) double {mustBeInteger(shiftSize)} = fftSize/2
    windowType char {mustBeMember(windowType,{'hamming','hann','rectangular','blackman','sine'})} = 'hamming'
    nIter (1,1) double {mustBeInteger(nIter)} = 100
    refMic (1,1) double {mustBeInteger(refMic)} = 1
    applyWhitening (1,1) logical = false
    drawConv (1,1) logical = false
end

% Error check
% NOTE: error() called with a single argument does not convert escape
% sequences, so the messages deliberately carry no trailing "\n".
[sigLen, nCh] = size(mixSig); % sigLen: signal length, nCh: number of channels
if sigLen < nCh; error("The size of mixSig might be wrong."); end
if nSrc < 2; error("The number of sources (nSrc) must be an integer equal to or greater than 2."); end
if nCh < nSrc; error("The number of channels must be equal to or greater than the number of sources in the mixture."); end
if sampFreq <= 0; error("The sampling frequency (sampFreq) must be a positive value."); end
if nBases < 1; error("The number of bases (nBases) must be a positive integer value."); end
if fftSize < 1; error("The FFT length in STFT (fftSize) must be a positive integer value."); end
if shiftSize < 1; error("The shift length in STFT (shiftSize) must be a positive integer value."); end
if nIter < 1; error("The number of iterations (nIter) must be a positive integer value."); end
if refMic < 1 || refMic > nCh; error("The reference microphone must be an integer between 1 and nCh."); end

% Apply multichannel short-time Fourier transform (STFT)
[mixSpecgram, windowInStft] = STFT(mixSig, fftSize, shiftSize, windowType);

% Apply whitening (decorrelate X so that the correlation matrix becomes an identity matrix) based on principal component analysis
if applyWhitening
    inputMixSpecgram = local_whitening(mixSpecgram, nSrc); % apply whitening, where dimension is reduced from nCh to nSrc when nSrc < nCh
else
    inputMixSpecgram = mixSpecgram(:,:,1:nSrc); % when nSrc < nCh, only mixSpecgram(:,:,1:nSrc) is input to ILRMA so that the number of microphones equals to the number of sources (determined condition)
end

% Apply ILRMA (the un-whitened reference channel is passed for the per-iteration back projection)
[estSpecgram, cost] = local_consistentILRMA(inputMixSpecgram, nIter, nBases, fftSize, shiftSize, windowInStft, sigLen, drawConv, mixSpecgram(:,:,refMic));

% Apply back projection (fix the scale ambiguity using the reference microphone channel)
scaleFixedSepSpecgram = local_backProjectionInit(estSpecgram, mixSpecgram(:,:,refMic)); % scale-fixed estimated signal

% Inverse STFT for each source
estSig = ISTFT(scaleFixedSepSpecgram, shiftSize, windowInStft, sigLen);
end
%% Local function for consistent ILRMA
function [Y, cost] = local_consistentILRMA(X, nIter, L, fftSize, shiftSize, analyWindow, sigLen, drawConv, refMixSpecgram)
% Iteratively estimate the demixing matrices W and the NMF source models
% (T, V), applying back projection and an ISTFT->STFT round trip
% (spectrogram consistency) after every iteration.
%
% [inputs]
%              X: observed multichannel spectrograms (I x J x M)
%          nIter: number of iterations of the parameter updates
%              L: number of bases in NMF model for each source
%        fftSize: window length [points] in STFT (scalar)
%      shiftSize: shift length [points] in STFT (scalar)
%    analyWindow: window function used in STFT (fftSize x 1)
%         sigLen: length of original signal (scalar)
%       drawConv: plot cost function values in each iteration or not (true or false)
% refMixSpecgram: observed reference spectrograms before apply whitening (I x J)
%
% [outputs]
%              Y: estimated spectrograms of sources (I x J x N)
%           cost: convergence behavior of cost function in ILRMA (nIter+1 x 1);
%                 evaluated only when drawConv is true, otherwise all zeros
%
% [scalars]
%              I: number of frequency bins,
%              J: number of time frames
%              M: number of channels (microphones)
%              N: number of sources (equals to M)
%              L: number of bases in NMF model for each source
%
% [matrices]
%              X: observed multichannel spectrograms (I x J x M)
%             pX: permuted observed multichannel spectrograms (M x J x I)
%              W: frequency-wise demixing matrix (N x M x I)
%              Y: estimated multisource spectrograms (I x J x N)
%              P: estimated multisource power spectrograms (I x J x N)
%              T: sourcewise basis matrix in NMF (I x L x N)
%              V: sourcewise activation matrix in NMF (L x J x N)
%              R: sourcewise low-rank model spectrogram constructed by T and V (I x J x N)
%              E: identity matrix (N x N)
%              U: model-spectrogram-weighted sample covariance matrix of the mixture (M x M)
%

% Initialization
[I,J,M] = size(X); % I:frequency bins, J: time frames, M: channels
pX = permute(X, [3,2,1]); % permuted X whose dimensions are M x J x I
N = M; % N: number of sources, which equals to M in ILRMA
W = zeros(N,M,I); % frequency-wise demixing matrix
Y = zeros(I,J,N); % estimated spectrograms of sources (Y(i,:,n) = W(n,:,i)*pX(:,:,i))
for i = 1:I
    W(:,:,i) = eye(N); % initial demixing matrices are set to identity matrices
    Y(i,:,:) = (W(:,:,i)*pX(:,:,i)).'; % initial estimated spectrograms
end
P = max(abs(Y).^2, eps); % power spectrogram of Y
T = max(rand( I, L, N ), eps); % sourcewise basis matrix in NMF
V = max(rand( L, J, N ), eps); % sourcewise activation matrix in NMF
R = zeros(I,J,N); % sourcewise low-rank model spectrogram constructed by T and V (R(:,:,n) = T(:,:,n)*V(:,:,n))
for n = 1:N
    R(:,:,n) = T(:,:,n)*V(:,:,n); % initial source model defined by T and V
end
E = eye(N); % identity matrix for e_n
cost = zeros(nIter+1, 1);

% Calculate initial cost function value
if drawConv
    cost(1,1) = local_calcCostFunction( P, R, W, I, J );
end

% Optimize parameters in ILRMA (W, T, and V)
% The label is followed by four placeholder characters so that the four
% backspaces printed at every iteration erase only the previous counter
% and never the label itself.
fprintf('Iteration:     ');
for iIter = 1:nIter
    fprintf('\b\b\b\b%4d', iIter);

    %%%%% Update parameters %%%%%
    for n = 1:N
        %%%%% Update rule of T %%%%%
        T(:,:,n) = T(:,:,n) .* sqrt((P(:,:,n)./(R(:,:,n).^2))*V(:,:,n).' ./ ( (1./R(:,:,n))*V(:,:,n).' ));
        T(:,:,n) = max(T(:,:,n), eps); % keep the bases strictly positive
        R(:,:,n) = T(:,:,n)*V(:,:,n);
        %%%%% Update rule of V %%%%%
        V(:,:,n) = V(:,:,n) .* sqrt(T(:,:,n).'*(P(:,:,n)./(R(:,:,n).^2)) ./ ( T(:,:,n).'*(1./R(:,:,n)) ));
        V(:,:,n) = max(V(:,:,n), eps); % keep the activations strictly positive
        R(:,:,n) = T(:,:,n)*V(:,:,n);
        %%%%% Update rule of W %%%%%
        for i = 1:I
            U = (1/J)*(pX(:,:,i).*(1./R(i,:,n)))*pX(:,:,i)'; % U: M x M matrix (use implicit expansion)
            w = (W(:,:,i)*U)\E(:,n); % w: M x 1 vector
            w = w/sqrt(w'*U*w); % w: M x 1 vector (normalized so that w'*U*w = 1)
            W(n,:,i) = w'; % w': 1 x M vector
        end
    end
    for i = 1:I
        Y(i,:,:) = (W(:,:,i)*pX(:,:,i)).'; % temporal estimated spectrograms of sources
    end

    %%%%% Back projection %%%%%
    % The same scale lambda is applied to W and Y, and its squared
    % magnitude to R and T, so that the model stays in step with Y.
    lambda = local_backProjection(Y, refMixSpecgram, I, N); % N x 1 x I
    W = W.*lambda; % N x M x I (use implicit expansion)
    Y = Y.*permute(lambda, [3,2,1]); % I x J x N (use implicit expansion)
    lambdaPow = permute(abs(lambda).^2, [3,2,1]); % I x 1 x N
    R = R.*lambdaPow; % I x J x N (use implicit expansion)
    T = T.*lambdaPow; % I x L x N (use implicit expansion)

    %%%%% Ensure spectrogram consistency %%%%%
    Y = STFT(ISTFT(Y, shiftSize, analyWindow, sigLen), fftSize, shiftSize, analyWindow); % inverse STFT and STFT
    P = max(abs(Y).^2,eps); % recompute power spectrogram

    %%%%% Calculate cost function value %%%%%
    if drawConv
        cost(iIter+1,1) = local_calcCostFunction( P, R, W, I, J );
    end
end

% Draw convergence behavior
if drawConv
    figure; plot((0:nIter), cost);
    set(gca, 'FontName', 'Times', 'FontSize', 16);
    xlabel('Number of iterations', 'FontName', 'Arial', 'FontSize', 16);
    ylabel('Value of cost function', 'FontName', 'Arial', 'FontSize', 16);
end
fprintf(' Consistent ILRMA done.\n');
end
%% Local function for calculating cost function value in ILRMA
function cost = local_calcCostFunction(P, R, W, I, J)
% Evaluate the ILRMA cost:
%   sum over all elements of (P./R + log(R)) - 2*J*sum_i log|det W(:,:,i)|
%
% [inputs]
%   P: power spectrograms of the estimated sources
%   R: low-rank model spectrograms (same size as P)
%   W: demixing matrices stacked along the third dimension
%   I: number of matrices in W that are used (frequency bins)
%   J: multiplier of the log-determinant term (number of time frames)
%
% [outputs]
%   cost: scalar cost function value
%

% Per-bin log|det W|, with |det| floored at eps to avoid log(0).
% Filling from the last bin backwards allocates the I x 1 vector at once.
logAbsDet = zeros(I, 1);
for bin = I:-1:1
    absDet = abs(det(W(:, :, bin)));
    logAbsDet(bin, 1) = log(max(absDet, eps));
end

% Data-fit term and demixing-matrix term of the cost
fitTerm = sum(P./R+log(R), "all");
detTerm = 2*J*sum(logAbsDet, 1);
cost = fitTerm - detTerm;
end
%% Local function for applying initial back projection
function Z = local_backProjectionInit(Y, X)
% Fix the frequency-wise scale of the estimated spectrograms Y by a
% least-squares projection onto the observed spectrogram(s) X.
%
% [inputs]
%   Y: estimated spectrograms (I x J x M; frequency bin x time frame x source)
%   X: observed spectrogram(s), either a single reference channel (I x J)
%      or as many channels as Y has sources (I x J x M)
%
% [outputs]
%   Z: scale-fixed spectrograms
%      - I x J x M     when X has a single channel
%      - I x J x M x M when X has M channels
%        (frequency bin x time frame x source x channel)
%
% An error is raised when the number of channels in X is neither 1 nor M.
%
[I, J, M] = size(Y); % frequency bin x time frame x source
nRef = size(X, 3);   % number of reference channels in X
if nRef ~= 1 && nRef ~= M
    % error() with one argument does not convert escape sequences, so the
    % message carries no trailing "\n".
    error("The number of channels in X must be 1 or equal to that in Y for back projection.");
end

% Frequency-wise projection coefficients: A(m, n, i) scales source n at
% frequency bin i to match reference channel m.
A = zeros(nRef, M, I);
for i = 1:I
    % reshape (rather than squeeze) keeps the M x J / nRef x J orientation
    % even when M == 1 or J == 1
    Yi = reshape(Y(i, :, :), J, M).';    % sources x frames (M x J)
    Xi = reshape(X(i, :, :), J, nRef).'; % references x frames (nRef x J)
    A(:, :, i) = (Xi*Yi')/(Yi*Yi');
end
A(isnan(A) | isinf(A)) = 0; % replace NaN and Inf to 0

% Apply the coefficients (use implicit expansion)
if nRef == 1
    Z = Y .* permute(A, [3, 1, 2]);    % (I x J x M) .* (I x 1 x M)
else
    Z = Y .* permute(A, [3, 4, 2, 1]); % (I x J x M) .* (I x 1 x M x M)
end
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% EOF %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%