trainNetwork 4-D dimension error

yaspy caroline on 24 Oct 2019
Answered: jibrahim on 25 Oct 2019
My project is source separation using masking. Below is my code, which fails at trainNetwork with the error "Invalid validation data. X must be a 4-D array of images.", even after converting the data into a 4-D array. Could someone help me?
clc;clear all;close all;
%Training data
[y,fs] =audioread('13069 tri.wav');
[z,fs] =audioread('7040 sine.wav');
ny=length(y);
nz=length(z);
N=min([ny nz]);
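% Trim both signals to the common length N so they can be mixed sample-by-sample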
s1=y(1:N);
s2=z(1:N);
left = s1(:,1);
right = s1(:,1);
time = (1/fs)*length(s1);
t = linspace(0, time, length(left));
plot(t,left, t, right);
xlabel('time(sec)');
ylabel('signal strength');
% sound(left, fs);
% sound(right, fs);
% sound(d, fs);
% l= left(10:15);
% r=right(10:15);
% source1= s1(10:15,:);
left = s2(:,1);
right = s2(:,1);
time = (1/fs)*length(s2);
t = linspace(0, time, length(left));
plot(t,left, t, right);
xlabel('time(sec)');
ylabel('signal strength');
% sound(left, fs);
% sound(right, fs);
% sound(d, fs);
% l= left(10:15);
% r=right(10:15);
% source2= s2(11:15,:);
S1=reshape(s1,[],4);
S2=reshape(s2,[],4);
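% Reshape each training source into an (N/4)-by-4 matrix (assumes N is divisible by 4)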
%Testing data
[y1,fs] =audioread('13069 tri.wav');
[z1,fs] =audioread('7040 sine.wav');
ny1=length(y1);
nz1=length(z1);
N=min([ny1 nz1]);
ss1=y1(1:N);
ss2=z1(1:N);
left = ss1(:,1);
right = ss1(:,1);
time = (1/fs)*length(ss1);
t = linspace(0, time, length(left));
plot(t,left, t, right);
xlabel('time(sec)');
ylabel('signal strength');
% sound(left, fs);
% sound(right, fs);
% sound(d, fs);
% l= left(10:15);
% r=right(10:15);
%ssource1= ss1(10:15,:);
left = ss2(:,1);
right = ss2(:,1);
time = (1/fs)*length(ss2);
t = linspace(0, time, length(left));
plot(t,left, t, right);
xlabel('time(sec)');
ylabel('signal strength');
% sound(left, fs);
% sound(right, fs);
% sound(d, fs);
% l= left(10:15);
% r=right(10:15);
%ssource2= ss2(11:15,:);
S11=reshape(ss1,[],4);
S22=reshape(ss2,[],4);
%Mix
mixTrain = S1 + S2;
mixTrain = mixTrain / max(mixTrain);
mixValidate = S11 + S22;
mixValidate = mixValidate / max(mixValidate);
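% Note: "/" here is matrix right division (mrdivide); elementwise scaling by the maximum would be "./"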
%cwt train
P_mix0 = cwt(mixTrain);
P_M = abs(cwt(s1));
P_F = abs(cwt(s2));
%test cwt
P_Val_mix0 = cwt(mixValidate);
P_Val_M = abs(cwt(ss1));
P_Val_F = abs(cwt(ss2));
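% Training soft mask: source-1 CWT magnitude relative to the sum of both source magnitudes (eps avoids division by zero)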
maskTrain = P_M ./ (P_M + P_F + eps);
%Compute the validation soft mask. Use this mask to evaluate the mask emitted by the trained network.
maskValidate = P_Val_M ./ (P_Val_M + P_Val_F + eps);
X=reshape(P_mix0,[],4);
Y=reshape(maskTrain,[],4);
P_Val_mix=reshape(P_Val_mix0,[],4);
maskValidate1=reshape(maskValidate,[],4);
[X] = digitTrain4DArrayData;
[Y] = digitTrain4DArrayData;
P_Val_mix=digitTrain4DArrayData;
maskValidate1=digitTrain4DArrayData;
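% Note: digitTrain4DArrayData returns the 28-by-28-by-1-by-5000 example digit images,
% so these four calls replace the audio-derived arrays above with that image data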
idx = randperm(size(X,4),1000);
X1= X(:,:,:,idx);
X(:,:,:,idx) = [];
idx1 = randperm(size(Y,4),100);
Y1= Y(:,:,:,idx1);
Y(:,:,:,idx1) = [];
idx2 = randperm(size(P_Val_mix,4),1000);
P_Val_mix1= P_Val_mix(:,:,:,idx2);
P_Val_mix(:,:,:,idx2) = [];
idx3 = randperm(size(maskValidate1,4),100);
maskValidate11= maskValidate1(:,:,:,idx3);
maskValidate1(:,:,:,idx3) = [];
newY=zeros(size(X));
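% All-zero regression targets with the same size as X; these are passed to trainNetwork as the responses below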
% layers =[
% imageInputLayer([28 28 1])
% convolution2dLayer([3,1],1,'Padding','same')
% reluLayer
% convolution2dLayer([3,1],1,'Padding','same')
% reluLayer
% convolution2dLayer([3,1],1,'Padding','same')
% reluLayer
% convolution2dLayer([3,1],1,'Padding','same')
% reluLayer
% regressionLayer
% ];
layers = [ ...
imageInputLayer([28 28 1],"Normalization","none")
convolution2dLayer([3,1],1,'Padding','same')
reluLayer
convolution2dLayer([3,1],1,'Padding','same')
reluLayer
convolution2dLayer([3,1],1,'Padding','same')
reluLayer
convolution2dLayer([3,1],1,'Padding','same')
reluLayer
regressionLayer
];
maxEpochs = 3;
miniBatchSize = 28;
options = trainingOptions("adam", ...
"MaxEpochs",maxEpochs, ...
"MiniBatchSize",miniBatchSize, ...
"SequenceLength","longest", ...
"Shuffle","every-epoch",...
"Verbose",0, ...
"Plots","training-progress",...
"ValidationFrequency",30,...
"ValidationData",{P_Val_mix0,maskValidate},...
"LearnRateSchedule","piecewise",...
"LearnRateDropFactor",0.9, ...
"LearnRateDropPeriod",1);
%Do training
CocktailPartyNet = trainNetwork(X,newY,layers,options);
%CocktailPartyNet = trainNetwork(mixSequencesT,maskSequencesT,layers,options);
estimatedMasks0 = predict(CocktailPartyNet,P_Val_mix);
estimatedMasks0 = estimatedMasks0.';
Softs1Mask = estimatedMasks0;
Softs2Mask = 1 - Softs1Mask;
P_Val_mix0 = P_Val_mix0(:,1:size(Softs1Mask,2));
P_s1 = P_Val_mix0 .* Softs1Mask;
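% Mirror the coefficients with their conjugates (as for a one-sided spectrum) before inverting the transform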
P_s1 = [conj(P_s1(end-1:-1:2,:)) ; P_s1];
source1_est_soft = icwt(P_s1);
source1_est_soft = source1_est_soft / max(abs(source1_est_soft));
% range = (numel(win):numel(maleSpeech_est_soft)-numel(win));
t = (1/fs);
figure(9)
subplot(2,1,1)
plot(t,S11)
title("Original Source1")
xlabel("Time (s)")
grid on
subplot(2,1,2)
plot(t,source1_est_soft)
xlabel("Time (s)")
title("Estimated source 1 (Soft Mask)")
grid on
% sound(maleSpeech_est_soft(range),Fs)
%Multiply the mix CWT coefficients by the second soft mask to get the estimated source-2 coefficients. Use the inverse CWT to get the estimated source-2 signal. Scale the audio.
P_s2 = P_Val_mix0 .* Softs2Mask;
P_s2 = [conj(P_s2(end-1:-1:2,:)) ; P_s2];
source2_est_soft = icwt(P_s2);
source2_est_soft = source2_est_soft / max(abs(source2_est_soft));
%Visualize the estimated and original source-2 signals.
figure(10)
subplot(2,1,1)
plot(t,S22)
title("Original Source 2")
grid on
subplot(2,1,2)
plot(t,source2_est_soft)
xlabel("Time (s)")
title("Estimated source 2 (Soft Mask)")
grid on

Answers (1)

jibrahim on 25 Oct 2019
Hi Yaspy,
The problem is in the dimensions of your validation data, P_Val_mix0 and maskValidate: they are not 4-D, which is what trainNetwork requires when the network begins with an imageInputLayer.
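As a rough sketch only (the 28-by-28 tiling, the abs() on the complex CWT coefficients, and the names XVal and YVal are assumptions of mine, not part of your pipeline), the validation pair could be reshaped into height-by-width-by-channels-by-observations arrays so that it matches imageInputLayer([28 28 1]):
% Sketch under assumptions: tile the 2-D CWT matrices into 28-by-28 single-channel patches
patch = [28 28];
nObs = floor(numel(P_Val_mix0)/prod(patch));
XVal = reshape(abs(P_Val_mix0(1:nObs*prod(patch))), patch(1), patch(2), 1, nObs);
YVal = reshape(maskValidate(1:nObs*prod(patch)), patch(1), patch(2), 1, nObs);
% XVal and YVal are now 28-by-28-by-1-by-nObs, i.e. 4-D
options = trainingOptions("adam","ValidationData",{XVal,YVal});
The training input and response arrays you pass to trainNetwork need the same 4-D layout (same height, width, and channel count) for the validation check to work.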
