【语音识别】基于MFCC和gmm特征实现语音识别含GUI

241 阅读7分钟

 1 模型

本系统采用能够反映人耳对语音感知特性的Mel频率倒谱系数(MFCC)作为特征参数,并采用矢量量化(VQ)技术以避免时间规整问题,从而构建说话人识别系统。MFCC主要模拟人耳的听觉过程,与其它参数相比,它对语音波形的变化不敏感,更加稳定,因此系统取得了很好的识别结果。实验表明,系统训练和识别的计算量和存储量都比较低。

img

img

2 部分代码

function varargout = Main(varargin)
% MAIN M-file for Main.fig
%     MAIN, by itself, creates a new MAIN or raises the existing
%     singleton*.
%
%     H = MAIN returns the handle to a new MAIN or the handle to
%     the existing singleton*.
%
%     MAIN('CALLBACK',hObject,eventData,handles,...) calls the local
%     function named CALLBACK in MAIN.M with the given input arguments.
%
%     MAIN('Property','Value',...) creates a new MAIN or raises the
%     existing singleton*. Starting from the left, property value pairs are
%     applied to the GUI before Main_OpeningFcn gets called. An
%     unrecognized property name or invalid value makes property application
%     stop. All inputs are passed to Main_OpeningFcn via varargin.
%
%     *See GUI Options on GUIDE's Tools menu. Choose "GUI allows only one
%     instance to run (singleton)".
%
% See also: GUIDE, GUIDATA, GUIHANDLES

% Edit the above text to modify the response to help Main

% Last Modified by GUIDE v2.5 11-Aug-2016 00:35:18

% How the initialization code below dispatches a call:
%   * gui_State bundles the GUI name (mfilename), the singleton flag and
%     the handles of Main_OpeningFcn / Main_OutputFcn; it is passed to
%     gui_mainfcn together with all inputs.
%   * When at least one input is given and the first one is a character
%     array, it is converted to a function handle with str2func and stored
%     in gui_State.gui_Callback.
%   * gui_mainfcn is called with output arguments only when the caller
%     requested some (nargout > 0).

% Begin initialization code - DO NOT EDIT
gui_Singleton = 1;
gui_State = struct('gui_Name',       mfilename, ...
                 'gui_Singleton',  gui_Singleton, ...
                 'gui_OpeningFcn', @Main_OpeningFcn, ...
                 'gui_OutputFcn',  @Main_OutputFcn, ...
                 'gui_LayoutFcn', [] , ...
                 'gui_Callback',   []);
if nargin && ischar(varargin{1})
  gui_State.gui_Callback = str2func(varargin{1});
end

if nargout
[varargout{1:nargout}] = gui_mainfcn(gui_State, varargin{:});
else
  gui_mainfcn(gui_State, varargin{:});
end
% End initialization code - DO NOT EDIT


% --- Executes just before Main is made visible.
function Main_OpeningFcn(hObject, eventdata, handles, varargin)
% MAIN_OPENINGFCN Initialise the GUI and show a summary of the stored training data.
% This function has no output args, see OutputFcn.
% hObject   handle to figure
% eventdata reserved - to be defined in a future version of MATLAB
% handles   structure with handles and user data (see GUIDATA)
% varargin   command line arguments to Main (see VARARGIN)
%
% The training matrix and its labels are read from TrainingSet.mat and
% TrainingLable.mat. If either file (or the expected variable inside it)
% is missing, the GUI still opens and reports zero samples instead of
% aborting during start-up.

% Choose default command line output for Main
handles.output = hObject;

% Update handles structure
guidata(hObject, handles);

% UIWAIT makes Main wait for user response (see UIRESUME)
% uiwait(handles.figure1);

% Load into structs so no variables are injected into the workspace and a
% missing file can be handled explicitly.
try
    setData = load('TrainingSet');
    lableData = load('TrainingLable');
    TrainingSet = setData.TrainingSet;
    TrainingLable = lableData.TrainingLable;
catch
    TrainingSet = [];
    TrainingLable = [];
end

totalSampl = size(TrainingSet, 1);   % one row per training sample

% Frequency table of the labels, rendered as text (empty when no labels).
if isempty(TrainingLable)
    str = '';
else
    str = num2str(tabulate(TrainingLable));
end

set(handles.totalrecords,'String',str);
% Plain concatenation is used on purpose: strcat removes the trailing
% blank of 'Total Samples: ' and would glue the number to the colon.
set(handles.resultText,'String',['Total Samples: ', num2str(totalSampl)]);


% --- Outputs from this function are returned to the command line.
function varargout = Main_OutputFcn(hObject, eventdata, handles)
% MAIN_OUTPUTFCN Return the GUI's default output to the command line.
% varargout cell array for returning output args (see VARARGOUT);
% hObject   handle to figure
% eventdata reserved - to be defined in a future version of MATLAB
% handles   structure with handles and user data (see GUIDATA)

% The single output is whatever was stored in handles.output.
varargout = {handles.output};



% --- Executes on button press in trainBtn.
function trainBtn_Callback(hObject, eventdata, handles)
% TRAINBTN_CALLBACK Record a 2 s utterance, pre-emphasise it and save it as RAW.wav.
% hObject   handle to trainBtn (see GCBO)
% eventdata reserved - to be defined in a future version of MATLAB
% handles   structure with handles and user data (see GUIDATA)
%
% Steps: record from the default input device, apply a first-order
% pre-emphasis filter, write the result to RAW.wav, play it back, then
% split it into overlapping Hamming-windowed frames. Feature extraction
% and label storage are not done here; the user is told to continue with
% "Train with Audio".
clc;
set(handles.statusText,'String','Start Speaking...');
pause(0.001);                      % let the GUI repaint before blocking

Fs = 8000;                         % Sampling frequency (Hz)
Duration = 2;                      % Recording length (s)

% wavrecord/wavwrite/wavplay no longer exist in current MATLAB releases;
% audiorecorder/audiowrite/audioplayer are the supported replacements.
audio_rec_obj = audiorecorder(Fs, 16, 1);
recordblocking(audio_rec_obj, Duration);
myRecording = getaudiodata(audio_rec_obj);   % double column vector

set(handles.statusText,'String','Saving....');
pause(0.001);

% Pre-emphasis (high-pass) filter: y(n) = x(n) - Prem*x(n-1)
Prem = 0.97;
Filtered_output = filter([1,-Prem],1,myRecording);

audiowrite('RAW.wav', Filtered_output, Fs, 'BitsPerSample', 16);
playblocking(audioplayer(Filtered_output, Fs));   % blocking, like wavplay

len = length(Filtered_output);

Frame_size = Fs*32/1000;                 % 32 ms -> 256 sample points
Frame_overlap = Fs*16/1000;              % 16 ms -> 128 sample points
Frame_step = Frame_size-Frame_overlap;   % hop between frames (128)
numFrames = floor(len/Frame_step);

% Zero-padded copy of the signal, long enough for the last frame index.
padLen = max([len, numFrames*Frame_size, numFrames*Frame_step+Frame_size]);
paddesSignal = zeros(padLen,1);
paddesSignal(1:len) = Filtered_output(:);

% Frame blocking.
% NOTE(review): frame i starts at i*Frame_step+1, so the first Frame_step
% samples are never used. Kept as in the original - confirm whether
% (i-1)*Frame_step was intended.
fdata = zeros(numFrames, Frame_size);
for i = 1:numFrames
    fdata(i,:) = paddesSignal(i*Frame_step + (1:Frame_size)).';
end

%% (2) Windowing..
[nbFrames, nbSamples] = size(fdata);

% Hamming window applied to every frame (row).
w = hamming(nbSamples);
afterWindow = zeros(nbFrames,nbSamples);
for i = 1:nbFrames
    % Original indexed afterWindow(i1:nbSamples); i1 was never defined.
    afterWindow(i,1:nbSamples) = w'.*fdata(i,1:nbSamples);
end

set(handles.statusText,'String','Input Saved in .wav file format');
pause(0.001);

% Wait for the user to confirm on the command line. The 's' flag returns
% the typed text verbatim instead of evaluating it as an expression.
inputLable = input('Press any key ', 's'); %#ok<NASGU>

disp('Select saved input through "Train with Audio" for Feature Extraction');
  
 
  


% --- Executes on button press in testBtn.
function testBtn_Callback(hObject, eventdata, handles)
% TESTBTN_CALLBACK Record an utterance, extract MFCC features and classify its language.
% hObject   handle to testBtn (see GCBO)
% eventdata reserved - to be defined in a future version of MATLAB
% handles   structure with handles and user data (see GUIDATA)
%
% Pipeline: record 2 s of audio -> pre-emphasis -> framing -> Hamming
% window -> FFT magnitude -> mel filterbank (trifbank) -> log -> DCT ->
% liftering -> mean -> multisvm against the stored training set. The
% intermediate signals are plotted on axes1..axes4 and the predicted
% class is written to outputText (1 English, 2 Marathi, 3 Hindi).
clc;
set(handles.statusText,'String','Start Speaking...');
set(handles.outputText,'String','--');
pause(0.01);                       % let the GUI repaint before blocking

Fs = 8000;                         % Sampling frequency (Hz)
Duration = 2;                      % Recording length (s)

% The original assigned audiorecorder(2*Fs,Fs) to myRecording, which only
% constructs a recorder object and never captures audio.
audio_rec_obj = audiorecorder(Fs, 16, 1);
recordblocking(audio_rec_obj, Duration);
myRecording = getaudiodata(audio_rec_obj);   % double column vector

set(handles.statusText,'String','Stop Speaking');
pause(0.001);

% Raw waveform.
axes(handles.axes1);
plot(myRecording);
grid on;
xlabel('Samples');
ylabel('Magnitude(db)');
set(handles.statusText,'String','Done with Recording...');
pause(0.001);

% Pre-emphasis (high-pass) filter: y(n) = x(n) - Prem*x(n-1)
Prem = 0.97;
Filtered_output = filter([1,-Prem],1,myRecording);
sound(Filtered_output, Fs);        % play at the recording rate

axes(handles.axes2);
plot(Filtered_output);
grid on;
xlabel('Samples');
ylabel('Magnitude(db)');
len = length(Filtered_output);

Frame_size = Fs*32/1000;                 % 32 ms -> 256 sample points
Frame_overlap = Fs*16/1000;              % 16 ms -> 128 sample points
Frame_step = Frame_size-Frame_overlap;   % hop between frames (128)
numFrames = floor(len/Frame_step);

% Zero-padded copy of the signal, long enough for the last frame index.
padLen = max([len, numFrames*Frame_size, numFrames*Frame_step+Frame_size]);
paddesSignal = zeros(padLen,1);
paddesSignal(1:len) = Filtered_output(:);

% Frame blocking.
% NOTE(review): frame i starts at i*Frame_step+1, so the first Frame_step
% samples are never used. Kept unchanged so features stay comparable with
% the stored training data - confirm whether (i-1)*Frame_step was meant.
fdata = zeros(numFrames, Frame_size);
for i = 1:numFrames
    fdata(i,:) = paddesSignal(i*Frame_step + (1:Frame_size)).';
end

%% (2) Windowing..
[nbFrames, nbSamples] = size(fdata);

% Hamming window applied to every frame (row).
w = hamming(nbSamples);
afterWindow = zeros(nbFrames,nbSamples);
for i = 1:nbFrames
    % Original indexed afterWindow(i1:nbSamples); i1 was never defined.
    afterWindow(i,1:nbSamples) = w'.*fdata(i,1:nbSamples);
end

axes(handles.axes3);
plot(afterWindow);
grid on;
xlabel('Samples');
ylabel('Magnitude(db)');

%% (3) MFCC parameters
R = [ 300 3700 ];  % frequency range to consider (Hz)
M = 20;            % number of filterbank channels
N = 13;            % number of cepstral coefficients
L = 22;            % cepstral sine lifter parameter

% NOTE(review): the FFT length is derived from the number of frames and
% the FFT is taken along dimension 1 (across frames), although frames are
% stored as rows. This looks unintended, but the length of meanMFCC - and
% therefore compatibility with the stored TrainingSet - depends on it, so
% it is left exactly as in the original. Confirm before changing.
nfft = 2^nextpow2( nbFrames );     % length of FFT analysis
K = nfft/2+1;                      % length of the unique part of the FFT

%% HANDY INLINE FUNCTION HANDLES

% Forward and backward mel frequency warping (natural log, as in HTK).
hz2mel = @( hz )( 1127*log(1+hz/700) );     % Hertz to mel warping function
mel2hz = @( mel )( 700*exp(mel/1127)-700 ); % mel to Hertz warping function

% Type III DCT matrix routine. The original line read "cosrepmat(...)"
% with unbalanced parentheses and did not parse.
dctm = @( N, M )( sqrt(2.0/M) * cos( repmat([0:N-1].',1,M) ...
                                  .* repmat(pi*([1:M]-0.5)/M,N,1) ) );

% Cepstral lifter routine
ceplifter = @( N, L )( 1+0.5*L*sin(pi*[0:N-1]/L) );

%% (4) Feature extraction
MAG = abs( fft(afterWindow,nfft,1) );

% Triangular filterbank with uniformly spaced filters on mel scale
H = trifbank( M, K, R, Fs, hz2mel, mel2hz ); % size of H is M x K

% Filterbank application to unique part of the magnitude spectrum
FBE = H * MAG(1:K,:);

% Conversion of logFBEs to cepstral coefficients through DCT
CC = dctm( N, M ) * log( FBE );

% Cepstral liftering gives liftered cepstral coefficients
CC = diag( ceplifter( N, L ) ) * CC; % ~ HTK's MFCCs

% Column-wise mean of CC is the feature vector handed to the classifier.
meanMFCC = mean(CC);

axes(handles.axes4);
plot(meanMFCC);
grid on;
xlabel('Samples');
ylabel('Magnitude(db)');
set(handles.statusText,'String','Done');

%% (5) Classification
clc;
testData = meanMFCC;

% Load into structs with the exact file names; the original used
% "Traininglable", which does not match TrainingLable on case-sensitive
% file systems.
setData = load('TrainingSet');
lableData = load('TrainingLable');
classes = multisvm(setData.TrainingSet, lableData.TrainingLable', testData);

% The original repeated the DCT/classification code a second time after
% this point using an undefined variable DCT; that pasted duplicate has
% been dropped.
set(handles.outputText,'String','--');
languages = {'English','Marathi','Hindi'};
if isscalar(classes) && any(classes == 1:numel(languages))
    set(handles.outputText,'String',languages{classes});
end
  



function edit1_Callback(hObject, eventdata, handles)
% EDIT1_CALLBACK Callback stub for edit1; the body contains no statements.
% hObject   handle to edit1 (see GCBO)
% eventdata reserved - to be defined in a future version of MATLAB
% handles   structure with handles and user data (see GUIDATA)

% Hints: get(hObject,'String') returns contents of edit1 as text
%       str2double(get(hObject,'String')) returns contents of edit1 as a double


% --- Executes during object creation, after setting all properties.
function edit1_CreateFcn(hObject, eventdata, handles)
% EDIT1_CREATEFCN Give edit1 a white background on Windows when it still has the default colour.
% hObject   handle to edit1 (see GCBO)
% eventdata reserved - to be defined in a future version of MATLAB
% handles   empty - handles not created until after all CreateFcns called

% Nothing to adjust on non-Windows platforms.
if ~ispc
    return;
end

% Only override the colour if it has not been customised.
defaultColour = get(0,'defaultUicontrolBackgroundColor');
currentColour = get(hObject,'BackgroundColor');
if isequal(currentColour, defaultColour)
    set(hObject,'BackgroundColor','white');
end


% --------------------------------------------------------------------
function uipanel1_ButtonDownFcn(hObject, eventdata, handles)
% UIPANEL1_BUTTONDOWNFCN Callback stub for uipanel1; the body contains no statements.
% hObject   handle to uipanel1 (see GCBO)
% eventdata reserved - to be defined in a future version of MATLAB
% handles   structure with handles and user data (see GUIDATA)


% --- Executes on mouse press over figure background.
function figure1_ButtonDownFcn(hObject, eventdata, handles)
% FIGURE1_BUTTONDOWNFCN Callback stub for figure1; the body contains no statements.
% hObject   handle to figure1 (see GCBO)
% eventdata reserved - to be defined in a future version of MATLAB
% handles   structure with handles and user data (see GUIDATA)


% --- If Enable == 'on', executes on mouse press in 5 pixel border.
% --- Otherwise, executes on mouse press in 5 pixel border or over trainWithFilebtn.
function trainWithFilebtn_ButtonDownFcn(hObject, eventdata, handles)
% TRAINWITHFILEBTN_BUTTONDOWNFCN Callback stub for trainWithFilebtn; the body contains no statements.
% hObject   handle to trainWithFilebtn (see GCBO)
% eventdata reserved - to be defined in a future version of MATLAB
% handles   structure with handles and user data (see GUIDATA)


% --- Executes when figure1 is resized.
function figure1_ResizeFcn(hObject, eventdata, handles)
% FIGURE1_RESIZEFCN Callback stub for figure1 resize events; the body contains no statements.
% hObject   handle to figure1 (see GCBO)
% eventdata reserved - to be defined in a future version of MATLAB
% handles   structure with handles and user data (see GUIDATA)


% --- Executes on key press with focus on testWithAudioBtn and none of its controls.
function testWithAudioBtn_KeyPressFcn(hObject, eventdata, handles)
% TESTWITHAUDIOBTN_KEYPRESSFCN Callback stub for testWithAudioBtn key presses; the body contains no statements.
% hObject   handle to testWithAudioBtn (see GCBO)
% eventdata structure with the following fields (see UICONTROL)
%Key: name of the key that was pressed, in lower case
%Character: character interpretation of the key(s) that was pressed
%Modifier: name(s) of the modifier key(s) (i.e., control, shift) pressed
% handles   structure with handles and user data (see GUIDATA)

3 仿真结果

4 参考文献

[1]王伟, and 邓辉文. "基于MFCC参数和VQ的说话人识别系统." 第四届全国信息获取与处理学术会议 0.

 5 完整MATLAB代码与数据下载地址

见博客主页头条