%% simple sentence classification using discrete HMM
% This script demonstrates a language-model framework using discrete HMMs,
% including how to train an HMM per intent,
% how to encode sentences as numeric word sequences,
% and how to evaluate HMMs using the Viterbi algorithm.
%
% Input format (sentence and intent separated by ':', one pair per line):
% sentence_1 : intent
% sentence_2 : intent
% ...
% Example:
% Hello world : greeting
% Hi how do you do : greeting
% Could you run a script for me : run_script
% please run script : run_script
% please tell me the result of script : show_script
%
% Dependencies:
% Statistics Toolbox (hmmtrain, hmmviterbi)
%
% Rattaphon Hokking([email protected]) 21/4/2016
%
% load the labeled sentences; importdata returns one cell per text line
% (adjust the path to your environment)
dataSet = importdata('F:\Project\Alice\sentence.txt');
sentence = {};
intent = {};
%% split sentence and intent
for i = 1:size(dataSet, 1)
    temp = regexp(dataSet{i}, ':', 'split');
    sentence{i} = strtrim(temp{1});
    intent{i} = strrep(temp{2}, ' ', '');
end
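% For example, the line 'Hello world : greeting' yields
% sentence{i} = 'Hello world' and intent{i} = 'greeting'.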
%% create system dictionary
dict = {};
for i = 1:numel(sentence)
    % split the sentence into words and append them to the dictionary
    words = regexp(sentence{i}, ' ', 'split');
    dict = [dict words];
end
dict = unique(dict);
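% dict is now a sorted cell array of unique words; the position of each
% word in dict defines its HMM observation symbol.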
%% create intent dictionary
intentDict = unique(intent);
%% HMM models for sequences
% initialise one discrete HMM per intent
hmm = {};
nState = 3;
nObserver = numel(dict);
nModel = numel(intentDict);
for i = 1:nModel
    % random initial guess; nonzero entries prevent zero transitions
    transition = rand(nState, nState);
    emission = rand(nState, nObserver);
    % normalise rows to sum to 1: each state must transition to some state
    transition = transition ./ repmat(sum(transition, 2), 1, nState);
    emission = emission ./ repmat(sum(emission, 2), 1, nObserver);
    % store the model as {transition, emission}
    hmm{i} = {transition, emission};
end
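% Optional sanity check: every row of each stochastic matrix should sum
% to 1 after the normalisation above.
for i = 1:nModel
    assert(all(abs(sum(hmm{i}{1}, 2) - 1) < 1e-12));
    assert(all(abs(sum(hmm{i}{2}, 2) - 1) < 1e-12));
end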
%% define sentence coding
% map a word to its numeric index in a dictionary (exact string match)
code = @(w, dictionary) find(strcmp(dictionary, w));
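% Usage example (assuming 'world' appears in the training sentences):
%   code('world', dict)  % index of 'world' among the observation symbols
% An out-of-vocabulary word returns an empty array.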
%% hmm training
for i = 1:numel(sentence)
    % encode the sentence into an index sequence
    words = regexp(sentence{i}, ' ', 'split');
    seq = zeros(numel(words), 1);
    % map words to a numeric sequence
    for j = 1:numel(words)
        seq(j) = code(words{j}, dict);
    end
    % find the expected intent of this sentence
    intentIdx = code(intent{i}, intentDict);
    % load the previous model
    ESTTR = hmm{intentIdx}{1};
    ESTEMIT = hmm{intentIdx}{2};
    % pseudo-count parameters prevent divide-by-zero and zero transitions
    pseudoEmit = ESTEMIT;
    pseudoTr = ESTTR;
    % estimate new transition and emission matrices
    [ESTTR, ESTEMIT] = hmmtrain(seq', ...
        ESTTR, ESTEMIT, ...
        'algorithm', 'viterbi', ...
        'PSEUDOTRANSITIONS', pseudoTr, ...
        'PSEUDOEMISSIONS', pseudoEmit);
    hmm{intentIdx} = {ESTTR, ESTEMIT};
end
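% Note: each sentence updates its intent's model exactly once, in file
% order; repeated passes over the data (e.g. wrapping the loop above in
% for epoch = 1:nEpoch ... end, with nEpoch a hypothetical parameter)
% may improve the estimates.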
%% evaluate HMM
labelVec = zeros(numel(sentence), 1);
LOGPSEQBuffer = zeros(numel(hmm), numel(sentence));
for i = 1:numel(sentence)
    % encode the sentence into an index sequence
    words = regexp(sentence{i}, ' ', 'split');
    seq = zeros(numel(words), 1);
    for j = 1:numel(words)
        seq(j) = code(words{j}, dict);
    end
    % record the ground-truth intent of this sentence
    intentIdx = code(intent{i}, intentDict);
    labelVec(i) = intentIdx;
    % compare the log likelihood of the sequence under each model
    for mIdx = 1:numel(hmm)
        model = hmm{mIdx};
        ESTTR = model{1};
        ESTEMIT = model{2};
        % log probability of the best state path for this sequence
        [~, logProbSeq] = hmmviterbi(seq', ESTTR, ESTEMIT);
        LOGPSEQBuffer(mIdx, i) = logProbSeq;
    end
end
%% find the maximum log likelihood for each sequence
% max over models (rows) gives the winning model index per sentence
[~, outClass] = max(LOGPSEQBuffer);
%% calculate accuracy rate
accRate = sum(outClass == labelVec') / numel(labelVec);
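%% classify an unseen sentence (illustrative sketch)
% A minimal usage example of the trained models; 'newSentence' is a
% hypothetical input, and every word must already appear in dict,
% otherwise code() returns empty and the cellfun below fails.
newSentence = 'please run script';
newWords = regexp(newSentence, ' ', 'split');
newSeq = cellfun(@(w) code(w, dict), newWords);
newLogP = zeros(1, numel(hmm));
for mIdx = 1:numel(hmm)
    % score the sequence against each intent model
    [~, newLogP(mIdx)] = hmmviterbi(newSeq, hmm{mIdx}{1}, hmm{mIdx}{2});
end
[~, predictedIdx] = max(newLogP);
fprintf('predicted intent: %s\n', intentDict{predictedIdx});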