1 /*---------------------------------------------------------------------------*
2 * get_fram.c *
3 * *
 *  Copyright 2007, 2008 Nuance Communications, Inc.                         *
5 * *
6 * Licensed under the Apache License, Version 2.0 (the 'License'); *
7 * you may not use this file except in compliance with the License. *
8 * *
9 * You may obtain a copy of the License at *
10 * http://www.apache.org/licenses/LICENSE-2.0 *
11 * *
12 * Unless required by applicable law or agreed to in writing, software *
13 * distributed under the License is distributed on an 'AS IS' BASIS, *
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
15 * See the License for the specific language governing permissions and *
16 * limitations under the License. *
17 * *
18 *---------------------------------------------------------------------------*/
19
20
21 #include <stdlib.h>
22 #ifndef _RTT
23 #include "pstdio.h"
24 #endif
25 #include <limits.h>
26 #include <math.h>
27 #include <string.h>
28 #include "passert.h"
29
30 #include "c42mul.h"
31 #include "portable.h"
32
33 #include "../clib/fpi_tgt.inl"
34
35 #define DEBUG 0
36 #define FUDGE_FACTOR 1.2f
37
38 const float root_pi_over_2 = (float) 1.2533141;
39
40 static const char get_fram[] = "$Id: get_fram.c,v 1.7.6.13 2007/10/15 18:06:24 dahan Exp $";
41
42 static void create_cepstrum_offsets(preprocessed *prep);
43 static void destroy_cepstrum_offsets(preprocessed *prep);
44 static void apply_channel_offset(preprocessed *prep);
45 static int compare_cached_frame(preprocessed *prep, utterance_info *utt);
46
void init_utterance(utterance_info *utt, int utt_type, int dimen,
                    int buffer_size, int keep_frames, int num_chan, int do_voicing)
/*
** Prepare an utterance structure: record its type and dimension, allocate
** the frame buffer, and start background-channel (ambient) estimation.
*/
{
  ASSERT(utt);
  ASSERT(dimen > 0);

  /* The buffer must be large enough to hold the frames we intend to keep. */
  if (buffer_size < keep_frames)
    SERVICE_ERROR(BAD_ARGUMENT);

  utt->utt_type = utt_type;
  utt->gen_utt.dim = dimen;
  utt->gen_utt.num_chan = num_chan;
  utt->gen_utt.frame = createFrameBuffer(buffer_size, dimen,
                                         keep_frames, do_voicing);

  setup_ambient_estimation(utt->gen_utt.backchan,
                           utt->gen_utt.num_chan, 100);
}
69
void set_voicing_durations(utterance_info *utt, int voice_duration,
                           int quiet_duration, int unsure_duration,
                           int start_windback)
/*
** Store the endpointer timing parameters used by utterance_detection_fixup
** during voicing-based segmentation.
*/
{
  ASSERT(utt);  /* added for consistency with the other entry points here */
  utt->gen_utt.voice_duration = voice_duration;
  utt->gen_utt.quiet_duration = quiet_duration;
  utt->gen_utt.unsure_duration = unsure_duration;
  utt->gen_utt.start_windback = start_windback;
  return;
}
80
void free_utterance(utterance_info *utt)
/*
** Tear down an utterance: stop ambient estimation and release the
** frame buffer (pointer is cleared so a second call is harmless).
*/
{
  ASSERT(utt);

  clear_ambient_estimation(utt->gen_utt.backchan, utt->gen_utt.dim);

  if (utt->gen_utt.frame != NULL)
  {
    destroyFrameBuffer(utt->gen_utt.frame);
    utt->gen_utt.frame = NULL;
  }
}
98
void init_preprocessed(preprocessed *prep, int dimen, float imelda_scale)
/*
** To setup the preprocessed structure: allocates the working feature
** vectors and derives the fixed-point scaling constants used by the
** distance calculation.
** NOTE(review): the formulas below read prep->uni_score_scale,
** prep->uni_score_offset, prep->whole_dim and prep->use_dim, which this
** function never sets — confirm callers initialize them first.
*/
{

  ASSERT(prep);
  ASSERT(dimen > 0);
  prep->dim = dimen;
  /* Working vectors: transformed features, un-normalized copy, and a cache
  ** of the last raw frame (used by compare_cached_frame for clones). */
  prep->seq = (imeldata *) CALLOC(prep->dim, sizeof(imeldata),
                                  "srec.prep->seq");
  prep->seq_unnorm = (imeldata *) CALLOC(prep->dim, sizeof(imeldata),
                                         "srec.prep->seq_unnorm");
  prep->last_frame = (featdata *) CALLOC(prep->dim, sizeof(featdata),
                                         "srec.prep->last_frame");

  /* Setup constants for distance calculation.  The "+0.5 ... -(prdata)0.5"
  ** pattern rounds each float expression to the nearest prdata value.
  */
  /* TODO: check numbers for non-zero */
  prep->add.scale = (prdata)((2 * imelda_scale * imelda_scale) / MUL_SCALE
                             + 0.5) - (prdata)0.5;
  prep->add.inv_scale = (prdata)(((float)(0x01 << 12) * MUL_SCALE) /
                                 (2 * imelda_scale * imelda_scale) + 0.5) -
                        (prdata)0.5;
  prep->mul.multable_factor_gaussian = 1;
  prep->mul.multable_factor = (prdata)(((MUL_SCALE * (0x01 << EUCLID_SHIFT)
                                         * prep->uni_score_scale)
                                        / (2 * (imelda_scale * imelda_scale
                                                * FUDGE_FACTOR * FUDGE_FACTOR))) / 128 + 0.5)
                              - (prdata)0.5;
  /* Grand model covariance terms; the gaussian variant omits FUDGE_FACTOR
  ** and the uni_score offset/scale. */
  prep->mul.grand_mod_cov = (prdata)((MUL_SCALE * prep->uni_score_scale *
                                      prep->whole_dim *
                                      log((imelda_scale * FUDGE_FACTOR) /
                                          (SIGMA_BIAS * root_pi_over_2))) / 128 + 0.5)
                            - (prdata)0.5 - prep->uni_score_offset;
  prep->mul.grand_mod_cov_gaussian = (prdata)(2 * imelda_scale * imelda_scale *
                                              prep->use_dim *
                                              log(imelda_scale /
                                                  (SIGMA_BIAS * root_pi_over_2)) + 0.5)
                                     - (prdata)0.5;
#if DEBUG
  log_report("grand_mod_cov %.1f, grand_mod_cov_gaussian %.1f\n",
             (float)prep->mul.grand_mod_cov,
             (float)prep->mul.grand_mod_cov_gaussian);
  log_report("multable_factor %f, multable_factor_gaussian %f\n",
             (float)prep->mul.multable_factor,
             (float)prep->mul.multable_factor_gaussian);
#endif


  create_cepstrum_offsets(prep);
  return;
}
152
void clear_preprocessed(preprocessed *prep)
/*
** Release the buffers allocated by init_preprocessed.  Pointers are
** cleared after FREE so stale references cannot be dereferenced or
** double-freed (matches destroy_cepstrum_offsets, which already does this).
*/
{
  ASSERT(prep);
  destroy_cepstrum_offsets(prep);
  prep->dim = 0;
  FREE((char *)prep->last_frame);
  prep->last_frame = NULL;
  FREE((char *)prep->seq);
  prep->seq = NULL;
  FREE((char *)prep->seq_unnorm);
  prep->seq_unnorm = NULL;
  return;
}
166
int get_data_frame(preprocessed *prep, utterance_info *utt)
/*
** To get a frame amount of data and perform preprocessing functions:
** fetch the next frame, apply channel offsets, the optional linear
** (imelda) transform, and channel normalization.
** Returns 1 when a frame is ready in prep->seq, 0 when no frame is
** available; raises a service error on dimension mismatch or if VFR
** post-processing is requested (not supported).
*/
{
  int status_code;

  ASSERT(prep);
  ASSERT(utt);
  /* Lazily convert the channel-normalization adjustment into imelda space
  ** the first time it is needed. */
  if (utt->gen_utt.channorm && !utt->gen_utt.channorm->adj_valid)
    convert_adjustment_to_imelda(utt->gen_utt.channorm, prep);
  if (utt->gen_utt.dim != prep->dim)
    SERVICE_ERROR(UTTERANCE_DIMEN_MISMATCH);

  if (prep->post_proc & VFR)
  {
    if ((status_code = get_utterance_frame(prep, utt)) <= 0)
      return (status_code);

    /* Variable frame rate is not implemented in this build. */
    log_report("get_data_frame vfr not supported\n");
    SERVICE_ERROR(FEATURE_NOT_SUPPORTED);
  }
  else
  {
    status_code = get_utterance_frame(prep, utt);
    /* -1 means this is a clone and the cached frame matched: report
    ** success without redoing the post-processing below. */
    if (status_code == 0) return(status_code);
    else if (status_code == -1) return(1);
  }

  if (prep->chan_offset)
    apply_channel_offset(prep);

  /* Apply linear transformation if necessary
  */
  if (prep->post_proc & LIN_TRAN)
    linear_transform_frame(prep, prep->seq, True);

  /* Snapshot the pre-normalization features before channel normalization. */
  memcpy(prep->seq_unnorm, prep->seq, prep->dim * sizeof(imeldata));
  if (utt->gen_utt.channorm)
    apply_channel_normalization_in_imelda(utt->gen_utt.channorm,
                                          prep->seq, prep->seq_unnorm,
                                          utt->gen_utt.channorm->dim);
  return (1);
}
211
int get_utterance_frame(preprocessed *prep, utterance_info *utt)
/*
** To get a frame amount of data
** Maintains a single data buffer and passes the pointers to frame of data.
** Post-increments after copying
** Returns: 1 = frame copied into prep->seq, 0 = no frame available,
** -1 = this object is shared (ref_count > 1) and the current frame is
** identical to the cached prep->last_frame, so no copy was needed.
*/
{
  featdata *frame_ptr;
  int ii;

  ASSERT(prep);
  ASSERT(utt);

  /* Get the next data frame in
  */
  if (getFrameGap(utt->gen_utt.frame) > 0)
  {
    /* is it a cloned object: skip the copy if the frame hasn't changed */
    if (prep->ref_count > 1 && compare_cached_frame(prep, utt))
      return (-1);

    frame_ptr = currentRECframePtr(utt->gen_utt.frame);
    if (frame_ptr == NULL)
      return (0);
    if (prep->ref_count > 1)
    {
      /* Cache the raw frame so clones can detect a repeated frame. */
      ASSERT(prep->last_frame);
      memcpy(prep->last_frame, frame_ptr,
             prep->dim* sizeof(featdata));
    }
    /* Widen the raw feature bytes into the imeldata working vector. */
    for (ii = 0; ii < utt->gen_utt.dim; ii++)
      prep->seq[ii] = (imeldata)frame_ptr[ii];
    /* Apply fast-voice corrections if necessary */
    if (utt->gen_utt.frame->haveVoiced)
    {
      utterance_detection_fixup(utt->gen_utt.frame,
                                &utt->gen_utt.last_push, utt->gen_utt.voice_duration,
                                utt->gen_utt.quiet_duration, utt->gen_utt.unsure_duration);
      /* if (isFrameBufferActive (utt->gen_utt.frame)
         && getFrameGap (utt->gen_utt.frame) <= utt->gen_utt.quiet_duration)
         SERVICE_ERROR (INTERNAL_ERROR); */
      prep->voicing_status =
        rec_frame_voicing_status(utt->gen_utt.frame);
    }
    return (1);
  }
  return (0);
}
260
261
int advance_utterance_frame(utterance_info *utt)
/*
** Step the recognizer frame pointer forward by one frame.
** Returns 1 on success, 0 when no frame is pending or the step fails.
*/
{
  ASSERT(utt);

  /* Nothing to consume yet? */
  if (getFrameGap(utt->gen_utt.frame) <= 0)
    return 0;

  return (incRECframePtr(utt->gen_utt.frame) == False) ? 1 : 0;
}
280
int retreat_utterance_frame(utterance_info *utt)
/*
** Step the recognizer frame pointer back by one frame.
** Returns 1 on success, 0 when no history is available or the step fails.
*/
{
  ASSERT(utt);

  /* No kept frames behind the pointer? */
  if (getBlockGap(utt->gen_utt.frame) <= 0)
    return 0;

  return (decRECframePtr(utt->gen_utt.frame) == False) ? 1 : 0;
}
296
void prepare_data_frame(preprocessed *prep)
/*
** Precompute the (negated) sums of squares of the current feature vectors,
** cached for later distance calculations.
*/
{
  int ii;
  prdata sum_sq;

  sum_sq = 0;

  /* Sum of squares over the first whole_dim normalized features. */
  for (ii = 0; ii < prep->whole_dim; ii++)
    sum_sq += (prdata) SQR((prdata)prep->seq[ii]);
  prep->seq_sq_sum_whole = -sum_sq;

  ASSERT(prep->whole_dim <= prep->use_dim);
  /* NOTE(review): sum_sq is NOT reset before this loop, so elements
  ** [0, whole_dim) are counted twice in seq_sq_sum — confirm this
  ** double-weighting is intentional rather than a missing reset. */
  for (ii = 0; ii < prep->use_dim; ii++)
    sum_sq += (prdata) SQR((prdata)prep->seq[ii]);
  prep->seq_sq_sum = -sum_sq;

  sum_sq = 0;

  /* Same whole_dim sum over the un-normalized features. */
  for (ii = 0; ii < prep->whole_dim; ii++)
    sum_sq += (prdata) SQR((prdata)prep->seq_unnorm[ii]);
  prep->seq_unnorm_sq_sum_whole = -sum_sq;

  return;
}
321
int utterance_started(utterance_info *utt)
/* Report whether voicing has been detected in the utterance frame buffer. */
{
  ASSERT(utt);

  if (!utt->gen_utt.frame->haveVoiced)
    return (False);
  if (!utt->gen_utt.frame->voicingDetected)
    return (False);
  return (True);
}
331
int utterance_ended(utterance_info *utt)
/* Report whether the frame buffer has flagged end-of-utterance. */
{
  ASSERT(utt);
  return utt->gen_utt.frame->utt_ended;
}
337
int load_utterance_frame(utterance_info *utt, unsigned char* pUttFrame, int voicing)
/*
** Widen a raw byte frame into featdata and push it onto the utterance's
** frame buffer with the given voicing code.
** Returns 1 on success, 0 if the frame buffer rejects the frame.
*/
{
  featdata framdata[MAX_DIMEN];
  int ii;

  ASSERT(utt);
  ASSERT(pUttFrame);
  /* Guard the stack buffer: uttDim elements are written below
  ** (mirrors the MAX_DIMEN check done in copy_pattern_frame). */
  ASSERT(utt->gen_utt.frame->uttDim <= MAX_DIMEN);

  for (ii = 0; ii < utt->gen_utt.frame->uttDim; ii++)
    framdata[ii] = (featdata) pUttFrame[ii];

  if (pushSingleFEPframe(utt->gen_utt.frame, framdata, voicing) != False)
    return (0);

  return (1);
}
354
int copy_utterance_frame(utterance_info *oututt, utterance_info *inutt)
/*
** Push the current frame of inutt (with its voicing code) onto oututt's
** frame buffer.  Returns 1 on success, 0 when no frame is available or
** the push fails.
*/
{
  featdata *src_frame;
  int voicing;

  ASSERT(oututt);
  ASSERT(inutt);

  src_frame = currentRECframePtr(inutt->gen_utt.frame);
  if (src_frame == NULL)
    return 0;

  voicing = getVoicingCode(inutt->gen_utt.frame, src_frame);

  return (pushSingleFEPframe(oututt->gen_utt.frame, src_frame, voicing)
          == False) ? 1 : 0;
}
373
int copy_pattern_frame(utterance_info *oututt, preprocessed *prep)
/*
** Clamp the preprocessed feature vector to byte range [0, 255] and push
** it onto the output utterance together with the current voicing status.
** Returns 1 on success, 0 if the push fails.
*/
{
  featdata clamped[MAX_DIMEN];
  int ii;

  ASSERT(oututt);
  ASSERT(prep);
  ASSERT(oututt->gen_utt.dim < MAX_DIMEN);

  for (ii = 0; ii < oututt->gen_utt.dim; ii++)
    clamped[ii] = (featdata) RANGE(prep->seq[ii], 0, 255);

  if (pushSingleFEPframe(oututt->gen_utt.frame, clamped,
                         prep->voicing_status) != False)
    return 0;

  return 1;
}
389
static void create_cepstrum_offsets(preprocessed *prep)
/* Allocate the zero-initialized per-dimension channel-offset vector. */
{
  ASSERT(prep);
  prep->chan_offset =
    (imeldata *) CALLOC_CLR(prep->dim, sizeof(imeldata), "srec.chan_offset");
}
397
void set_cepstrum_offset(preprocessed *prep, int index, int value)
/* Store one channel-offset entry; index must lie within [0, dim). */
{
  ASSERT(prep);
  ASSERT(prep->chan_offset);
  ASSERT(index >= 0);
  ASSERT(index < prep->dim);
  prep->chan_offset[index] = (imeldata) value;
}
406
static void destroy_cepstrum_offsets(preprocessed *prep)
/* Release the channel-offset vector and clear the pointer. */
{
  ASSERT(prep);
  FREE((char *)prep->chan_offset);
  prep->chan_offset = NULL;
}
414
static void apply_channel_offset(preprocessed *prep)
/* Add the per-dimension channel offsets into the current feature vector. */
{
  int dd;

  for (dd = 0; dd < prep->dim; dd++)
    prep->seq[dd] += prep->chan_offset[dd];
}
423
static int compare_cached_frame(preprocessed *prep, utterance_info *utt)
/*
** Compare the current recognizer frame against the copy cached in
** prep->last_frame.  Returns True on an exact match, False when there
** is no current frame or any element differs.
*/
{
  featdata *cur;
  int dd;

  cur = currentRECframePtr(utt->gen_utt.frame);
  if (cur == NULL)
    return (False);

  for (dd = 0; dd < utt->gen_utt.dim; dd++)
  {
    if (prep->last_frame[dd] != cur[dd])
      return (False);
  }
  return (True);
}
437
void convert_adjustment_to_imelda(norm_info *norm, preprocessed *prep)
/*
** Transform the channel-normalization adjustment vector into imelda
** (linearly transformed) space, cache it in norm->imelda_adjust, and
** mark it valid so get_data_frame does not repeat the conversion.
*/
{
  int ii;
  imeldata fram[MAX_DIMEN];

  ASSERT(prep);
  ASSERT(norm);
  /* Only the first 12 coefficients carry an adjustment; the remainder are
  ** zero-padded before the transform. */
  for (ii = 0; ii < 12; ii++) /* TODO: fix dimension properly, and sort out rounding/type */
    fram[ii] = (imeldata) norm->adjust[ii]; /* TODO: review types */
  for (; ii < prep->dim; ii++)
    fram[ii] = 0;

  /* NOTE(review): third argument is False here but True in get_data_frame —
  ** confirm the intended difference in linear_transform_frame's behavior. */
  linear_transform_frame(prep, fram, False);

  for (ii = 0; ii < prep->dim; ii++)
    norm->imelda_adjust[ii] = fram[ii];
#if DEBUG
  log_report("NORM AUX: ");
  for (ii = 0; ii < norm->dim; ii++)
    log_report("%d ", (int)norm->imelda_adjust[ii]);
  log_report("\n");
#endif
  norm->adj_valid = True;
  return;
}
463