1 /*---------------------------------------------------------------------------*
2 * swicms.c *
3 * *
4 * Copyright 2007, 2008 Nuance Communciations, Inc. *
5 * *
6 * Licensed under the Apache License, Version 2.0 (the 'License'); *
7 * you may not use this file except in compliance with the License. *
8 * *
9 * You may obtain a copy of the License at *
10 * http://www.apache.org/licenses/LICENSE-2.0 *
11 * *
12 * Unless required by applicable law or agreed to in writing, software *
13 * distributed under the License is distributed on an 'AS IS' BASIS, *
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
15 * See the License for the specific language governing permissions and *
16 * limitations under the License. *
17 * *
18 *---------------------------------------------------------------------------*/
19
20 #include <string.h>
21 #include"swicms.h"
22 #include"srec_sizes.h"
23 #include"prelib.h"
24
25 #include "passert.h"
26 #include "ESR_Session.h"
27 #include "ESR_SessionType.h"
28 #include "IntArrayList.h"
29 #include "portable.h"
30
/*
 * Debug helper: formats HEAD, the address of PTR and the first NN elements of
 * PTR (each one through FMT) into a single line and passes it to
 * PLogMessage().
 *
 *  - HEAD and the assembled line are passed as "%s" arguments, never as format
 *    strings, so a stray '%' cannot be interpreted as a conversion.
 *  - The address is printed with %p instead of being truncated through (int).
 *  - The element loop stops while at least 24 characters remain free in the
 *    256-character buffer, so a long vector is truncated instead of
 *    overrunning the buffer (each FMT expansion is assumed to need fewer than
 *    24 characters, which holds for the " %d" format used in this file).
 *  - Wrapped in do { } while (0) so that "printf_vector(...);" is a single
 *    statement; locals are prefixed pv_ to avoid shadowing caller variables.
 */
#define printf_vector(HEAD, FMT, PTR, NN) \
  do { \
    LCHAR pv_buffer[256]; \
    int pv_i; \
    sprintf(pv_buffer, "%.64s", (HEAD)); \
    sprintf(pv_buffer + LSTRLEN(pv_buffer), " %p", (const void*)(PTR)); \
    for (pv_i = 0; \
         pv_i < (NN) && LSTRLEN(pv_buffer) + 24 < sizeof(pv_buffer) / sizeof(pv_buffer[0]); \
         ++pv_i) \
      sprintf(pv_buffer + LSTRLEN(pv_buffer), (FMT), (PTR)[pv_i]); \
    PLogMessage("%s", pv_buffer); \
  } while (0)
32
33 /* Cross-utterance CMN calculation:
34 We try to normalize the speech frames before they get to the recognizer.
35 The speech frames are LDA-processed mfcc-with-dynamic feature vectors.
36 We collect these speech frames during recognition. At the end of
37 recognition we exclude the silence frames from the collected data, and
38 generate a new channel average based on the previous average and the new
39 data, using an exponential decay formula.
40
41 In-utterance CMN calculation:
42 A new short-term average mechanism was introduced, with faster update,
43 to improve recognition on the very first recognition after init or reset.
44 We wait for a minimum number of new data frames to apply this. We also
45 disable the fast updater after some frames, because we assume the
46 cross-utterance estimator to be more reliable, particularly in its
47 ability to exclude silence frames from the calculation.
48 */
49
50 /* default settings for cross-utterance cms */
51 #define SWICMS_FORGET_FACTOR_DEFAULT 400 /* effective frms of history */
52 #define SWICMS_SBINDEX_DEFAULT 100 /* use speech frames only */
53 /* #define SWICMS_CACHE_RESOLUTION_DEFAULT see swicms.h */
54 /* #define SWICMS_CACHE_SIZE_DEFAULT see swicms.h */
55
56 /* default settings for in-utterance cms */
57 #define SWICMS_INUTT_FORGET_FACTOR2_DISABLE 65535 /* any large number */
58 #define SWICMS_INUTT_FORGET_FACTOR2_DEFAULT SWICMS_INUTT_FORGET_FACTOR2_DISABLE
59 /* disable this when cross-utt become more reliable */
60 #define SWICMS_INUTT_DISABLE_AFTER_FRAMES 200
61 /* wait while the estimate is poor */
62 #define SWICMS_INUTT_ENABLE_AFTER_FRAMES 10
63
64 /**
65 * Logging Stuff
66 */
67 #define LOG_LEVEL 2
68 #define MODULE_NAME L("swicms.c")
69 //static const char* MTAG = MODULE_NAME;
70
71 static const char *rcsid = 0 ? (const char *) &rcsid :
72 "$Id: swicms.c,v 1.21.6.16 2008/06/05 19:00:55 stever Exp $";
73
74 static ESR_BOOL SWICMS_DEBUG = ESR_FALSE;
75
76 /* these are good values from cmn/tmn files */
77 static const imeldata gswicms_cmn1_8 [MAX_CHAN_DIM] =
78 {
79 158, 141, 99, 125, 101, 162, 113, 138, 128, 143, 123, 141,
80 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
81 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127
82 };
83
84 static const imeldata gswicms_cmn1_11 [MAX_CHAN_DIM] =
85 {
86 163, 121, 120, 114, 124, 139, 144, 108, 150, 119, 146, 124,
87 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
88 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127
89 };
90
91 static const imeldata gswicms_tmn1_8 [MAX_CHAN_DIM] =
92 {
93 108, 138, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
94 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
95 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127
96 };
97
98 static const imeldata gswicms_tmn1_11 [MAX_CHAN_DIM] =
99 {
100 108, 138, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
101 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
102 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127
103 };
104
GetSomeIntsIfAny(const LCHAR * parname,imeldata * parvalue,size_t reqSize)105 static ESR_ReturnCode GetSomeIntsIfAny( const LCHAR* parname, imeldata* parvalue, size_t reqSize)
106 {
107 size_t i, size;
108 ESR_ReturnCode rc;
109 ESR_BOOL exists;
110 IntArrayList* intList = 0;
111
112 CHKLOG(rc, ESR_SessionContains(parname, &exists));
113 if (exists) {
114 rc = ESR_SessionGetProperty(parname, (void**)&intList, TYPES_INTARRAYLIST);
115 if (rc != ESR_SUCCESS && rc != ESR_NO_MATCH_ERROR) {
116 /* no match will revert to default data already in static array */
117 PLogError(L("Error reading %s from session: %s"), parname, ESR_rc2str(rc));
118 return ESR_FATAL_ERROR;
119 }
120 else if (rc == ESR_SUCCESS) {
121 CHKLOG(rc, IntArrayListGetSize(intList, &size));
122 if(size != reqSize) {
123 PLogError(L("Error reading %s from session, expected len %d: %s"), parname, reqSize, ESR_rc2str(rc));
124 return ESR_FATAL_ERROR;
125 }
126 if(reqSize == 1)
127 CHKLOG(rc, IntArrayListGet(intList, 0, parvalue));
128 else {
129 for (i=0; i<size; ++i)
130 CHKLOG(rc, IntArrayListGet(intList, i, &parvalue[i]));
131 }
132 }
133 }
134 return ESR_SUCCESS;
135 CLEANUP:
136 return rc;
137 }
138
/**
 * Initializes a channel-normalization state block.
 *
 * Compiled-in defaults are loaded first.  The cmn/tmn tables are selected by
 * CREC.Frontend.samplerate: the "_8" tables for 8000, the "_11" tables for any
 * other rate and when no session exists.  Any CREC.Frontend.swicms.*
 * overrides present in the session are then applied.
 *
 * @param swicms state to initialize
 * @return 0 on success, otherwise the ESR_ReturnCode of the failing step
 *
 * NOTE(review): swicms->_prep is not assigned here although swicms_get_cmn()
 * and swicms_set_cmn() compare it against NULL; presumably the caller
 * zero-fills the structure before calling this function -- confirm.
 */
int swicms_init(swicms_norm_info* swicms)
{
  ESR_ReturnCode    rc = ESR_SUCCESS;
  size_t            i;
  ESR_BOOL          exists, sessionExists;
  size_t            sample_rate;
  const imeldata*   default_cmn;
  const imeldata*   default_tmn;

  /* defaults */
  swicms->sbindex = SWICMS_SBINDEX_DEFAULT;
  swicms->cached_num_frames = 0;
  swicms->forget_factor = SWICMS_FORGET_FACTOR_DEFAULT;
  swicms->cache_resolution = SWICMS_CACHE_RESOLUTION_DEFAULT;
  swicms->num_frames_in_cmn = 0;

  CHKLOG(rc, ESR_SessionExists(&sessionExists));

  if (sessionExists)
  { /* We'll assume this rate is valid or someone else will be complaining.   SteveR */
    rc = ESR_SessionGetSize_t(L("CREC.Frontend.samplerate"), &sample_rate);
    if (rc != ESR_SUCCESS)
      return rc;
  }
  else
    sample_rate = 11025;

  /* init the data structures by copying the static data so that we can have a copy if we need to reset */
  if (sample_rate == 8000)
  {
    default_cmn = gswicms_cmn1_8;
    default_tmn = gswicms_tmn1_8;
  }
  else
  {
    default_cmn = gswicms_cmn1_11;
    default_tmn = gswicms_tmn1_11;
  }
  for (i = 0; i < MAX_CHAN_DIM; i++)
  {
    swicms->cmn[i] = default_cmn[i];
    swicms->tmn[i] = default_tmn[i];
    swicms->lda_cmn[i] = 0; /* calculated by swicms_lda_process() */
    swicms->lda_tmn[i] = 0; /* calculated by swicms_lda_process() */
  }

  /* sessionExists was already established above; no need to query it again */
  if (sessionExists)
  {
    const LCHAR* parname = L("CREC.Frontend.swicms.debug");
    CHKLOG(rc, ESR_SessionContains(parname, &exists));
    if (exists) {
      rc = ESR_SessionGetBool(parname, &SWICMS_DEBUG);
      if (rc != ESR_SUCCESS && rc != ESR_NO_MATCH_ERROR) {
        PLOG_DBG_ERROR((L("Error reading %s from session: %s"), parname, ESR_rc2str(rc)));
        return rc;
      }
    }

    rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.forget_factor"),
                          &swicms->forget_factor, 1);
    if (rc != ESR_SUCCESS) return rc;

    rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.sbindex"),
                          &swicms->sbindex, 1);
    if (rc != ESR_SUCCESS) return rc;

    rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.cmn"),
                          &swicms->cmn[0], MAX_CHAN_DIM);
    if (rc != ESR_SUCCESS) return rc;

    /* a sample-rate specific entry, when present, wins over the generic one */
    if (sample_rate == 8000)
      rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.cmn8"), &swicms->cmn[0], MAX_CHAN_DIM);
    else
      rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.cmn11"), &swicms->cmn[0], MAX_CHAN_DIM);
    if (rc != ESR_SUCCESS) return rc;

    rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.tmn"),
                          &swicms->tmn[0], MAX_CHAN_DIM);
    if (rc != ESR_SUCCESS) return rc;
  }

  swicms->is_valid = 0;
  for (i = 0; i < MAX_CHAN_DIM; i++)
    swicms->adjust[i] = 255;

#ifdef SREC_ENGINE_VERBOSE_LOGGING
  PLogMessage("swicms->forget_factor    = %d\n", swicms->forget_factor);
  PLogMessage("swicms->cache_resolution = %d\n", swicms->cache_resolution);
  PLogMessage("swicms->sbindex          = %d\n", swicms->sbindex);
#endif

  /* in-utt cms parameters */
  swicms->inutt.forget_factor2 = SWICMS_INUTT_FORGET_FACTOR2_DEFAULT;
  swicms->inutt.disable_after  = SWICMS_INUTT_DISABLE_AFTER_FRAMES;
  swicms->inutt.enable_after   = SWICMS_INUTT_ENABLE_AFTER_FRAMES; /* in-utt is less reliable */
  swicms->inutt.num_bou_frames_to_skip = 20; /* silence frames! see windback */
  swicms->inutt.num_frames_since_bou = 0;
  swicms->inutt.num_frames_in_accum = 0;
  for (i = 0; i < MAX_CHAN_DIM; i++)
    swicms->inutt.accum[i] = 0;

  if (sessionExists) {
    rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.inutt.forget_factor2"),
                          &swicms->inutt.forget_factor2, 1);
    if (rc != ESR_SUCCESS) return rc;

    rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.inutt.disable_after"),
                          &swicms->inutt.disable_after, 1);
    if (rc != ESR_SUCCESS) return rc;

    rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.inutt.enable_after"),
                          &swicms->inutt.enable_after, 1);
    if (rc != ESR_SUCCESS) return rc;

    /* we need to estimate the in-utt cmn from speech frames only! so let's
       make sure to skip some frames before collecting data.  This lookup is
       best-effort: a failed query must not reuse the stale 'exists' left over
       from the debug-parameter query above, so reset it first. */
    exists = ESR_FALSE;
    if (ESR_SessionContains(L("CREC.Frontend.start_windback"), &exists) == ESR_SUCCESS
        && exists) {
      ESR_BOOL do_skip_even_frames = ESR_TRUE;
      ESR_SessionGetBool(L("CREC.Frontend.do_skip_even_frames"), &do_skip_even_frames);
      ESR_SessionGetInt(L("CREC.Frontend.start_windback"), &swicms->inutt.num_bou_frames_to_skip);
      if (do_skip_even_frames)
        swicms->inutt.num_bou_frames_to_skip /= 2;
      swicms->inutt.num_bou_frames_to_skip -= 5; /* ensure spch frames only */
    }
  }

  return 0;
CLEANUP:
  return rc;
}
283
284
swicms_get_cmn(swicms_norm_info * swicms,LCHAR * cmn_params,size_t * len)285 ESR_ReturnCode swicms_get_cmn ( swicms_norm_info* swicms, LCHAR *cmn_params, size_t* len )
286 {
287 int dim_count;
288 int i;
289 imeldata temp[MAX_CHAN_DIM];
290 const size_t INT_LENGTH = 12;
291
292 if ( swicms->_prep != NULL ) /* lda exists give them transformed lda. */
293 {
294 for ( dim_count = 0; dim_count < MAX_CHAN_DIM; dim_count++ )
295 temp [dim_count] = swicms->lda_cmn [dim_count];
296 inverse_transform_frame( swicms->_prep, temp, 1 /*do_shift*/);
297 }
298 else /* lda does not exist give them raw cmn values */
299 {
300 for ( dim_count = 0; dim_count < MAX_CHAN_DIM; dim_count++ )
301 temp [dim_count] = swicms->cmn [dim_count];
302 }
303
304 for ( dim_count = 0, i = 0; dim_count < MAX_CHAN_DIM; dim_count++ )
305 {
306 i += sprintf( cmn_params + i, dim_count==0 ? "%d" : ",%d", temp [dim_count] );
307 if (i + INT_LENGTH >= *len) {
308 *len = MAX_CHAN_DIM * (INT_LENGTH + 2) * sizeof(LCHAR);
309 return ESR_BUFFER_OVERFLOW;
310 }
311 }
312
313 return ESR_SUCCESS;
314 }
315
316
swicms_set_cmn(swicms_norm_info * swicms,const char * cmn_params)317 ESR_ReturnCode swicms_set_cmn ( swicms_norm_info* swicms, const char *cmn_params )
318 {
319 ESR_ReturnCode set_status;
320 int length_of_params;
321 int dim_count;
322 int got_word;
323 int current_position;
324 char *copy_of_params;
325 char *parsed_strings [MAX_CHAN_DIM];
326 int temp_cmn [MAX_CHAN_DIM];
327
328 length_of_params = strlen ( cmn_params ) + 1;
329 copy_of_params = (char*)MALLOC ( length_of_params, NULL );
330
331 if ( copy_of_params != NULL )
332 {
333 set_status = ESR_SUCCESS;
334 memcpy ( copy_of_params, cmn_params, length_of_params );
335 dim_count = 0;
336 current_position = 0;
337 got_word = 0;
338 parsed_strings [dim_count] = copy_of_params + current_position;
339
340 while ( ( dim_count < MAX_CHAN_DIM ) && ( set_status == ESR_SUCCESS ) )
341 {
342 switch ( *( copy_of_params + current_position ) )
343 {
344 case '\0':
345 if ( got_word == 1 )
346 {
347 if ( dim_count == ( MAX_CHAN_DIM - 1 ) )
348 dim_count++;
349 else
350 {
351 PLogError ( "Channel Normalization : Missing Params Must Contain %d Params\n", MAX_CHAN_DIM );
352 set_status = ESR_INVALID_ARGUMENT;
353 }
354 }
355 else
356 {
357 PLogError ( "Channel Normalization : Missing Params Mus Contain %d Params\n", MAX_CHAN_DIM );
358 set_status = ESR_INVALID_ARGUMENT;
359 }
360 break;
361
362 case ',':
363 if ( got_word == 1 )
364 {
365 if ( dim_count < ( MAX_CHAN_DIM - 1 ) )
366 {
367 dim_count++;
368 *( copy_of_params + current_position) = '\0';
369 current_position++;
370
371 if ( current_position == length_of_params )
372 {
373 PLogError ( "Channel Normalization : Delimiter At End Of Param String\n" );
374 set_status = ESR_INVALID_ARGUMENT;
375 }
376 parsed_strings [dim_count] = copy_of_params + current_position;
377 got_word = 0;
378 }
379 else
380 {
381 PLogError ( "Channel Normalization : Too Many Params Must Contain %d Params\n", MAX_CHAN_DIM );
382 set_status = ESR_INVALID_ARGUMENT;
383 }
384 }
385 else
386 {
387 PLogError ( "Channel Normalization : Too Many Params Must Contain %d Params\n", MAX_CHAN_DIM );
388 set_status = ESR_INVALID_ARGUMENT;
389 }
390 break;
391
392 case '0':
393 case '1':
394 case '2':
395 case '3':
396 case '4':
397 case '5':
398 case '6':
399 case '7':
400 case '8':
401 case '9':
402 got_word = 1;
403 current_position++;
404
405 if ( current_position == length_of_params )
406 {
407 PLogError ( "Channel Normalization : Too Many Params Must Contain %d Params\n", MAX_CHAN_DIM );
408 set_status = ESR_INVALID_ARGUMENT;
409 }
410 break;
411
412 default:
413 PLogError ( "Channel Normalization : Invalid Param : %c : Params Must Contain Only Digits\n" );
414 set_status = ESR_INVALID_ARGUMENT;
415 break;
416 }
417 }
418 if ( set_status == ESR_SUCCESS )
419 {
420 dim_count = 0;
421
422 while ( ( dim_count < MAX_CHAN_DIM ) && ( set_status == ESR_SUCCESS ) )
423 {
424 temp_cmn [dim_count] = atoi ( parsed_strings [dim_count] );
425
426 if ( ( temp_cmn [dim_count] < 0 ) || ( temp_cmn [dim_count] > 255 ) )
427 {
428 set_status = ESR_INVALID_ARGUMENT;
429 }
430 }
431 if ( set_status == ESR_SUCCESS )
432 {
433 for ( dim_count = 0; dim_count < MAX_CHAN_DIM; dim_count++ )
434 swicms->cmn [dim_count] = temp_cmn [dim_count];
435 if ( swicms->_prep != NULL ) /* Set now if NULL it will automatically be set on first utterance */
436 linear_transform_frame(swicms->_prep, swicms->lda_cmn, 1 /*do_shift*/);
437 }
438 }
439 FREE ( copy_of_params );
440 }
441 else
442 {
443 PLogError ( "Channel Normalization Out Of Memory Error\n" );
444 set_status = ESR_OUT_OF_MEMORY;
445 }
446 swicms->num_frames_in_cmn = 0;
447 return ( set_status );
448 }
449
450
/**
 * Adds one frame to the running per-section sums kept in
 * swicms->cached_sections.
 *
 * Frames are grouped into sections of cache_resolution consecutive frames.
 * The first frame of a section clears that section's sums before being added.
 * Once SWICMS_CACHE_SIZE_DEFAULT sections are full, further frames are
 * silently ignored.
 *
 * @param swicms normalization state
 * @param frame  MAX_CHAN_DIM values to accumulate
 * @param dimen  must equal MAX_CHAN_DIM (asserted)
 * @return always 0
 */
int swicms_cache_frame(swicms_norm_info* swicms, imeldata* frame, int dimen)
{
  int section_index;
  int k;
  imeldata* section;

  ASSERT(dimen == MAX_CHAN_DIM);

  section_index = swicms->cached_num_frames / swicms->cache_resolution;
  if (section_index >= SWICMS_CACHE_SIZE_DEFAULT)
    return 0; /* cache is full: drop the frame */

  section = swicms->cached_sections[section_index];

  /* first frame of a new section: start its sums from zero */
  if (swicms->cached_num_frames % swicms->cache_resolution == 0)
  {
    for (k = 0; k < MAX_CHAN_DIM; k++)
      section[k] = 0;
  }

  for (k = 0; k < MAX_CHAN_DIM; k++)
    section[k] += frame[k];

  swicms->cached_num_frames++;
  return 0;
}
473
/**
 * Normalizes one frame: oframe[k] = MAKEBYTE(iframe[k] + adjust[k]).
 *
 * When the in-utterance estimator is active (forget_factor2 differs from
 * SWICMS_INUTT_FORGET_FACTOR2_DISABLE) the frame may first be folded into the
 * in-utterance accumulator, and adjust[] refreshed from it, before the
 * adjustment is applied.
 *
 * @param swicms normalization state
 * @param oframe receives dimen normalized values
 * @param iframe dimen input values
 * @param dimen  must equal MAX_CHAN_DIM (asserted)
 * @return always 0
 */
int apply_channel_normalization_in_swicms(swicms_norm_info *swicms,
    imeldata* oframe,
    imeldata* iframe, int dimen)
{
  int k;
  ASSERT(dimen == MAX_CHAN_DIM);

  /* in-utt estimator switched on at all? */
  if (swicms->inutt.forget_factor2 != SWICMS_INUTT_FORGET_FACTOR2_DISABLE)
  {
    /* collect only while not yet disabled (x-utt more reliable) and only
       after the leading silence frames have been skipped */
    if (swicms->inutt.num_frames_in_accum < swicms->inutt.disable_after
        && swicms->inutt.num_frames_since_bou >= swicms->inutt.num_bou_frames_to_skip)
    {
      swicms->inutt.num_frames_in_accum++;
      for (k = 0; k < dimen; k++)
        swicms->inutt.accum[k] += iframe[k];

      /* enough (presumably speech) frames seen: update the adjustment
         in-line with the current utterance */
      if (swicms->inutt.num_frames_in_accum > swicms->inutt.enable_after)
      {
        const imeldata denom = (swicms->inutt.forget_factor2
                                + swicms->inutt.num_frames_in_accum);
        for (k = 0; k < dimen; k++)
        {
          /* rounded weighted average of the old lda_cmn and the new accum */
          imeldata blended = (swicms->lda_cmn[k] * swicms->inutt.forget_factor2
                              + swicms->inutt.accum[k] + denom / 2) / denom;
          swicms->adjust[k] = swicms->lda_tmn[k] - blended;
        }
      }
    }
    swicms->inutt.num_frames_since_bou++;
  }

  for (k = 0; k < dimen; k++)
    oframe[k] = MAKEBYTE(iframe[k] + swicms->adjust[k]);
  return 0;
}
511
/**
 * End-of-utterance update of the cross-utterance channel mean.
 *
 * The frames cached by swicms_cache_frame() are split into a speech part
 * (sections covering speech_start..speech_end) and a background part (all
 * other complete sections).  Their per-dimension averages are mixed according
 * to sbindex (100 = speech only) and blended into lda_cmn with weight
 * forget_factor; adjust[] is then recomputed as lda_tmn - lda_cmn.  The frame
 * cache and the in-utterance frame counter are reset on every call.
 *
 * @param swicms       normalization state
 * @param speech_start first speech frame
 * @param speech_end   end of speech; 0 or MAXframeID means "unknown"
 * @return 0 when the update was applied or there were no usable frames,
 *         1 when the speech bounds are unusable or the utterance mean was
 *         rejected as bad
 */
int swicms_update(swicms_norm_info* swicms, int speech_start, int speech_end)
{
  int i, j;
  asr_int32_t avg[MAX_CHAN_DIM];
  int ff;
  int nn, speech_nn, backgr_nn;
  int num_frames = swicms->cached_num_frames;
  int cache_start, cache_end, backgr_cache_end;
  int sbindex = swicms->sbindex;

  /* init for utterance */
  swicms->inutt.num_frames_since_bou = 0;

  swicms->cached_num_frames = 0;

  /* convert the frame bounds to (rounded down) cache-section indices */
  cache_start = speech_start;
  cache_start -= (cache_start % swicms->cache_resolution);
  cache_start /= swicms->cache_resolution;

  if (speech_end == MAXframeID)
  {
    cache_end = SWICMS_CACHE_SIZE_DEFAULT;
  }
  else
  {
    if (speech_end < num_frames)
      cache_end = speech_end;
    else
      cache_end = num_frames;
    cache_end -= (cache_end % swicms->cache_resolution);
    cache_end /= swicms->cache_resolution;
  }

  if (num_frames == 0 || speech_end == 0 || speech_start == speech_end || speech_end == MAXframeID)
  {
    if (speech_end != 0 || speech_start != 0)
      PLogError("Warning: speech_bounds (%d,%d) swicms->cached_num_frames (%d)\n",
                speech_start, speech_end, num_frames);
    return 1;
  }

  backgr_cache_end = (num_frames - num_frames % swicms->cache_resolution) / swicms->cache_resolution;

  speech_nn = (cache_end - cache_start) * swicms->cache_resolution;
  backgr_nn = backgr_cache_end * swicms->cache_resolution - speech_nn;

  /* Nothing usable unless speech_nn >= 0, backgr_nn >= 0 and at least one of
     them is positive.  Decided before any cache section is read. */
  if (speech_nn < 0 || backgr_nn < 0 || (speech_nn == 0 && backgr_nn == 0))
    return 0;

  for (i = 0; i < MAX_CHAN_DIM; i++)
  {
    asr_int32_t speech_avg = 0;
    asr_int32_t backgr_avg = 0;

    for (j = cache_start; j < cache_end; j++)
      speech_avg += swicms->cached_sections[j][i];
    for (j = 0; j < cache_start; j++)
      backgr_avg += swicms->cached_sections[j][i];
    for (j = cache_end; j < backgr_cache_end; j++)
      backgr_avg += swicms->cached_sections[j][i];

    /* When one class has no frames, the other class stands in for it.  The
       frame counts are deliberately NOT modified inside this loop: doing so
       made every dimension after the first take the "both present" path and
       divide an empty sum, yielding an average of 0. */
    if (speech_nn == 0)
    {
      backgr_avg /= backgr_nn;
      speech_avg = backgr_avg;
    }
    else if (backgr_nn == 0)
    {
      speech_avg /= speech_nn;
      backgr_avg = speech_avg;
    }
    else
    {
      speech_avg /= speech_nn;
      backgr_avg /= backgr_nn;
    }

    avg[i] = (sbindex * speech_avg + (100 - sbindex) * backgr_avg + 50) / 100;
  }

  /* apply the same substitution to the frame counts, once */
  if (speech_nn == 0)
    speech_nn = backgr_nn;
  else if (backgr_nn == 0)
    backgr_nn = speech_nn;

  nn = (sbindex * speech_nn + (100 - sbindex) * backgr_nn + 50) / 100;

  for (i = 0, ff = 0; i < MAX_CHAN_DIM; i++)
  {
    ff += (swicms->lda_tmn[i] - avg[i]);
  }
  ff /= MAX_CHAN_DIM; /* sum is now the average offset from TMN */
  if (ff > 5)
  {
    PLogError("Warning: bad utt mean during swicms_update() (moffs=%d)\n", ff);
    return 1;
  }

  ff = swicms->forget_factor;
  if (ff < 9999)
  {
    for (i = 0; i < MAX_CHAN_DIM; i++)
    {
      /* rounded weighted average: ff frames of history vs nn new frames */
      swicms->lda_cmn[i] = (swicms->lda_cmn[i] * ff + avg[i] * nn + (ff + nn) / 2) / (ff + nn);
      swicms->adjust[i] = swicms->lda_tmn[i] - swicms->lda_cmn[i];
    }
  }

  if (SWICMS_DEBUG)
  {
    imeldata temp[MAX_CHAN_DIM];
    PLogMessage("swicms_update() used %d frames (%d-%d)", nn, speech_start, speech_end);

    for (i = 0; i < MAX_CHAN_DIM; i++)
      temp[i] = swicms->lda_cmn[i];
    inverse_transform_frame(swicms->_prep, temp, 1 /*do_shift*/);
    /* use this dump, to put back into CREC.Frontend.swicms.cmn */
    printf_vector("swicms.cmn(r) ", " %d", temp, MAX_CHAN_DIM);
  }

  swicms->num_frames_in_cmn += nn;
  return 0;
}
641
/**
 * Attaches the front-end transform to the normalization state.
 *
 * Copies tmn/cmn into lda_tmn/lda_cmn, runs both through
 * linear_transform_frame(), recomputes adjust[] as lda_tmn - lda_cmn, marks
 * the state valid and remembers prep in swicms->_prep.  When SWICMS_DEBUG is
 * set, the raw, transformed and round-tripped cmn vectors are logged.
 *
 * @param swicms normalization state
 * @param prep   transform handed to linear_transform_frame()
 * @return always 0
 */
int swicms_lda_process(swicms_norm_info* swicms, preprocessed* prep)
{
  int dim;

  for (dim = 0; dim < MAX_CHAN_DIM; dim++)
  {
    swicms->lda_tmn[dim] = swicms->tmn[dim];
    swicms->lda_cmn[dim] = swicms->cmn[dim];
  }
  linear_transform_frame(prep, swicms->lda_tmn, 1 /*do_shift*/);
  linear_transform_frame(prep, swicms->lda_cmn, 1 /*do_shift*/);

  for (dim = 0; dim < MAX_CHAN_DIM; dim++)
    swicms->adjust[dim] = swicms->lda_tmn[dim] - swicms->lda_cmn[dim];

  swicms->is_valid = 1;
  swicms->_prep = prep;

  if (SWICMS_DEBUG)
  {
    imeldata roundtrip[MAX_CHAN_DIM];

    printf_vector("swicms->cmn ", " %d", swicms->cmn, MAX_CHAN_DIM);
    printf_vector("swicms->lda_cmn ", " %d", swicms->lda_cmn, MAX_CHAN_DIM);

    /* map lda_cmn back through the inverse transform and log the result */
    for (dim = 0; dim < MAX_CHAN_DIM; dim++)
      roundtrip[dim] = swicms->lda_cmn[dim];
    inverse_transform_frame(swicms->_prep, roundtrip, 1 /*do_shift*/);
    printf_vector("swicms->cmn(r) ", " %d", roundtrip, MAX_CHAN_DIM);
  }
  return 0;
}
680
681
682
683