1 /*---------------------------------------------------------------------------*
2 * swicms.c *
3 * *
4 * Copyright 2007, 2008 Nuance Communications, Inc. *
5 * *
6 * Licensed under the Apache License, Version 2.0 (the 'License'); *
7 * you may not use this file except in compliance with the License. *
8 * *
9 * You may obtain a copy of the License at *
10 * http://www.apache.org/licenses/LICENSE-2.0 *
11 * *
12 * Unless required by applicable law or agreed to in writing, software *
13 * distributed under the License is distributed on an 'AS IS' BASIS, *
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
15 * See the License for the specific language governing permissions and *
16 * limitations under the License. *
17 * *
18 *---------------------------------------------------------------------------*/
19
20 #include <string.h>
21 #include"swicms.h"
22 #include"srec_sizes.h"
23 #include"prelib.h"
24
25 #include "passert.h"
26 #include "ESR_Session.h"
27 #include "ESR_SessionType.h"
28 #include "IntArrayList.h"
29 #include "portable.h"
30
/*
 * Debug helper: emits one PLogMessage() line made of HEAD, the address of
 * PTR and the first NN elements of PTR, each formatted with FMT.
 *
 * HEAD is printed through "%s" (truncated to 64 characters) rather than being
 * used as a format string, and elements stop being appended once fewer than
 * 32 characters of room remain in the 256-element scratch buffer, so the
 * buffer cannot be overrun as long as one formatted element stays below 32
 * characters (true for the " %d" formats used in this file).
 * The do/while(0) wrapper makes the macro behave as a single statement.
 */
#define printf_vector(HEAD, FMT, PTR, NN) \
  do { \
    LCHAR pv_buffer_[256]; \
    int pv_index_; \
    sprintf(pv_buffer_, "%.64s %p", HEAD, (void *)(PTR)); \
    for (pv_index_ = 0; pv_index_ < (NN); ++pv_index_) { \
      if (LSTRLEN(pv_buffer_) + 32 >= sizeof(pv_buffer_) / sizeof(pv_buffer_[0])) \
        break; /* out of room: drop the remaining elements */ \
      sprintf(pv_buffer_ + LSTRLEN(pv_buffer_), FMT, (PTR)[pv_index_]); \
    } \
    PLogMessage("%s", pv_buffer_); \
  } while (0)
32
33 /* Cross-utterance CMN calculation:
34 We try to normalize the speech frames before they get to the recognizer.
35 The speech frames are LDA-processed mfcc-with-dynamic feature vectors.
36 We collect these speech frames during recognition. At the end of
37 recognition we exclude the silence frames from the collected data, and
38 generate a new channel average based on the previous average and the new
39 data, using an exponential decay formula.
40
41 In-utterance CMN calculation:
42 A new short-term average mechanism was introduced, with faster update,
43 to improve recognition on the very first recognition after init or reset.
44 We wait for a minimum number of new data frames to apply this. We also
45 disable the fast updater after some frames, because we assume the
46 cross-utterance estimator to be more reliable, particularly in its
47 ability to exclude silence frames from the calculation.
48 */
49
50 /* default settings for cross-utterance cms */
51 #define SWICMS_FORGET_FACTOR_DEFAULT 400 /* effective frms of history */
52 #define SWICMS_SBINDEX_DEFAULT 100 /* use speech frames only */
53 /* #define SWICMS_CACHE_RESOLUTION_DEFAULT see swicms.h */
54 /* #define SWICMS_CACHE_SIZE_DEFAULT see swicms.h */
55
56 /* default settings for in-utterance cms */
57 #define SWICMS_INUTT_FORGET_FACTOR2_DISABLE 65535 /* any large number */
58 #define SWICMS_INUTT_FORGET_FACTOR2_DEFAULT SWICMS_INUTT_FORGET_FACTOR2_DISABLE
59 /* disable this when cross-utt become more reliable */
60 #define SWICMS_INUTT_DISABLE_AFTER_FRAMES 200
61 /* wait while the estimate is poor */
62 #define SWICMS_INUTT_ENABLE_AFTER_FRAMES 10
63
64 /**
65 * Logging Stuff
66 */
67 #define LOG_LEVEL 2
68 #define MODULE_NAME L("swicms.c")
69 //static const char* MTAG = MODULE_NAME;
70
71 static const char *rcsid = 0 ? (const char *) &rcsid :
72 "$Id: swicms.c,v 1.21.6.16 2008/06/05 19:00:55 stever Exp $";
73
74 static ESR_BOOL SWICMS_DEBUG = ESR_FALSE;
75
76 /* these are good values from cmn/tmn files */
77 static const imeldata gswicms_cmn1_8 [MAX_CHAN_DIM] =
78 {
79 158, 141, 99, 125, 101, 162, 113, 138, 128, 143, 123, 141,
80 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
81 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127
82 };
83
84 static const imeldata gswicms_cmn1_11 [MAX_CHAN_DIM] =
85 {
86 163, 121, 120, 114, 124, 139, 144, 108, 150, 119, 146, 124,
87 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
88 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127
89 };
90
91 static const imeldata gswicms_tmn1_8 [MAX_CHAN_DIM] =
92 {
93 108, 138, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
94 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
95 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127
96 };
97
98 static const imeldata gswicms_tmn1_11 [MAX_CHAN_DIM] =
99 {
100 108, 138, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
101 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
102 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127, 127
103 };
104
GetSomeIntsIfAny(const LCHAR * parname,imeldata * parvalue,size_t reqSize)105 static ESR_ReturnCode GetSomeIntsIfAny( const LCHAR* parname, imeldata* parvalue, size_t reqSize)
106 {
107 size_t i, size;
108 ESR_ReturnCode rc;
109 ESR_BOOL exists;
110 IntArrayList* intList = 0;
111
112 CHKLOG(rc, ESR_SessionContains(parname, &exists));
113 if (exists) {
114 rc = ESR_SessionGetProperty(parname, (void**)&intList, TYPES_INTARRAYLIST);
115 if (rc != ESR_SUCCESS && rc != ESR_NO_MATCH_ERROR) {
116 /* no match will revert to default data already in static array */
117 PLogError(L("Error reading %s from session: %s"), parname, ESR_rc2str(rc));
118 return ESR_FATAL_ERROR;
119 }
120 else if (rc == ESR_SUCCESS) {
121 CHKLOG(rc, IntArrayListGetSize(intList, &size));
122 if(size != reqSize) {
123 PLogError(L("Error reading %s from session, expected len %d: %s"), parname, reqSize, ESR_rc2str(rc));
124 return ESR_FATAL_ERROR;
125 }
126 if(reqSize == 1)
127 CHKLOG(rc, IntArrayListGet(intList, 0, parvalue));
128 else {
129 for (i=0; i<size; ++i)
130 CHKLOG(rc, IntArrayListGet(intList, i, &parvalue[i]));
131 }
132 }
133 }
134 return ESR_SUCCESS;
135 CLEANUP:
136 return rc;
137 }
138
/**
 * Initialises a channel-normalization state: stores the built-in defaults
 * and then applies any overrides found in the ESR session.
 *
 * Cross-utterance settings (forget_factor, sbindex, cmn, tmn) and the
 * in-utterance settings (forget_factor2, disable_after, enable_after) can be
 * overridden through "CREC.Frontend.swicms.*" session parameters.  The number
 * of beginning-of-utterance frames to skip is derived from
 * "CREC.Frontend.start_windback" when that parameter is present.
 *
 * lda_cmn / lda_tmn are zeroed here and is_valid is cleared; they are
 * calculated later by swicms_lda_process().
 *
 * NOTE(review): swicms->_prep is not initialised here although
 * swicms_get_cmn() and swicms_set_cmn() compare it against NULL - confirm
 * that callers hand in a zero-initialised structure.
 *
 * @param swicms state to initialise
 * @return 0 on success, otherwise the ESR_ReturnCode of the failing session
 *         access
 */
int swicms_init(swicms_norm_info* swicms)
{
  ESR_ReturnCode rc = ESR_SUCCESS;
  size_t i;
  ESR_BOOL exists, sessionExists;
  size_t sample_rate;
  const imeldata* default_cmn;
  const imeldata* default_tmn;
  const int default_bou_frames_to_skip = 20; /* silence frames! see windback */

  /* defaults */
  swicms->sbindex = SWICMS_SBINDEX_DEFAULT;
  swicms->cached_num_frames = 0;
  swicms->forget_factor = SWICMS_FORGET_FACTOR_DEFAULT;
  swicms->cache_resolution = SWICMS_CACHE_RESOLUTION_DEFAULT;
  swicms->num_frames_in_cmn = 0;

  CHKLOG(rc, ESR_SessionExists(&sessionExists));

  if (sessionExists)
  { /* We'll assume this rate is valid or someone else will be complaining. SteveR */
    rc = ESR_SessionGetSize_t ( L ( "CREC.Frontend.samplerate" ), &sample_rate );

    if ( rc != ESR_SUCCESS )
      return ( rc );
  }
  else
    sample_rate = 11025;

  /* init the data structures by copying the static data so that we can have
     a copy if we need to reset */
  if ( sample_rate == 8000 )
  {
    default_cmn = gswicms_cmn1_8;
    default_tmn = gswicms_tmn1_8;
  }
  else
  {
    default_cmn = gswicms_cmn1_11;
    default_tmn = gswicms_tmn1_11;
  }
  for ( i = 0; i < MAX_CHAN_DIM; i++ )
  {
    swicms->cmn [i] = default_cmn [i];
    swicms->tmn [i] = default_tmn [i];
    swicms->lda_cmn [i] = 0; /* calculated by swicms_lda_process() */
    swicms->lda_tmn [i] = 0; /* calculated by swicms_lda_process() */
  }

  if (sessionExists)
  {
    const LCHAR* parname = L("CREC.Frontend.swicms.debug");
    CHKLOG(rc, ESR_SessionContains(parname, &exists));
    if (exists) {
      rc = ESR_SessionGetBool(parname, &SWICMS_DEBUG);
      if (rc != ESR_SUCCESS && rc != ESR_NO_MATCH_ERROR) {
        PLOG_DBG_ERROR((L("Error reading %s from session: %s"), parname, ESR_rc2str(rc)));
        return rc;
      }
    }

    rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.forget_factor"),
                           &swicms->forget_factor, 1);
    if(rc != ESR_SUCCESS) return rc;

    rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.sbindex"),
                           &swicms->sbindex, 1);
    if(rc != ESR_SUCCESS) return rc;

    rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.cmn"),
                           &swicms->cmn[0], MAX_CHAN_DIM);
    if(rc != ESR_SUCCESS) return rc;

    /* a sample-rate specific cmn takes precedence over the generic one */
    if ( sample_rate == 8000 )
      rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.cmn8"), &swicms->cmn[0], MAX_CHAN_DIM);
    else
      rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.cmn11"), &swicms->cmn[0], MAX_CHAN_DIM);
    if(rc != ESR_SUCCESS) return rc;

    rc = GetSomeIntsIfAny( L("CREC.Frontend.swicms.tmn"),
                           &swicms->tmn[0], MAX_CHAN_DIM);
    if(rc != ESR_SUCCESS) return rc;
  }

  swicms->is_valid = 0;
  for (i = 0; i < MAX_CHAN_DIM; i++)
    swicms->adjust[i] = 255;

#ifdef SREC_ENGINE_VERBOSE_LOGGING
  PLogMessage("swicms->forget_factor    = %d\n", swicms->forget_factor);
  PLogMessage("swicms->cache_resolution = %d\n", swicms->cache_resolution);
  PLogMessage("swicms->sbindex          = %d\n", swicms->sbindex);
#endif

  /* in-utt cms parameters */
  swicms->inutt.forget_factor2 = SWICMS_INUTT_FORGET_FACTOR2_DEFAULT;
  swicms->inutt.disable_after  = SWICMS_INUTT_DISABLE_AFTER_FRAMES;
  swicms->inutt.enable_after   = SWICMS_INUTT_ENABLE_AFTER_FRAMES; /* in-utt is less reliable */
  swicms->inutt.num_bou_frames_to_skip = default_bou_frames_to_skip;
  swicms->inutt.num_frames_since_bou = 0;
  swicms->inutt.num_frames_in_accum = 0;
  for (i = 0; i < MAX_CHAN_DIM; i++)
    swicms->inutt.accum[i] = 0;

  if (sessionExists) {
    rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.inutt.forget_factor2"),
                          &swicms->inutt.forget_factor2, 1);
    if(rc != ESR_SUCCESS) return rc;

    rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.inutt.disable_after"),
                          &swicms->inutt.disable_after, 1);
    if(rc != ESR_SUCCESS) return rc;

    rc = GetSomeIntsIfAny(L("CREC.Frontend.swicms.inutt.enable_after"),
                          &swicms->inutt.enable_after, 1);
    if(rc != ESR_SUCCESS) return rc;

    /* we need to estimate the in-utt cmn from speech frames only! so let's
       make sure to skip some frames before collecting data.  This part is
       best effort: a failure keeps the default and does not fail init. */
    exists = ESR_FALSE; /* never act on the stale result of an earlier lookup */
    rc = ESR_SessionContains(L("CREC.Frontend.start_windback"), &exists);
    if (rc == ESR_SUCCESS && exists) {
      ESR_BOOL do_skip_even_frames = ESR_TRUE;

      /* optional parameter: when it cannot be read the default (ESR_TRUE) stays */
      (void) ESR_SessionGetBool(L("CREC.Frontend.do_skip_even_frames"), &do_skip_even_frames);

      rc = ESR_SessionGetInt(L("CREC.Frontend.start_windback"),
                             &swicms->inutt.num_bou_frames_to_skip);
      if (rc == ESR_SUCCESS) {
        if (do_skip_even_frames)
          swicms->inutt.num_bou_frames_to_skip /= 2;
        swicms->inutt.num_bou_frames_to_skip -= 5; /* ensure spch frames only */
      }
      else {
        /* do not derive a skip count from a value that was never read */
        PLogError(L("Error reading %s from session: %s"),
                  L("CREC.Frontend.start_windback"), ESR_rc2str(rc));
        swicms->inutt.num_bou_frames_to_skip = default_bou_frames_to_skip;
      }
    }
  }

  return 0;
CLEANUP:
  return rc;
}
283
284
swicms_get_cmn(swicms_norm_info * swicms,LCHAR * cmn_params,size_t * len)285 ESR_ReturnCode swicms_get_cmn ( swicms_norm_info* swicms, LCHAR *cmn_params, size_t* len )
286 {
287 int dim_count;
288 int i;
289 imeldata temp[MAX_CHAN_DIM];
290 const size_t INT_LENGTH = 12;
291
292 if ( swicms->_prep != NULL ) /* lda exists give them transformed lda. */
293 {
294 for ( dim_count = 0; dim_count < MAX_CHAN_DIM; dim_count++ )
295 temp [dim_count] = swicms->lda_cmn [dim_count];
296 inverse_transform_frame( swicms->_prep, temp, 1 /*do_shift*/);
297 }
298 else /* lda does not exist give them raw cmn values */
299 {
300 for ( dim_count = 0; dim_count < MAX_CHAN_DIM; dim_count++ )
301 temp [dim_count] = swicms->cmn [dim_count];
302 }
303
304 for ( dim_count = 0, i = 0; dim_count < MAX_CHAN_DIM; dim_count++ )
305 {
306 i += sprintf( cmn_params + i, dim_count==0 ? "%d" : ",%d", temp [dim_count] );
307 if (i + INT_LENGTH >= *len) {
308 *len = MAX_CHAN_DIM * (INT_LENGTH + 2) * sizeof(LCHAR);
309 return ESR_BUFFER_OVERFLOW;
310 }
311 }
312
313 return ESR_SUCCESS;
314 }
315
316
swicms_set_cmn(swicms_norm_info * swicms,const char * cmn_params)317 ESR_ReturnCode swicms_set_cmn ( swicms_norm_info* swicms, const char *cmn_params )
318 {
319 ESR_ReturnCode set_status;
320 int length_of_params;
321 int dim_count;
322 int got_word;
323 int current_position;
324 char *copy_of_params;
325 char *parsed_strings [MAX_CHAN_DIM];
326 int temp_cmn [MAX_CHAN_DIM];
327
328 length_of_params = strlen ( cmn_params ) + 1;
329 copy_of_params = (char*)MALLOC ( length_of_params, NULL );
330
331 if ( copy_of_params != NULL )
332 {
333 set_status = ESR_SUCCESS;
334 memcpy ( copy_of_params, cmn_params, length_of_params );
335 dim_count = 0;
336 current_position = 0;
337 got_word = 0;
338 parsed_strings [dim_count] = copy_of_params + current_position;
339
340 while ( ( dim_count < MAX_CHAN_DIM ) && ( set_status == ESR_SUCCESS ) )
341 {
342 switch ( *( copy_of_params + current_position ) )
343 {
344 case '\0':
345 if ( got_word == 1 )
346 {
347 if ( dim_count == ( MAX_CHAN_DIM - 1 ) )
348 dim_count++;
349 else
350 {
351 PLogError ( "Channel Normalization : Missing Params Must Contain %d Params\n", MAX_CHAN_DIM );
352 set_status = ESR_INVALID_ARGUMENT;
353 }
354 }
355 else
356 {
357 PLogError ( "Channel Normalization : Missing Params Mus Contain %d Params\n", MAX_CHAN_DIM );
358 set_status = ESR_INVALID_ARGUMENT;
359 }
360 break;
361
362 case ',':
363 if ( got_word == 1 )
364 {
365 if ( dim_count < ( MAX_CHAN_DIM - 1 ) )
366 {
367 dim_count++;
368 *( copy_of_params + current_position) = '\0';
369 current_position++;
370
371 if ( current_position == length_of_params )
372 {
373 PLogError ( "Channel Normalization : Delimiter At End Of Param String\n" );
374 set_status = ESR_INVALID_ARGUMENT;
375 }
376 parsed_strings [dim_count] = copy_of_params + current_position;
377 got_word = 0;
378 }
379 else
380 {
381 PLogError ( "Channel Normalization : Too Many Params Must Contain %d Params\n", MAX_CHAN_DIM );
382 set_status = ESR_INVALID_ARGUMENT;
383 }
384 }
385 else
386 {
387 PLogError ( "Channel Normalization : Too Many Params Must Contain %d Params\n", MAX_CHAN_DIM );
388 set_status = ESR_INVALID_ARGUMENT;
389 }
390 break;
391
392 case '0':
393 case '1':
394 case '2':
395 case '3':
396 case '4':
397 case '5':
398 case '6':
399 case '7':
400 case '8':
401 case '9':
402 got_word = 1;
403 current_position++;
404
405 if ( current_position == length_of_params )
406 {
407 PLogError ( "Channel Normalization : Too Many Params Must Contain %d Params\n", MAX_CHAN_DIM );
408 set_status = ESR_INVALID_ARGUMENT;
409 }
410 break;
411
412 default:
413 PLogError ( "Channel Normalization : Invalid Param : %c : Params Must Contain Only Digits\n" );
414 set_status = ESR_INVALID_ARGUMENT;
415 break;
416 }
417 }
418 if ( set_status == ESR_SUCCESS )
419 {
420 dim_count = 0;
421
422 while ( ( dim_count < MAX_CHAN_DIM ) && ( set_status == ESR_SUCCESS ) )
423 {
424 temp_cmn [dim_count] = atoi ( parsed_strings [dim_count] );
425
426 if ( ( temp_cmn [dim_count] < 0 ) || ( temp_cmn [dim_count] > 255 ) )
427 {
428 set_status = ESR_INVALID_ARGUMENT;
429 }
430
431 dim_count++;
432 }
433 if ( set_status == ESR_SUCCESS )
434 {
435 for ( dim_count = 0; dim_count < MAX_CHAN_DIM; dim_count++ )
436 swicms->cmn [dim_count] = temp_cmn [dim_count];
437 if ( swicms->_prep != NULL ) /* Set now if NULL it will automatically be set on first utterance */
438 linear_transform_frame(swicms->_prep, swicms->lda_cmn, 1 /*do_shift*/);
439 }
440 }
441 FREE ( copy_of_params );
442 }
443 else
444 {
445 PLogError ( "Channel Normalization Out Of Memory Error\n" );
446 set_status = ESR_OUT_OF_MEMORY;
447 }
448 swicms->num_frames_in_cmn = 0;
449 return ( set_status );
450 }
451
452
/**
 * Adds one frame to the per-section running sums of the frame cache.
 *
 * Frames are grouped into sections of cache_resolution consecutive frames;
 * each section stores the element-wise sum of its frames.  A section is
 * cleared when its first frame arrives.  Once SWICMS_CACHE_SIZE_DEFAULT
 * sections are in use, further frames are ignored.
 *
 * @param swicms normalization state owning the cache
 * @param frame  frame to accumulate (MAX_CHAN_DIM values)
 * @param dimen  frame dimension; asserted to equal MAX_CHAN_DIM
 * @return always 0
 */
int swicms_cache_frame(swicms_norm_info* swicms, imeldata* frame, int dimen)
{
  int section_index;
  int dim;
  imeldata *section;

  ASSERT(dimen == MAX_CHAN_DIM);

  section_index = swicms->cached_num_frames / swicms->cache_resolution;
  if (section_index >= SWICMS_CACHE_SIZE_DEFAULT)
    return 0; /* cache is full */

  section = swicms->cached_sections[section_index];

  /* first frame of a section: start its sums from zero */
  if (swicms->cached_num_frames % swicms->cache_resolution == 0)
  {
    for (dim = 0; dim < MAX_CHAN_DIM; dim++)
      section[dim] = 0;
  }

  for (dim = 0; dim < MAX_CHAN_DIM; dim++)
    section[dim] += frame[dim];

  swicms->cached_num_frames++;
  return 0;
}
475
/**
 * Normalizes one frame: oframe[k] = MAKEBYTE(iframe[k] + adjust[k]).
 *
 * Before that, if in-utterance cms is enabled (forget_factor2 differs from
 * SWICMS_INUTT_FORGET_FACTOR2_DISABLE), the frame may be added to the
 * in-utterance accumulator and the adjustment refreshed from it:
 *   - frames are collected only while fewer than disable_after frames have
 *     been accumulated and at least num_bou_frames_to_skip frames have been
 *     seen since the beginning of the utterance;
 *   - the adjustment is refreshed only once more than enable_after frames
 *     have been accumulated.
 *
 * @param swicms normalization state
 * @param oframe output frame (dimen values)
 * @param iframe input frame (dimen values)
 * @param dimen  frame dimension; asserted to equal MAX_CHAN_DIM
 * @return always 0
 */
int apply_channel_normalization_in_swicms(swicms_norm_info *swicms,
                                          imeldata* oframe,
                                          imeldata* iframe, int dimen)
{
  int k;
  ASSERT(dimen == MAX_CHAN_DIM);

  if (swicms->inutt.forget_factor2 != SWICMS_INUTT_FORGET_FACTOR2_DISABLE)
  {
    /* still collecting: cross-utterance estimate not yet considered better */
    const int still_collecting =
      swicms->inutt.num_frames_in_accum < swicms->inutt.disable_after;
    /* the leading (silence) frames of the utterance have been skipped */
    const int past_leading_frames =
      swicms->inutt.num_frames_since_bou >= swicms->inutt.num_bou_frames_to_skip;

    if (still_collecting && past_leading_frames)
    {
      swicms->inutt.num_frames_in_accum++;
      for (k = 0; k < dimen; k++)
        swicms->inutt.accum[k] += iframe[k];

      /* only trust the accumulator after enough frames were gathered */
      if (swicms->inutt.num_frames_in_accum > swicms->inutt.enable_after)
      {
        const imeldata denom = ( swicms->inutt.forget_factor2
                                 + swicms->inutt.num_frames_in_accum );
        for (k = 0; k < dimen; k++)
        {
          /* rounded weighted average of the old lda_cmn and the new accum */
          const imeldata blended =
            (swicms->lda_cmn[k] * swicms->inutt.forget_factor2
             + swicms->inutt.accum[k] + denom / 2) / denom;
          swicms->adjust[k] = swicms->lda_tmn[k] - blended;
        }
      }
    }
    swicms->inutt.num_frames_since_bou++;
  }

  for (k = 0; k < dimen; k++)
    oframe[k] = MAKEBYTE(iframe[k] + swicms->adjust[k]);

  return 0;
}
513
/**
 * End-of-utterance update of the cross-utterance channel mean.
 *
 * The cached section sums are split into a speech part (sections between
 * speech_start and speech_end) and a background part (all other complete
 * sections).  Their per-dimension averages are mixed according to sbindex
 * (100 = speech only) and blended into lda_cmn with weight forget_factor;
 * adjust is then recomputed as lda_tmn - lda_cmn.
 *
 * If only one of the two parts contains frames, its average stands in for
 * the other part as well.  This substitution is applied to every dimension;
 * the section counts are no longer modified inside the per-dimension loop,
 * which used to restrict the substitution to dimension 0.
 *
 * The frame cache and the in-utterance frame counter are reset on every
 * call, including the early returns.
 *
 * @param swicms       normalization state
 * @param speech_start first speech frame of the utterance
 * @param speech_end   end-of-speech frame, or MAXframeID when unknown
 * @return 0 when the mean was updated or there was nothing usable to update
 *         from; 1 when the bounds were unusable or the utterance mean was
 *         rejected
 */
int swicms_update(swicms_norm_info* swicms, int speech_start, int speech_end)
{
  int i, j;
  asr_int32_t avg[MAX_CHAN_DIM];
  int ff;
  int nn, speech_nn, backgr_nn;
  int have_speech, have_backgr;
  int num_frames = swicms->cached_num_frames;
  int cache_start, cache_end, backgr_cache_end;
  int sbindex = swicms->sbindex;

  /* init for utterance */
  swicms->inutt.num_frames_since_bou = 0;
  swicms->cached_num_frames = 0;

  /* convert frame bounds into cache-section indices (rounding down) */
  cache_start = speech_start / swicms->cache_resolution;

  if (speech_end == MAXframeID)
  {
    cache_end = SWICMS_CACHE_SIZE_DEFAULT;
  }
  else
  {
    cache_end = (speech_end < num_frames) ? speech_end : num_frames;
    cache_end /= swicms->cache_resolution;
  }

  if (num_frames == 0 || speech_end == 0 || speech_start == speech_end || speech_end == MAXframeID)
  {
    if (speech_end != 0 || speech_start != 0)
      PLogError("Warning: speech_bounds (%d,%d) swicms->cached_num_frames (%d)\n",
                speech_start, speech_end, num_frames);
    return 1;
  }

  backgr_cache_end = num_frames / swicms->cache_resolution;

  speech_nn = (cache_end - cache_start) * swicms->cache_resolution;
  backgr_nn = backgr_cache_end * swicms->cache_resolution - speech_nn;

  /* Validate the bounds before touching the cache: nothing to average, or
     inconsistent bounds that would index outside the cached sections. */
  if (cache_start < 0 || speech_nn < 0 || backgr_nn < 0
      || (speech_nn == 0 && backgr_nn == 0))
    return 0;

  have_speech = (speech_nn > 0);
  have_backgr = (backgr_nn > 0);

  for (i = 0; i < MAX_CHAN_DIM; i++)
  {
    asr_int32_t speech_avg = 0;
    asr_int32_t backgr_avg = 0;

    for (j = cache_start; j < cache_end; j++)
      speech_avg += swicms->cached_sections[j][i];
    for (j = 0; j < cache_start; j++)
      backgr_avg += swicms->cached_sections[j][i];
    for (j = cache_end; j < backgr_cache_end; j++)
      backgr_avg += swicms->cached_sections[j][i];

    if (!have_speech)
    {
      /* no speech sections: the background average stands in for speech */
      backgr_avg /= backgr_nn;
      speech_avg = backgr_avg;
    }
    else if (!have_backgr)
    {
      /* no background sections: the speech average stands in for background */
      speech_avg /= speech_nn;
      backgr_avg = speech_avg;
    }
    else
    {
      speech_avg /= speech_nn;
      backgr_avg /= backgr_nn;
    }

    avg[i] = (sbindex * speech_avg + (100 - sbindex) * backgr_avg + 50) / 100;
  }

  /* the substituted part also contributes its frame count to the weight */
  if (!have_speech)
    speech_nn = backgr_nn;
  else if (!have_backgr)
    backgr_nn = speech_nn;
  nn = (sbindex * speech_nn + (100 - sbindex) * backgr_nn + 50) / 100;

  for (i = 0, ff = 0; i < MAX_CHAN_DIM; i++)
  {
    ff += (swicms->lda_tmn[i] - avg[i]);
  }
  ff /= MAX_CHAN_DIM; /* sum is now the average offset from TMN */
  if (ff > 5)
  {
    PLogError("Warning: bad utt mean during swicms_update() (moffs=%d)\n", ff);
    return 1;
  }

  ff = swicms->forget_factor;
  if (ff < 9999)
  {
    for (i = 0; i < MAX_CHAN_DIM; i++)
    {
      /* rounded weighted average of the previous mean and this utterance */
      swicms->lda_cmn[i] = (swicms->lda_cmn[i] * ff + avg[i] * nn + (ff + nn) / 2) / (ff + nn);
      swicms->adjust[i] = swicms->lda_tmn[i] - swicms->lda_cmn[i];
    }
  }

  if (SWICMS_DEBUG)
  {
    imeldata temp[MAX_CHAN_DIM];
    PLogMessage("swicms_update() used %d frames (%d-%d)", nn, speech_start, speech_end);

    for (i = 0; i < MAX_CHAN_DIM; i++)
      temp[i] = swicms->lda_cmn[i];
    inverse_transform_frame( swicms->_prep, temp, 1 /*do_shift*/);
    /* use this dump, to put back into CREC.Frontend.swicms.cmn */
    printf_vector("swicms.cmn(r) ", " %d", temp, MAX_CHAN_DIM);
  }

  swicms->num_frames_in_cmn += nn;
  return 0;
}
643
/**
 * Derives the LDA-domain means from the raw ones and attaches the front end.
 *
 * tmn and cmn are copied into lda_tmn and lda_cmn, both copies are passed
 * through linear_transform_frame(), and adjust is set to lda_tmn - lda_cmn.
 * The state is then marked valid and prep is remembered in swicms->_prep.
 * With SWICMS_DEBUG set, the raw, transformed and inverse-transformed
 * channel means are logged.
 *
 * @param swicms normalization state
 * @param prep   front-end object handed to the transform functions
 * @return always 0
 */
int swicms_lda_process(swicms_norm_info* swicms, preprocessed* prep)
{
  int dim;

  for (dim = 0; dim < MAX_CHAN_DIM; dim++)
  {
    swicms->lda_tmn[dim] = swicms->tmn[dim];
    swicms->lda_cmn[dim] = swicms->cmn[dim];
  }

  /* both transforms operate in place on the copies made above */
  linear_transform_frame(prep, swicms->lda_tmn, 1 /*do_shift*/);
  linear_transform_frame(prep, swicms->lda_cmn, 1 /*do_shift*/);

  for (dim = 0; dim < MAX_CHAN_DIM; dim++)
    swicms->adjust[dim] = swicms->lda_tmn[dim] - swicms->lda_cmn[dim];

  swicms->is_valid = 1;
  swicms->_prep = prep;

  if (SWICMS_DEBUG)
  {
    imeldata restored[MAX_CHAN_DIM];

    printf_vector("swicms->cmn ", " %d", swicms->cmn, MAX_CHAN_DIM);
    printf_vector("swicms->lda_cmn ", " %d", swicms->lda_cmn, MAX_CHAN_DIM);

    /* round trip: map lda_cmn back to compare it with the raw cmn above */
    for (dim = 0; dim < MAX_CHAN_DIM; dim++)
      restored[dim] = swicms->lda_cmn[dim];
    inverse_transform_frame( swicms->_prep, restored, 1 /*do_shift*/);
    printf_vector("swicms->cmn(r) ", " %d", restored, MAX_CHAN_DIM);
  }

  return 0;
}
682
683
684
685