1 /* Copyright (C) 2003 Epic Games (written by Jean-Marc Valin)
2 Copyright (C) 2004-2006 Epic Games
3
4 File: preprocess.c
5 Preprocessor with denoising based on the algorithm by Ephraim and Malah
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are
9 met:
10
11 1. Redistributions of source code must retain the above copyright notice,
12 this list of conditions and the following disclaimer.
13
14 2. Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17
18 3. The name of the author may not be used to endorse or promote products
19 derived from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
25 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
29 STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 POSSIBILITY OF SUCH DAMAGE.
32 */
33
34
35 /*
36 Recommended papers:
37
38 Y. Ephraim and D. Malah, "Speech enhancement using minimum mean-square error
39 short-time spectral amplitude estimator". IEEE Transactions on Acoustics,
40 Speech and Signal Processing, vol. ASSP-32, no. 6, pp. 1109-1121, 1984.
41
42 Y. Ephraim and D. Malah, "Speech enhancement using minimum mean-square error
43 log-spectral amplitude estimator". IEEE Transactions on Acoustics, Speech and
44 Signal Processing, vol. ASSP-33, no. 2, pp. 443-445, 1985.
45
46 I. Cohen and B. Berdugo, "Speech enhancement for non-stationary noise environments".
47 Signal Processing, vol. 81, no. 2, pp. 2403-2418, 2001.
48
49 Stefan Gustafsson, Rainer Martin, Peter Jax, and Peter Vary. "A psychoacoustic
50 approach to combined acoustic echo cancellation and noise reduction". IEEE
51 Transactions on Speech and Audio Processing, 2002.
52
53 J.-M. Valin, J. Rouat, and F. Michaud, "Microphone array post-filter for separation
54 of simultaneous non-stationary sources". In Proceedings IEEE International
55 Conference on Acoustics, Speech, and Signal Processing, 2004.
56 */
57
58 #ifdef HAVE_CONFIG_H
59 #include "config.h"
60 #endif
61
62 #include <math.h>
63 #include "speex/speex_preprocess.h"
64 #include "speex/speex_echo.h"
65 #include "arch.h"
66 #include "fftwrap.h"
67 #include "filterbank.h"
68 #include "math_approx.h"
69 #include "os_support.h"
70
71 #ifndef M_PI
72 #define M_PI 3.14159263
73 #endif
74
75 #define LOUDNESS_EXP 5.f
76 #define AMP_SCALE .001f
77 #define AMP_SCALE_1 1000.f
78
79 #define NB_BANDS 24
80
81 #define SPEECH_PROB_START_DEFAULT QCONST16(0.35f,15)
82 #define SPEECH_PROB_CONTINUE_DEFAULT QCONST16(0.20f,15)
83 #define NOISE_SUPPRESS_DEFAULT -15
84 #define ECHO_SUPPRESS_DEFAULT -40
85 #define ECHO_SUPPRESS_ACTIVE_DEFAULT -15
86
87 #ifndef NULL
88 #define NULL 0
89 #endif
90
91 #define SQR(x) ((x)*(x))
92 #define SQR16(x) (MULT16_16((x),(x)))
93 #define SQR16_Q15(x) (MULT16_16_Q15((x),(x)))
94
95 #ifdef FIXED_POINT
DIV32_16_Q8(spx_word32_t a,spx_word32_t b)96 static inline spx_word16_t DIV32_16_Q8(spx_word32_t a, spx_word32_t b)
97 {
98 if (SHR32(a,7) >= b)
99 {
100 return 32767;
101 } else {
102 if (b>=QCONST32(1,23))
103 {
104 a = SHR32(a,8);
105 b = SHR32(b,8);
106 }
107 if (b>=QCONST32(1,19))
108 {
109 a = SHR32(a,4);
110 b = SHR32(b,4);
111 }
112 if (b>=QCONST32(1,15))
113 {
114 a = SHR32(a,4);
115 b = SHR32(b,4);
116 }
117 a = SHL32(a,8);
118 return PDIV32_16(a,b);
119 }
120
121 }
DIV32_16_Q15(spx_word32_t a,spx_word32_t b)122 static inline spx_word16_t DIV32_16_Q15(spx_word32_t a, spx_word32_t b)
123 {
124 if (SHR32(a,15) >= b)
125 {
126 return 32767;
127 } else {
128 if (b>=QCONST32(1,23))
129 {
130 a = SHR32(a,8);
131 b = SHR32(b,8);
132 }
133 if (b>=QCONST32(1,19))
134 {
135 a = SHR32(a,4);
136 b = SHR32(b,4);
137 }
138 if (b>=QCONST32(1,15))
139 {
140 a = SHR32(a,4);
141 b = SHR32(b,4);
142 }
143 a = SHL32(a,15)-a;
144 return DIV32_16(a,b);
145 }
146 }
147 #define SNR_SCALING 256.f
148 #define SNR_SCALING_1 0.0039062f
149 #define SNR_SHIFT 8
150
151 #define FRAC_SCALING 32767.f
152 #define FRAC_SCALING_1 3.0518e-05
153 #define FRAC_SHIFT 1
154
155 #define EXPIN_SCALING 2048.f
156 #define EXPIN_SCALING_1 0.00048828f
157 #define EXPIN_SHIFT 11
158 #define EXPOUT_SCALING_1 1.5259e-05
159
160 #define NOISE_SHIFT 7
161
162 #else
163
164 #define DIV32_16_Q8(a,b) ((a)/(b))
165 #define DIV32_16_Q15(a,b) ((a)/(b))
166 #define SNR_SCALING 1.f
167 #define SNR_SCALING_1 1.f
168 #define SNR_SHIFT 0
169 #define FRAC_SCALING 1.f
170 #define FRAC_SCALING_1 1.f
171 #define FRAC_SHIFT 0
172 #define NOISE_SHIFT 0
173
174 #define EXPIN_SCALING 1.f
175 #define EXPIN_SCALING_1 1.f
176 #define EXPOUT_SCALING_1 1.f
177
178 #endif
179
180 /** Speex pre-processor state. */
181 struct SpeexPreprocessState_ {
182 /* Basic info */
183 int frame_size; /**< Number of samples processed each time */
184 int ps_size; /**< Number of points in the power spectrum */
185 int sampling_rate; /**< Sampling rate of the input/output */
186 int nbands;
187 FilterBank *bank;
188
189 /* Parameters */
190 int denoise_enabled;
191 int vad_enabled;
192 int dereverb_enabled;
193 spx_word16_t reverb_decay;
194 spx_word16_t reverb_level;
195 spx_word16_t speech_prob_start;
196 spx_word16_t speech_prob_continue;
197 int noise_suppress;
198 int echo_suppress;
199 int echo_suppress_active;
200 SpeexEchoState *echo_state;
201
202 spx_word16_t speech_prob; /**< Probability last frame was speech */
203
204 /* DSP-related arrays */
205 spx_word16_t *frame; /**< Processing frame (2*ps_size) */
206 spx_word16_t *ft; /**< Processing frame in freq domain (2*ps_size) */
207 spx_word32_t *ps; /**< Current power spectrum */
208 spx_word16_t *gain2; /**< Adjusted gains */
209 spx_word16_t *gain_floor; /**< Minimum gain allowed */
210 spx_word16_t *window; /**< Analysis/Synthesis window */
211 spx_word32_t *noise; /**< Noise estimate */
212 spx_word32_t *reverb_estimate; /**< Estimate of reverb energy */
213 spx_word32_t *old_ps; /**< Power spectrum for last frame */
214 spx_word16_t *gain; /**< Ephraim Malah gain */
215 spx_word16_t *prior; /**< A-priori SNR */
216 spx_word16_t *post; /**< A-posteriori SNR */
217
218 spx_word32_t *S; /**< Smoothed power spectrum */
219 spx_word32_t *Smin; /**< See Cohen paper */
220 spx_word32_t *Stmp; /**< See Cohen paper */
221 int *update_prob; /**< Probability of speech presence for noise update */
222
223 spx_word16_t *zeta; /**< Smoothed a priori SNR */
224 spx_word32_t *echo_noise;
225 spx_word32_t *residual_echo;
226
227 /* Misc */
228 spx_word16_t *inbuf; /**< Input buffer (overlapped analysis) */
229 spx_word16_t *outbuf; /**< Output buffer (for overlap and add) */
230
231 /* AGC stuff, only for floating point for now */
232 #ifndef FIXED_POINT
233 int agc_enabled;
234 float agc_level;
235 float loudness_accum;
236 float *loudness_weight; /**< Perceptual loudness curve */
237 float loudness; /**< Loudness estimate */
238 float agc_gain; /**< Current AGC gain */
239 float max_gain; /**< Maximum gain allowed */
240 float max_increase_step; /**< Maximum increase in gain from one frame to another */
241 float max_decrease_step; /**< Maximum decrease in gain from one frame to another */
242 float prev_loudness; /**< Loudness of previous frame */
243 float init_max; /**< Current gain limit during initialisation */
244 #endif
245 int nb_adapt; /**< Number of frames used for adaptation so far */
246 int was_speech;
247 int min_count; /**< Number of frames processed so far */
248 void *fft_lookup; /**< Lookup table for the FFT */
249 #ifdef FIXED_POINT
250 int frame_shift;
251 #endif
252 };
253
254
conj_window(spx_word16_t * w,int len)255 static void conj_window(spx_word16_t *w, int len)
256 {
257 int i;
258 for (i=0;i<len;i++)
259 {
260 spx_word16_t tmp;
261 #ifdef FIXED_POINT
262 spx_word16_t x = DIV32_16(MULT16_16(32767,i),len);
263 #else
264 spx_word16_t x = DIV32_16(MULT16_16(QCONST16(4.f,13),i),len);
265 #endif
266 int inv=0;
267 if (x<QCONST16(1.f,13))
268 {
269 } else if (x<QCONST16(2.f,13))
270 {
271 x=QCONST16(2.f,13)-x;
272 inv=1;
273 } else if (x<QCONST16(3.f,13))
274 {
275 x=x-QCONST16(2.f,13);
276 inv=1;
277 } else {
278 x=QCONST16(2.f,13)-x+QCONST16(2.f,13); /* 4 - x */
279 }
280 x = MULT16_16_Q14(QCONST16(1.271903f,14), x);
281 tmp = SQR16_Q15(QCONST16(.5f,15)-MULT16_16_P15(QCONST16(.5f,15),spx_cos_norm(SHL32(EXTEND32(x),2))));
282 if (inv)
283 tmp=SUB16(Q15_ONE,tmp);
284 w[i]=spx_sqrt(SHL32(EXTEND32(tmp),15));
285 }
286 }
287
288
289 #ifdef FIXED_POINT
290 /* This function approximates the gain function
291 y = gamma(1.25)^2 * M(-.25;1;-x) / sqrt(x)
292 which multiplied by xi/(1+xi) is the optimal gain
293 in the loudness domain ( sqrt[amplitude] )
294 Input in Q11 format, output in Q15
295 */
hypergeom_gain(spx_word32_t xx)296 static inline spx_word32_t hypergeom_gain(spx_word32_t xx)
297 {
298 int ind;
299 spx_word16_t frac;
300 /* Q13 table */
301 static const spx_word16_t table[21] = {
302 6730, 8357, 9868, 11267, 12563, 13770, 14898,
303 15959, 16961, 17911, 18816, 19682, 20512, 21311,
304 22082, 22827, 23549, 24250, 24931, 25594, 26241};
305 ind = SHR32(xx,10);
306 if (ind<0)
307 return Q15_ONE;
308 if (ind>19)
309 return ADD32(EXTEND32(Q15_ONE),EXTEND32(DIV32_16(QCONST32(.1296,23), SHR32(xx,EXPIN_SHIFT-SNR_SHIFT))));
310 frac = SHL32(xx-SHL32(ind,10),5);
311 return SHL32(DIV32_16(PSHR32(MULT16_16(Q15_ONE-frac,table[ind]) + MULT16_16(frac,table[ind+1]),7),(spx_sqrt(SHL32(xx,15)+6711))),7);
312 }
313
qcurve(spx_word16_t x)314 static inline spx_word16_t qcurve(spx_word16_t x)
315 {
316 x = MAX16(x, 1);
317 return DIV32_16(SHL32(EXTEND32(32767),9),ADD16(512,MULT16_16_Q15(QCONST16(.60f,15),DIV32_16(32767,x))));
318 }
319
320 /* Compute the gain floor based on different floors for the background noise and residual echo */
compute_gain_floor(int noise_suppress,int effective_echo_suppress,spx_word32_t * noise,spx_word32_t * echo,spx_word16_t * gain_floor,int len)321 static void compute_gain_floor(int noise_suppress, int effective_echo_suppress, spx_word32_t *noise, spx_word32_t *echo, spx_word16_t *gain_floor, int len)
322 {
323 int i;
324
325 if (noise_suppress > effective_echo_suppress)
326 {
327 spx_word16_t noise_gain, gain_ratio;
328 noise_gain = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(0.11513,11),noise_suppress)),1)));
329 gain_ratio = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(.2302585f,11),effective_echo_suppress-noise_suppress)),1)));
330
331 /* gain_floor = sqrt [ (noise*noise_floor + echo*echo_floor) / (noise+echo) ] */
332 for (i=0;i<len;i++)
333 gain_floor[i] = MULT16_16_Q15(noise_gain,
334 spx_sqrt(SHL32(EXTEND32(DIV32_16_Q15(PSHR32(noise[i],NOISE_SHIFT) + MULT16_32_Q15(gain_ratio,echo[i]),
335 (1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]) )),15)));
336 } else {
337 spx_word16_t echo_gain, gain_ratio;
338 echo_gain = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(0.11513,11),effective_echo_suppress)),1)));
339 gain_ratio = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(.2302585f,11),noise_suppress-effective_echo_suppress)),1)));
340
341 /* gain_floor = sqrt [ (noise*noise_floor + echo*echo_floor) / (noise+echo) ] */
342 for (i=0;i<len;i++)
343 gain_floor[i] = MULT16_16_Q15(echo_gain,
344 spx_sqrt(SHL32(EXTEND32(DIV32_16_Q15(MULT16_32_Q15(gain_ratio,PSHR32(noise[i],NOISE_SHIFT)) + echo[i],
345 (1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]) )),15)));
346 }
347 }
348
349 #else
350 /* This function approximates the gain function
351 y = gamma(1.25)^2 * M(-.25;1;-x) / sqrt(x)
352 which multiplied by xi/(1+xi) is the optimal gain
353 in the loudness domain ( sqrt[amplitude] )
354 */
hypergeom_gain(spx_word32_t xx)355 static inline spx_word32_t hypergeom_gain(spx_word32_t xx)
356 {
357 int ind;
358 float integer, frac;
359 float x;
360 static const float table[21] = {
361 0.82157f, 1.02017f, 1.20461f, 1.37534f, 1.53363f, 1.68092f, 1.81865f,
362 1.94811f, 2.07038f, 2.18638f, 2.29688f, 2.40255f, 2.50391f, 2.60144f,
363 2.69551f, 2.78647f, 2.87458f, 2.96015f, 3.04333f, 3.12431f, 3.20326f};
364 x = EXPIN_SCALING_1*xx;
365 integer = floor(2*x);
366 ind = (int)integer;
367 if (ind<0)
368 return FRAC_SCALING;
369 if (ind>19)
370 return FRAC_SCALING*(1+.1296/x);
371 frac = 2*x-integer;
372 return FRAC_SCALING*((1-frac)*table[ind] + frac*table[ind+1])/sqrt(x+.0001f);
373 }
374
qcurve(spx_word16_t x)375 static inline spx_word16_t qcurve(spx_word16_t x)
376 {
377 return 1.f/(1.f+.15f/(SNR_SCALING_1*x));
378 }
379
compute_gain_floor(int noise_suppress,int effective_echo_suppress,spx_word32_t * noise,spx_word32_t * echo,spx_word16_t * gain_floor,int len)380 static void compute_gain_floor(int noise_suppress, int effective_echo_suppress, spx_word32_t *noise, spx_word32_t *echo, spx_word16_t *gain_floor, int len)
381 {
382 int i;
383 float echo_floor;
384 float noise_floor;
385
386 noise_floor = exp(.2302585f*noise_suppress);
387 echo_floor = exp(.2302585f*effective_echo_suppress);
388
389 /* Compute the gain floor based on different floors for the background noise and residual echo */
390 for (i=0;i<len;i++)
391 gain_floor[i] = FRAC_SCALING*sqrt(noise_floor*PSHR32(noise[i],NOISE_SHIFT) + echo_floor*echo[i])/sqrt(1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]);
392 }
393
394 #endif
speex_preprocess_state_init(int frame_size,int sampling_rate)395 EXPORT SpeexPreprocessState *speex_preprocess_state_init(int frame_size, int sampling_rate)
396 {
397 int i;
398 int N, N3, N4, M;
399
400 SpeexPreprocessState *st = (SpeexPreprocessState *)speex_alloc(sizeof(SpeexPreprocessState));
401 st->frame_size = frame_size;
402
403 /* Round ps_size down to the nearest power of two */
404 #if 0
405 i=1;
406 st->ps_size = st->frame_size;
407 while(1)
408 {
409 if (st->ps_size & ~i)
410 {
411 st->ps_size &= ~i;
412 i<<=1;
413 } else {
414 break;
415 }
416 }
417
418
419 if (st->ps_size < 3*st->frame_size/4)
420 st->ps_size = st->ps_size * 3 / 2;
421 #else
422 st->ps_size = st->frame_size;
423 #endif
424
425 N = st->ps_size;
426 N3 = 2*N - st->frame_size;
427 N4 = st->frame_size - N3;
428
429 st->sampling_rate = sampling_rate;
430 st->denoise_enabled = 1;
431 st->vad_enabled = 0;
432 st->dereverb_enabled = 0;
433 st->reverb_decay = 0;
434 st->reverb_level = 0;
435 st->noise_suppress = NOISE_SUPPRESS_DEFAULT;
436 st->echo_suppress = ECHO_SUPPRESS_DEFAULT;
437 st->echo_suppress_active = ECHO_SUPPRESS_ACTIVE_DEFAULT;
438
439 st->speech_prob_start = SPEECH_PROB_START_DEFAULT;
440 st->speech_prob_continue = SPEECH_PROB_CONTINUE_DEFAULT;
441
442 st->echo_state = NULL;
443
444 st->nbands = NB_BANDS;
445 M = st->nbands;
446 st->bank = filterbank_new(M, sampling_rate, N, 1);
447
448 st->frame = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t));
449 st->window = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t));
450 st->ft = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t));
451
452 st->ps = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
453 st->noise = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
454 st->echo_noise = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
455 st->residual_echo = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
456 st->reverb_estimate = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
457 st->old_ps = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
458 st->prior = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
459 st->post = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
460 st->gain = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
461 st->gain2 = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
462 st->gain_floor = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
463 st->zeta = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
464
465 st->S = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
466 st->Smin = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
467 st->Stmp = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
468 st->update_prob = (int*)speex_alloc(N*sizeof(int));
469
470 st->inbuf = (spx_word16_t*)speex_alloc(N3*sizeof(spx_word16_t));
471 st->outbuf = (spx_word16_t*)speex_alloc(N3*sizeof(spx_word16_t));
472
473 conj_window(st->window, 2*N3);
474 for (i=2*N3;i<2*st->ps_size;i++)
475 st->window[i]=Q15_ONE;
476
477 if (N4>0)
478 {
479 for (i=N3-1;i>=0;i--)
480 {
481 st->window[i+N3+N4]=st->window[i+N3];
482 st->window[i+N3]=1;
483 }
484 }
485 for (i=0;i<N+M;i++)
486 {
487 st->noise[i]=QCONST32(1.f,NOISE_SHIFT);
488 st->reverb_estimate[i]=0;
489 st->old_ps[i]=1;
490 st->gain[i]=Q15_ONE;
491 st->post[i]=SHL16(1, SNR_SHIFT);
492 st->prior[i]=SHL16(1, SNR_SHIFT);
493 }
494
495 for (i=0;i<N;i++)
496 st->update_prob[i] = 1;
497 for (i=0;i<N3;i++)
498 {
499 st->inbuf[i]=0;
500 st->outbuf[i]=0;
501 }
502 #ifndef FIXED_POINT
503 st->agc_enabled = 0;
504 st->agc_level = 8000;
505 st->loudness_weight = (float*)speex_alloc(N*sizeof(float));
506 for (i=0;i<N;i++)
507 {
508 float ff=((float)i)*.5*sampling_rate/((float)N);
509 /*st->loudness_weight[i] = .5f*(1.f/(1.f+ff/8000.f))+1.f*exp(-.5f*(ff-3800.f)*(ff-3800.f)/9e5f);*/
510 st->loudness_weight[i] = .35f-.35f*ff/16000.f+.73f*exp(-.5f*(ff-3800)*(ff-3800)/9e5f);
511 if (st->loudness_weight[i]<.01f)
512 st->loudness_weight[i]=.01f;
513 st->loudness_weight[i] *= st->loudness_weight[i];
514 }
515 /*st->loudness = pow(AMP_SCALE*st->agc_level,LOUDNESS_EXP);*/
516 st->loudness = 1e-15;
517 st->agc_gain = 1;
518 st->max_gain = 30;
519 st->max_increase_step = exp(0.11513f * 12.*st->frame_size / st->sampling_rate);
520 st->max_decrease_step = exp(-0.11513f * 40.*st->frame_size / st->sampling_rate);
521 st->prev_loudness = 1;
522 st->init_max = 1;
523 #endif
524 st->was_speech = 0;
525
526 st->fft_lookup = spx_fft_init(2*N);
527
528 st->nb_adapt=0;
529 st->min_count=0;
530 return st;
531 }
532
speex_preprocess_state_destroy(SpeexPreprocessState * st)533 EXPORT void speex_preprocess_state_destroy(SpeexPreprocessState *st)
534 {
535 speex_free(st->frame);
536 speex_free(st->ft);
537 speex_free(st->ps);
538 speex_free(st->gain2);
539 speex_free(st->gain_floor);
540 speex_free(st->window);
541 speex_free(st->noise);
542 speex_free(st->reverb_estimate);
543 speex_free(st->old_ps);
544 speex_free(st->gain);
545 speex_free(st->prior);
546 speex_free(st->post);
547 #ifndef FIXED_POINT
548 speex_free(st->loudness_weight);
549 #endif
550 speex_free(st->echo_noise);
551 speex_free(st->residual_echo);
552
553 speex_free(st->S);
554 speex_free(st->Smin);
555 speex_free(st->Stmp);
556 speex_free(st->update_prob);
557 speex_free(st->zeta);
558
559 speex_free(st->inbuf);
560 speex_free(st->outbuf);
561
562 spx_fft_destroy(st->fft_lookup);
563 filterbank_destroy(st->bank);
564 speex_free(st);
565 }
566
567 /* FIXME: The AGC doesn't work yet with fixed-point*/
568 #ifndef FIXED_POINT
speex_compute_agc(SpeexPreprocessState * st,spx_word16_t Pframe,spx_word16_t * ft)569 static void speex_compute_agc(SpeexPreprocessState *st, spx_word16_t Pframe, spx_word16_t *ft)
570 {
571 int i;
572 int N = st->ps_size;
573 float target_gain;
574 float loudness=1.f;
575 float rate;
576
577 for (i=2;i<N;i++)
578 {
579 loudness += 2.f*N*st->ps[i]* st->loudness_weight[i];
580 }
581 loudness=sqrt(loudness);
582 /*if (loudness < 2*pow(st->loudness, 1.0/LOUDNESS_EXP) &&
583 loudness*2 > pow(st->loudness, 1.0/LOUDNESS_EXP))*/
584 if (Pframe>.3f)
585 {
586 /*rate=2.0f*Pframe*Pframe/(1+st->nb_loudness_adapt);*/
587 rate = .03*Pframe*Pframe;
588 st->loudness = (1-rate)*st->loudness + (rate)*pow(AMP_SCALE*loudness, LOUDNESS_EXP);
589 st->loudness_accum = (1-rate)*st->loudness_accum + rate;
590 if (st->init_max < st->max_gain && st->nb_adapt > 20)
591 st->init_max *= 1.f + .1f*Pframe*Pframe;
592 }
593 /*printf ("%f %f %f %f\n", Pframe, loudness, pow(st->loudness, 1.0f/LOUDNESS_EXP), st->loudness2);*/
594
595 target_gain = AMP_SCALE*st->agc_level*pow(st->loudness/(1e-4+st->loudness_accum), -1.0f/LOUDNESS_EXP);
596
597 if ((Pframe>.5 && st->nb_adapt > 20) || target_gain < st->agc_gain)
598 {
599 if (target_gain > st->max_increase_step*st->agc_gain)
600 target_gain = st->max_increase_step*st->agc_gain;
601 if (target_gain < st->max_decrease_step*st->agc_gain && loudness < 10*st->prev_loudness)
602 target_gain = st->max_decrease_step*st->agc_gain;
603 if (target_gain > st->max_gain)
604 target_gain = st->max_gain;
605 if (target_gain > st->init_max)
606 target_gain = st->init_max;
607
608 st->agc_gain = target_gain;
609 }
610 /*fprintf (stderr, "%f %f %f\n", loudness, (float)AMP_SCALE_1*pow(st->loudness, 1.0f/LOUDNESS_EXP), st->agc_gain);*/
611
612 for (i=0;i<2*N;i++)
613 ft[i] *= st->agc_gain;
614 st->prev_loudness = loudness;
615 }
616 #endif
617
preprocess_analysis(SpeexPreprocessState * st,spx_int16_t * x)618 static void preprocess_analysis(SpeexPreprocessState *st, spx_int16_t *x)
619 {
620 int i;
621 int N = st->ps_size;
622 int N3 = 2*N - st->frame_size;
623 int N4 = st->frame_size - N3;
624 spx_word32_t *ps=st->ps;
625
626 /* 'Build' input frame */
627 for (i=0;i<N3;i++)
628 st->frame[i]=st->inbuf[i];
629 for (i=0;i<st->frame_size;i++)
630 st->frame[N3+i]=x[i];
631
632 /* Update inbuf */
633 for (i=0;i<N3;i++)
634 st->inbuf[i]=x[N4+i];
635
636 /* Windowing */
637 for (i=0;i<2*N;i++)
638 st->frame[i] = MULT16_16_Q15(st->frame[i], st->window[i]);
639
640 #ifdef FIXED_POINT
641 {
642 spx_word16_t max_val=0;
643 for (i=0;i<2*N;i++)
644 max_val = MAX16(max_val, ABS16(st->frame[i]));
645 st->frame_shift = 14-spx_ilog2(EXTEND32(max_val));
646 for (i=0;i<2*N;i++)
647 st->frame[i] = SHL16(st->frame[i], st->frame_shift);
648 }
649 #endif
650
651 /* Perform FFT */
652 spx_fft(st->fft_lookup, st->frame, st->ft);
653
654 /* Power spectrum */
655 ps[0]=MULT16_16(st->ft[0],st->ft[0]);
656 for (i=1;i<N;i++)
657 ps[i]=MULT16_16(st->ft[2*i-1],st->ft[2*i-1]) + MULT16_16(st->ft[2*i],st->ft[2*i]);
658 for (i=0;i<N;i++)
659 st->ps[i] = PSHR32(st->ps[i], 2*st->frame_shift);
660
661 filterbank_compute_bank32(st->bank, ps, ps+N);
662 }
663
update_noise_prob(SpeexPreprocessState * st)664 static void update_noise_prob(SpeexPreprocessState *st)
665 {
666 int i;
667 int min_range;
668 int N = st->ps_size;
669
670 for (i=1;i<N-1;i++)
671 st->S[i] = MULT16_32_Q15(QCONST16(.8f,15),st->S[i]) + MULT16_32_Q15(QCONST16(.05f,15),st->ps[i-1])
672 + MULT16_32_Q15(QCONST16(.1f,15),st->ps[i]) + MULT16_32_Q15(QCONST16(.05f,15),st->ps[i+1]);
673 st->S[0] = MULT16_32_Q15(QCONST16(.8f,15),st->S[0]) + MULT16_32_Q15(QCONST16(.2f,15),st->ps[0]);
674 st->S[N-1] = MULT16_32_Q15(QCONST16(.8f,15),st->S[N-1]) + MULT16_32_Q15(QCONST16(.2f,15),st->ps[N-1]);
675
676 if (st->nb_adapt==1)
677 {
678 for (i=0;i<N;i++)
679 st->Smin[i] = st->Stmp[i] = 0;
680 }
681
682 if (st->nb_adapt < 100)
683 min_range = 15;
684 else if (st->nb_adapt < 1000)
685 min_range = 50;
686 else if (st->nb_adapt < 10000)
687 min_range = 150;
688 else
689 min_range = 300;
690 if (st->min_count > min_range)
691 {
692 st->min_count = 0;
693 for (i=0;i<N;i++)
694 {
695 st->Smin[i] = MIN32(st->Stmp[i], st->S[i]);
696 st->Stmp[i] = st->S[i];
697 }
698 } else {
699 for (i=0;i<N;i++)
700 {
701 st->Smin[i] = MIN32(st->Smin[i], st->S[i]);
702 st->Stmp[i] = MIN32(st->Stmp[i], st->S[i]);
703 }
704 }
705 for (i=0;i<N;i++)
706 {
707 if (MULT16_32_Q15(QCONST16(.4f,15),st->S[i]) > st->Smin[i])
708 st->update_prob[i] = 1;
709 else
710 st->update_prob[i] = 0;
711 /*fprintf (stderr, "%f ", st->S[i]/st->Smin[i]);*/
712 /*fprintf (stderr, "%f ", st->update_prob[i]);*/
713 }
714
715 }
716
717 #define NOISE_OVERCOMPENS 1.
718
719 void speex_echo_get_residual(SpeexEchoState *st, spx_word32_t *Yout, int len);
720
speex_preprocess(SpeexPreprocessState * st,spx_int16_t * x,spx_int32_t * echo)721 EXPORT int speex_preprocess(SpeexPreprocessState *st, spx_int16_t *x, spx_int32_t *echo)
722 {
723 return speex_preprocess_run(st, x);
724 }
725
speex_preprocess_run(SpeexPreprocessState * st,spx_int16_t * x)726 EXPORT int speex_preprocess_run(SpeexPreprocessState *st, spx_int16_t *x)
727 {
728 int i;
729 int M;
730 int N = st->ps_size;
731 int N3 = 2*N - st->frame_size;
732 int N4 = st->frame_size - N3;
733 spx_word32_t *ps=st->ps;
734 spx_word32_t Zframe;
735 spx_word16_t Pframe;
736 spx_word16_t beta, beta_1;
737 spx_word16_t effective_echo_suppress;
738
739 st->nb_adapt++;
740 if (st->nb_adapt>20000)
741 st->nb_adapt = 20000;
742 st->min_count++;
743
744 beta = MAX16(QCONST16(.03,15),DIV32_16(Q15_ONE,st->nb_adapt));
745 beta_1 = Q15_ONE-beta;
746 M = st->nbands;
747 /* Deal with residual echo if provided */
748 if (st->echo_state)
749 {
750 speex_echo_get_residual(st->echo_state, st->residual_echo, N);
751 #ifndef FIXED_POINT
752 /* If there are NaNs or ridiculous values, it'll show up in the DC and we just reset everything to zero */
753 if (!(st->residual_echo[0] >=0 && st->residual_echo[0]<N*1e9f))
754 {
755 for (i=0;i<N;i++)
756 st->residual_echo[i] = 0;
757 }
758 #endif
759 for (i=0;i<N;i++)
760 st->echo_noise[i] = MAX32(MULT16_32_Q15(QCONST16(.6f,15),st->echo_noise[i]), st->residual_echo[i]);
761 filterbank_compute_bank32(st->bank, st->echo_noise, st->echo_noise+N);
762 } else {
763 for (i=0;i<N+M;i++)
764 st->echo_noise[i] = 0;
765 }
766 preprocess_analysis(st, x);
767
768 update_noise_prob(st);
769
770 /* Noise estimation always updated for the 10 first frames */
771 /*if (st->nb_adapt<10)
772 {
773 for (i=1;i<N-1;i++)
774 st->update_prob[i] = 0;
775 }
776 */
777
778 /* Update the noise estimate for the frequencies where it can be */
779 for (i=0;i<N;i++)
780 {
781 if (!st->update_prob[i] || st->ps[i] < PSHR32(st->noise[i], NOISE_SHIFT))
782 st->noise[i] = MAX32(EXTEND32(0),MULT16_32_Q15(beta_1,st->noise[i]) + MULT16_32_Q15(beta,SHL32(st->ps[i],NOISE_SHIFT)));
783 }
784 filterbank_compute_bank32(st->bank, st->noise, st->noise+N);
785
786 /* Special case for first frame */
787 if (st->nb_adapt==1)
788 for (i=0;i<N+M;i++)
789 st->old_ps[i] = ps[i];
790
791 /* Compute a posteriori SNR */
792 for (i=0;i<N+M;i++)
793 {
794 spx_word16_t gamma;
795
796 /* Total noise estimate including residual echo and reverberation */
797 spx_word32_t tot_noise = ADD32(ADD32(ADD32(EXTEND32(1), PSHR32(st->noise[i],NOISE_SHIFT)) , st->echo_noise[i]) , st->reverb_estimate[i]);
798
799 /* A posteriori SNR = ps/noise - 1*/
800 st->post[i] = SUB16(DIV32_16_Q8(ps[i],tot_noise), QCONST16(1.f,SNR_SHIFT));
801 st->post[i]=MIN16(st->post[i], QCONST16(100.f,SNR_SHIFT));
802
803 /* Computing update gamma = .1 + .9*(old/(old+noise))^2 */
804 gamma = QCONST16(.1f,15)+MULT16_16_Q15(QCONST16(.89f,15),SQR16_Q15(DIV32_16_Q15(st->old_ps[i],ADD32(st->old_ps[i],tot_noise))));
805
806 /* A priori SNR update = gamma*max(0,post) + (1-gamma)*old/noise */
807 st->prior[i] = EXTRACT16(PSHR32(ADD32(MULT16_16(gamma,MAX16(0,st->post[i])), MULT16_16(Q15_ONE-gamma,DIV32_16_Q8(st->old_ps[i],tot_noise))), 15));
808 st->prior[i]=MIN16(st->prior[i], QCONST16(100.f,SNR_SHIFT));
809 }
810
811 /*print_vec(st->post, N+M, "");*/
812
813 /* Recursive average of the a priori SNR. A bit smoothed for the psd components */
814 st->zeta[0] = PSHR32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[0]), MULT16_16(QCONST16(.3f,15),st->prior[0])),15);
815 for (i=1;i<N-1;i++)
816 st->zeta[i] = PSHR32(ADD32(ADD32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[i]), MULT16_16(QCONST16(.15f,15),st->prior[i])),
817 MULT16_16(QCONST16(.075f,15),st->prior[i-1])), MULT16_16(QCONST16(.075f,15),st->prior[i+1])),15);
818 for (i=N-1;i<N+M;i++)
819 st->zeta[i] = PSHR32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[i]), MULT16_16(QCONST16(.3f,15),st->prior[i])),15);
820
821 /* Speech probability of presence for the entire frame is based on the average filterbank a priori SNR */
822 Zframe = 0;
823 for (i=N;i<N+M;i++)
824 Zframe = ADD32(Zframe, EXTEND32(st->zeta[i]));
825 Pframe = QCONST16(.1f,15)+MULT16_16_Q15(QCONST16(.899f,15),qcurve(DIV32_16(Zframe,st->nbands)));
826
827 effective_echo_suppress = EXTRACT16(PSHR32(ADD32(MULT16_16(SUB16(Q15_ONE,Pframe), st->echo_suppress), MULT16_16(Pframe, st->echo_suppress_active)),15));
828
829 compute_gain_floor(st->noise_suppress, effective_echo_suppress, st->noise+N, st->echo_noise+N, st->gain_floor+N, M);
830
831 /* Compute Ephraim & Malah gain speech probability of presence for each critical band (Bark scale)
832 Technically this is actually wrong because the EM gaim assumes a slightly different probability
833 distribution */
834 for (i=N;i<N+M;i++)
835 {
836 /* See EM and Cohen papers*/
837 spx_word32_t theta;
838 /* Gain from hypergeometric function */
839 spx_word32_t MM;
840 /* Weiner filter gain */
841 spx_word16_t prior_ratio;
842 /* a priority probability of speech presence based on Bark sub-band alone */
843 spx_word16_t P1;
844 /* Speech absence a priori probability (considering sub-band and frame) */
845 spx_word16_t q;
846 #ifdef FIXED_POINT
847 spx_word16_t tmp;
848 #endif
849
850 prior_ratio = PDIV32_16(SHL32(EXTEND32(st->prior[i]), 15), ADD16(st->prior[i], SHL32(1,SNR_SHIFT)));
851 theta = MULT16_32_P15(prior_ratio, QCONST32(1.f,EXPIN_SHIFT)+SHL32(EXTEND32(st->post[i]),EXPIN_SHIFT-SNR_SHIFT));
852
853 MM = hypergeom_gain(theta);
854 /* Gain with bound */
855 st->gain[i] = EXTRACT16(MIN32(Q15_ONE, MULT16_32_Q15(prior_ratio, MM)));
856 /* Save old Bark power spectrum */
857 st->old_ps[i] = MULT16_32_P15(QCONST16(.2f,15),st->old_ps[i]) + MULT16_32_P15(MULT16_16_P15(QCONST16(.8f,15),SQR16_Q15(st->gain[i])),ps[i]);
858
859 P1 = QCONST16(.199f,15)+MULT16_16_Q15(QCONST16(.8f,15),qcurve (st->zeta[i]));
860 q = Q15_ONE-MULT16_16_Q15(Pframe,P1);
861 #ifdef FIXED_POINT
862 theta = MIN32(theta, EXTEND32(32767));
863 /*Q8*/tmp = MULT16_16_Q15((SHL32(1,SNR_SHIFT)+st->prior[i]),EXTRACT16(MIN32(Q15ONE,SHR32(spx_exp(-EXTRACT16(theta)),1))));
864 tmp = MIN16(QCONST16(3.,SNR_SHIFT), tmp); /* Prevent overflows in the next line*/
865 /*Q8*/tmp = EXTRACT16(PSHR32(MULT16_16(PDIV32_16(SHL32(EXTEND32(q),8),(Q15_ONE-q)),tmp),8));
866 st->gain2[i]=DIV32_16(SHL32(EXTEND32(32767),SNR_SHIFT), ADD16(256,tmp));
867 #else
868 st->gain2[i]=1/(1.f + (q/(1.f-q))*(1+st->prior[i])*exp(-theta));
869 #endif
870 }
871 /* Convert the EM gains and speech prob to linear frequency */
872 filterbank_compute_psd16(st->bank,st->gain2+N, st->gain2);
873 filterbank_compute_psd16(st->bank,st->gain+N, st->gain);
874
875 /* Use 1 for linear gain resolution (best) or 0 for Bark gain resolution (faster) */
876 if (1)
877 {
878 filterbank_compute_psd16(st->bank,st->gain_floor+N, st->gain_floor);
879
880 /* Compute gain according to the Ephraim-Malah algorithm -- linear frequency */
881 for (i=0;i<N;i++)
882 {
883 spx_word32_t MM;
884 spx_word32_t theta;
885 spx_word16_t prior_ratio;
886 spx_word16_t tmp;
887 spx_word16_t p;
888 spx_word16_t g;
889
890 /* Wiener filter gain */
891 prior_ratio = PDIV32_16(SHL32(EXTEND32(st->prior[i]), 15), ADD16(st->prior[i], SHL32(1,SNR_SHIFT)));
892 theta = MULT16_32_P15(prior_ratio, QCONST32(1.f,EXPIN_SHIFT)+SHL32(EXTEND32(st->post[i]),EXPIN_SHIFT-SNR_SHIFT));
893
894 /* Optimal estimator for loudness domain */
895 MM = hypergeom_gain(theta);
896 /* EM gain with bound */
897 g = EXTRACT16(MIN32(Q15_ONE, MULT16_32_Q15(prior_ratio, MM)));
898 /* Interpolated speech probability of presence */
899 p = st->gain2[i];
900
901 /* Constrain the gain to be close to the Bark scale gain */
902 if (MULT16_16_Q15(QCONST16(.333f,15),g) > st->gain[i])
903 g = MULT16_16(3,st->gain[i]);
904 st->gain[i] = g;
905
906 /* Save old power spectrum */
907 st->old_ps[i] = MULT16_32_P15(QCONST16(.2f,15),st->old_ps[i]) + MULT16_32_P15(MULT16_16_P15(QCONST16(.8f,15),SQR16_Q15(st->gain[i])),ps[i]);
908
909 /* Apply gain floor */
910 if (st->gain[i] < st->gain_floor[i])
911 st->gain[i] = st->gain_floor[i];
912
913 /* Exponential decay model for reverberation (unused) */
914 /*st->reverb_estimate[i] = st->reverb_decay*st->reverb_estimate[i] + st->reverb_decay*st->reverb_level*st->gain[i]*st->gain[i]*st->ps[i];*/
915
916 /* Take into account speech probability of presence (loudness domain MMSE estimator) */
917 /* gain2 = [p*sqrt(gain)+(1-p)*sqrt(gain _floor) ]^2 */
918 tmp = MULT16_16_P15(p,spx_sqrt(SHL32(EXTEND32(st->gain[i]),15))) + MULT16_16_P15(SUB16(Q15_ONE,p),spx_sqrt(SHL32(EXTEND32(st->gain_floor[i]),15)));
919 st->gain2[i]=SQR16_Q15(tmp);
920
921 /* Use this if you want a log-domain MMSE estimator instead */
922 /*st->gain2[i] = pow(st->gain[i], p) * pow(st->gain_floor[i],1.f-p);*/
923 }
924 } else {
925 for (i=N;i<N+M;i++)
926 {
927 spx_word16_t tmp;
928 spx_word16_t p = st->gain2[i];
929 st->gain[i] = MAX16(st->gain[i], st->gain_floor[i]);
930 tmp = MULT16_16_P15(p,spx_sqrt(SHL32(EXTEND32(st->gain[i]),15))) + MULT16_16_P15(SUB16(Q15_ONE,p),spx_sqrt(SHL32(EXTEND32(st->gain_floor[i]),15)));
931 st->gain2[i]=SQR16_Q15(tmp);
932 }
933 filterbank_compute_psd16(st->bank,st->gain2+N, st->gain2);
934 }
935
936 /* If noise suppression is off, don't apply the gain (but then why call this in the first place!) */
937 if (!st->denoise_enabled)
938 {
939 for (i=0;i<N+M;i++)
940 st->gain2[i]=Q15_ONE;
941 }
942
943 /* Apply computed gain */
944 for (i=1;i<N;i++)
945 {
946 st->ft[2*i-1] = MULT16_16_P15(st->gain2[i],st->ft[2*i-1]);
947 st->ft[2*i] = MULT16_16_P15(st->gain2[i],st->ft[2*i]);
948 }
949 st->ft[0] = MULT16_16_P15(st->gain2[0],st->ft[0]);
950 st->ft[2*N-1] = MULT16_16_P15(st->gain2[N-1],st->ft[2*N-1]);
951
952 /*FIXME: This *will* not work for fixed-point */
953 #ifndef FIXED_POINT
954 if (st->agc_enabled)
955 speex_compute_agc(st, Pframe, st->ft);
956 #endif
957
958 /* Inverse FFT with 1/N scaling */
959 spx_ifft(st->fft_lookup, st->ft, st->frame);
960 /* Scale back to original (lower) amplitude */
961 for (i=0;i<2*N;i++)
962 st->frame[i] = PSHR16(st->frame[i], st->frame_shift);
963
964 /*FIXME: This *will* not work for fixed-point */
965 #ifndef FIXED_POINT
966 if (st->agc_enabled)
967 {
968 float max_sample=0;
969 for (i=0;i<2*N;i++)
970 if (fabs(st->frame[i])>max_sample)
971 max_sample = fabs(st->frame[i]);
972 if (max_sample>28000.f)
973 {
974 float damp = 28000.f/max_sample;
975 for (i=0;i<2*N;i++)
976 st->frame[i] *= damp;
977 }
978 }
979 #endif
980
981 /* Synthesis window (for WOLA) */
982 for (i=0;i<2*N;i++)
983 st->frame[i] = MULT16_16_Q15(st->frame[i], st->window[i]);
984
985 /* Perform overlap and add */
986 for (i=0;i<N3;i++)
987 x[i] = st->outbuf[i] + st->frame[i];
988 for (i=0;i<N4;i++)
989 x[N3+i] = st->frame[N3+i];
990
991 /* Update outbuf */
992 for (i=0;i<N3;i++)
993 st->outbuf[i] = st->frame[st->frame_size+i];
994
995 /* FIXME: This VAD is a kludge */
996 st->speech_prob = Pframe;
997 if (st->vad_enabled)
998 {
999 if (st->speech_prob > st->speech_prob_start || (st->was_speech && st->speech_prob > st->speech_prob_continue))
1000 {
1001 st->was_speech=1;
1002 return 1;
1003 } else
1004 {
1005 st->was_speech=0;
1006 return 0;
1007 }
1008 } else {
1009 return 1;
1010 }
1011 }
1012
speex_preprocess_estimate_update(SpeexPreprocessState * st,spx_int16_t * x)1013 EXPORT void speex_preprocess_estimate_update(SpeexPreprocessState *st, spx_int16_t *x)
1014 {
1015 int i;
1016 int N = st->ps_size;
1017 int N3 = 2*N - st->frame_size;
1018 int M;
1019 spx_word32_t *ps=st->ps;
1020
1021 M = st->nbands;
1022 st->min_count++;
1023
1024 preprocess_analysis(st, x);
1025
1026 update_noise_prob(st);
1027
1028 for (i=1;i<N-1;i++)
1029 {
1030 if (!st->update_prob[i] || st->ps[i] < PSHR32(st->noise[i],NOISE_SHIFT))
1031 {
1032 st->noise[i] = MULT16_32_Q15(QCONST16(.95f,15),st->noise[i]) + MULT16_32_Q15(QCONST16(.05f,15),SHL32(st->ps[i],NOISE_SHIFT));
1033 }
1034 }
1035
1036 for (i=0;i<N3;i++)
1037 st->outbuf[i] = MULT16_16_Q15(x[st->frame_size-N3+i],st->window[st->frame_size+i]);
1038
1039 /* Save old power spectrum */
1040 for (i=0;i<N+M;i++)
1041 st->old_ps[i] = ps[i];
1042
1043 for (i=0;i<N;i++)
1044 st->reverb_estimate[i] = MULT16_32_Q15(st->reverb_decay, st->reverb_estimate[i]);
1045 }
1046
1047
speex_preprocess_ctl(SpeexPreprocessState * state,int request,void * ptr)1048 EXPORT int speex_preprocess_ctl(SpeexPreprocessState *state, int request, void *ptr)
1049 {
1050 int i;
1051 SpeexPreprocessState *st;
1052 st=(SpeexPreprocessState*)state;
1053 switch(request)
1054 {
1055 case SPEEX_PREPROCESS_SET_DENOISE:
1056 st->denoise_enabled = (*(spx_int32_t*)ptr);
1057 break;
1058 case SPEEX_PREPROCESS_GET_DENOISE:
1059 (*(spx_int32_t*)ptr) = st->denoise_enabled;
1060 break;
1061 #ifndef FIXED_POINT
1062 case SPEEX_PREPROCESS_SET_AGC:
1063 st->agc_enabled = (*(spx_int32_t*)ptr);
1064 break;
1065 case SPEEX_PREPROCESS_GET_AGC:
1066 (*(spx_int32_t*)ptr) = st->agc_enabled;
1067 break;
1068 #ifndef DISABLE_FLOAT_API
1069 case SPEEX_PREPROCESS_SET_AGC_LEVEL:
1070 st->agc_level = (*(float*)ptr);
1071 if (st->agc_level<1)
1072 st->agc_level=1;
1073 if (st->agc_level>32768)
1074 st->agc_level=32768;
1075 break;
1076 case SPEEX_PREPROCESS_GET_AGC_LEVEL:
1077 (*(float*)ptr) = st->agc_level;
1078 break;
1079 #endif /* #ifndef DISABLE_FLOAT_API */
1080 case SPEEX_PREPROCESS_SET_AGC_INCREMENT:
1081 st->max_increase_step = exp(0.11513f * (*(spx_int32_t*)ptr)*st->frame_size / st->sampling_rate);
1082 break;
1083 case SPEEX_PREPROCESS_GET_AGC_INCREMENT:
1084 (*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->max_increase_step)*st->sampling_rate/st->frame_size);
1085 break;
1086 case SPEEX_PREPROCESS_SET_AGC_DECREMENT:
1087 st->max_decrease_step = exp(0.11513f * (*(spx_int32_t*)ptr)*st->frame_size / st->sampling_rate);
1088 break;
1089 case SPEEX_PREPROCESS_GET_AGC_DECREMENT:
1090 (*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->max_decrease_step)*st->sampling_rate/st->frame_size);
1091 break;
1092 case SPEEX_PREPROCESS_SET_AGC_MAX_GAIN:
1093 st->max_gain = exp(0.11513f * (*(spx_int32_t*)ptr));
1094 break;
1095 case SPEEX_PREPROCESS_GET_AGC_MAX_GAIN:
1096 (*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->max_gain));
1097 break;
1098 #endif
1099 case SPEEX_PREPROCESS_SET_VAD:
1100 // Disabled by mlebeau, we don't want this in the launched Google Mobile App for iPhone.
1101 //speex_warning("The VAD has been replaced by a hack pending a complete rewrite");
1102 st->vad_enabled = (*(spx_int32_t*)ptr);
1103 break;
1104 case SPEEX_PREPROCESS_GET_VAD:
1105 (*(spx_int32_t*)ptr) = st->vad_enabled;
1106 break;
1107
1108 case SPEEX_PREPROCESS_SET_DEREVERB:
1109 st->dereverb_enabled = (*(spx_int32_t*)ptr);
1110 for (i=0;i<st->ps_size;i++)
1111 st->reverb_estimate[i]=0;
1112 break;
1113 case SPEEX_PREPROCESS_GET_DEREVERB:
1114 (*(spx_int32_t*)ptr) = st->dereverb_enabled;
1115 break;
1116
1117 case SPEEX_PREPROCESS_SET_DEREVERB_LEVEL:
1118 /* FIXME: Re-enable when de-reverberation is actually enabled again */
1119 /*st->reverb_level = (*(float*)ptr);*/
1120 break;
1121 case SPEEX_PREPROCESS_GET_DEREVERB_LEVEL:
1122 /* FIXME: Re-enable when de-reverberation is actually enabled again */
1123 /*(*(float*)ptr) = st->reverb_level;*/
1124 break;
1125
1126 case SPEEX_PREPROCESS_SET_DEREVERB_DECAY:
1127 /* FIXME: Re-enable when de-reverberation is actually enabled again */
1128 /*st->reverb_decay = (*(float*)ptr);*/
1129 break;
1130 case SPEEX_PREPROCESS_GET_DEREVERB_DECAY:
1131 /* FIXME: Re-enable when de-reverberation is actually enabled again */
1132 /*(*(float*)ptr) = st->reverb_decay;*/
1133 break;
1134
1135 case SPEEX_PREPROCESS_SET_PROB_START:
1136 *(spx_int32_t*)ptr = MIN32(100,MAX32(0, *(spx_int32_t*)ptr));
1137 st->speech_prob_start = DIV32_16(MULT16_16(Q15ONE,*(spx_int32_t*)ptr), 100);
1138 break;
1139 case SPEEX_PREPROCESS_GET_PROB_START:
1140 (*(spx_int32_t*)ptr) = MULT16_16_Q15(st->speech_prob_start, 100);
1141 break;
1142
1143 case SPEEX_PREPROCESS_SET_PROB_CONTINUE:
1144 *(spx_int32_t*)ptr = MIN32(100,MAX32(0, *(spx_int32_t*)ptr));
1145 st->speech_prob_continue = DIV32_16(MULT16_16(Q15ONE,*(spx_int32_t*)ptr), 100);
1146 break;
1147 case SPEEX_PREPROCESS_GET_PROB_CONTINUE:
1148 (*(spx_int32_t*)ptr) = MULT16_16_Q15(st->speech_prob_continue, 100);
1149 break;
1150
1151 case SPEEX_PREPROCESS_SET_NOISE_SUPPRESS:
1152 st->noise_suppress = -ABS(*(spx_int32_t*)ptr);
1153 break;
1154 case SPEEX_PREPROCESS_GET_NOISE_SUPPRESS:
1155 (*(spx_int32_t*)ptr) = st->noise_suppress;
1156 break;
1157 case SPEEX_PREPROCESS_SET_ECHO_SUPPRESS:
1158 st->echo_suppress = -ABS(*(spx_int32_t*)ptr);
1159 break;
1160 case SPEEX_PREPROCESS_GET_ECHO_SUPPRESS:
1161 (*(spx_int32_t*)ptr) = st->echo_suppress;
1162 break;
1163 case SPEEX_PREPROCESS_SET_ECHO_SUPPRESS_ACTIVE:
1164 st->echo_suppress_active = -ABS(*(spx_int32_t*)ptr);
1165 break;
1166 case SPEEX_PREPROCESS_GET_ECHO_SUPPRESS_ACTIVE:
1167 (*(spx_int32_t*)ptr) = st->echo_suppress_active;
1168 break;
1169 case SPEEX_PREPROCESS_SET_ECHO_STATE:
1170 st->echo_state = (SpeexEchoState*)ptr;
1171 break;
1172 case SPEEX_PREPROCESS_GET_ECHO_STATE:
1173 (*(SpeexEchoState**)ptr) = (SpeexEchoState*)st->echo_state;
1174 break;
1175 #ifndef FIXED_POINT
1176 case SPEEX_PREPROCESS_GET_AGC_LOUDNESS:
1177 (*(spx_int32_t*)ptr) = pow(st->loudness, 1.0/LOUDNESS_EXP);
1178 break;
1179 case SPEEX_PREPROCESS_GET_AGC_GAIN:
1180 (*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->agc_gain));
1181 break;
1182 #endif
1183 case SPEEX_PREPROCESS_GET_PSD_SIZE:
1184 case SPEEX_PREPROCESS_GET_NOISE_PSD_SIZE:
1185 (*(spx_int32_t*)ptr) = st->ps_size;
1186 break;
1187 case SPEEX_PREPROCESS_GET_PSD:
1188 for(i=0;i<st->ps_size;i++)
1189 ((spx_int32_t *)ptr)[i] = (spx_int32_t) st->ps[i];
1190 break;
1191 case SPEEX_PREPROCESS_GET_NOISE_PSD:
1192 for(i=0;i<st->ps_size;i++)
1193 ((spx_int32_t *)ptr)[i] = (spx_int32_t) PSHR32(st->noise[i], NOISE_SHIFT);
1194 break;
1195 case SPEEX_PREPROCESS_GET_PROB:
1196 (*(spx_int32_t*)ptr) = MULT16_16_Q15(st->speech_prob, 100);
1197 break;
1198 #ifndef FIXED_POINT
1199 case SPEEX_PREPROCESS_SET_AGC_TARGET:
1200 st->agc_level = (*(spx_int32_t*)ptr);
1201 if (st->agc_level<1)
1202 st->agc_level=1;
1203 if (st->agc_level>32768)
1204 st->agc_level=32768;
1205 break;
1206 case SPEEX_PREPROCESS_GET_AGC_TARGET:
1207 (*(spx_int32_t*)ptr) = st->agc_level;
1208 break;
1209 #endif
1210 default:
1211 speex_warning_int("Unknown speex_preprocess_ctl request: ", request);
1212 return -1;
1213 }
1214 return 0;
1215 }
1216
1217 #ifdef FIXED_DEBUG
1218 long long spx_mips=0;
1219 #endif
1220
1221