1 /* Copyright (c) 2018 Gregor Richards
2 * Copyright (c) 2017 Mozilla */
3 /*
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
19 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include <stdlib.h>
33 #include <string.h>
34 #include <stdio.h>
35 #include "kiss_fft.h"
36 #include "common.h"
37 #include <math.h>
38 #include "rnnoise.h"
39 #include "pitch.h"
40 #include "arch.h"
41 #include "rnn.h"
42 #include "rnn_data.h"
43
44 #define FRAME_SIZE_SHIFT 2
45 #define FRAME_SIZE (120<<FRAME_SIZE_SHIFT)
46 #define WINDOW_SIZE (2*FRAME_SIZE)
47 #define FREQ_SIZE (FRAME_SIZE + 1)
48
49 #define PITCH_MIN_PERIOD 60
50 #define PITCH_MAX_PERIOD 768
51 #define PITCH_FRAME_SIZE 960
52 #define PITCH_BUF_SIZE (PITCH_MAX_PERIOD+PITCH_FRAME_SIZE)
53
54 #define SQUARE(x) ((x)*(x))
55
56 #define NB_BANDS 22
57
58 #define CEPS_MEM 8
59 #define NB_DELTA_CEPS 6
60
61 #define NB_FEATURES (NB_BANDS+3*NB_DELTA_CEPS+2)
62
63
64 #ifndef TRAINING
65 #define TRAINING 0
66 #endif
67
68
69 /* The built-in model, used if no file is given as input */
70 extern const struct RNNModel rnnoise_model_orig;
71
72
73 static const opus_int16 eband5ms[] = {
74 /*0 200 400 600 800 1k 1.2 1.4 1.6 2k 2.4 2.8 3.2 4k 4.8 5.6 6.8 8k 9.6 12k 15.6 20k*/
75 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 34, 40, 48, 60, 78, 100
76 };
77
78
79 typedef struct {
80 int init;
81 kiss_fft_state *kfft;
82 float half_window[FRAME_SIZE];
83 float dct_table[NB_BANDS*NB_BANDS];
84 } CommonState;
85
86 struct DenoiseState {
87 float analysis_mem[FRAME_SIZE];
88 float cepstral_mem[CEPS_MEM][NB_BANDS];
89 int memid;
90 float synthesis_mem[FRAME_SIZE];
91 float pitch_buf[PITCH_BUF_SIZE];
92 float pitch_enh_buf[PITCH_BUF_SIZE];
93 float last_gain;
94 int last_period;
95 float mem_hp_x[2];
96 float lastg[NB_BANDS];
97 RNNState rnn;
98 };
99
compute_band_energy(float * bandE,const kiss_fft_cpx * X)100 void compute_band_energy(float *bandE, const kiss_fft_cpx *X) {
101 int i;
102 float sum[NB_BANDS] = {0};
103 for (i=0;i<NB_BANDS-1;i++)
104 {
105 int j;
106 int band_size;
107 band_size = (eband5ms[i+1]-eband5ms[i])<<FRAME_SIZE_SHIFT;
108 for (j=0;j<band_size;j++) {
109 float tmp;
110 float frac = (float)j/band_size;
111 tmp = SQUARE(X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].r);
112 tmp += SQUARE(X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].i);
113 sum[i] += (1-frac)*tmp;
114 sum[i+1] += frac*tmp;
115 }
116 }
117 sum[0] *= 2;
118 sum[NB_BANDS-1] *= 2;
119 for (i=0;i<NB_BANDS;i++)
120 {
121 bandE[i] = sum[i];
122 }
123 }
124
compute_band_corr(float * bandE,const kiss_fft_cpx * X,const kiss_fft_cpx * P)125 void compute_band_corr(float *bandE, const kiss_fft_cpx *X, const kiss_fft_cpx *P) {
126 int i;
127 float sum[NB_BANDS] = {0};
128 for (i=0;i<NB_BANDS-1;i++)
129 {
130 int j;
131 int band_size;
132 band_size = (eband5ms[i+1]-eband5ms[i])<<FRAME_SIZE_SHIFT;
133 for (j=0;j<band_size;j++) {
134 float tmp;
135 float frac = (float)j/band_size;
136 tmp = X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].r * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].r;
137 tmp += X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].i * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].i;
138 sum[i] += (1-frac)*tmp;
139 sum[i+1] += frac*tmp;
140 }
141 }
142 sum[0] *= 2;
143 sum[NB_BANDS-1] *= 2;
144 for (i=0;i<NB_BANDS;i++)
145 {
146 bandE[i] = sum[i];
147 }
148 }
149
interp_band_gain(float * g,const float * bandE)150 void interp_band_gain(float *g, const float *bandE) {
151 int i;
152 memset(g, 0, FREQ_SIZE);
153 for (i=0;i<NB_BANDS-1;i++)
154 {
155 int j;
156 int band_size;
157 band_size = (eband5ms[i+1]-eband5ms[i])<<FRAME_SIZE_SHIFT;
158 for (j=0;j<band_size;j++) {
159 float frac = (float)j/band_size;
160 g[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j] = (1-frac)*bandE[i] + frac*bandE[i+1];
161 }
162 }
163 }
164
165
166 CommonState common;
167
check_init()168 static void check_init() {
169 int i;
170 if (common.init) return;
171 common.kfft = opus_fft_alloc_twiddles(2*FRAME_SIZE, NULL, NULL, NULL, 0);
172 for (i=0;i<FRAME_SIZE;i++)
173 common.half_window[i] = sin(.5*M_PI*sin(.5*M_PI*(i+.5)/FRAME_SIZE) * sin(.5*M_PI*(i+.5)/FRAME_SIZE));
174 for (i=0;i<NB_BANDS;i++) {
175 int j;
176 for (j=0;j<NB_BANDS;j++) {
177 common.dct_table[i*NB_BANDS + j] = cos((i+.5)*j*M_PI/NB_BANDS);
178 if (j==0) common.dct_table[i*NB_BANDS + j] *= sqrt(.5);
179 }
180 }
181 common.init = 1;
182 }
183
dct(float * out,const float * in)184 static void dct(float *out, const float *in) {
185 int i;
186 check_init();
187 for (i=0;i<NB_BANDS;i++) {
188 int j;
189 float sum = 0;
190 for (j=0;j<NB_BANDS;j++) {
191 sum += in[j] * common.dct_table[j*NB_BANDS + i];
192 }
193 out[i] = sum*sqrt(2./22);
194 }
195 }
196
197 #if 0
198 static void idct(float *out, const float *in) {
199 int i;
200 check_init();
201 for (i=0;i<NB_BANDS;i++) {
202 int j;
203 float sum = 0;
204 for (j=0;j<NB_BANDS;j++) {
205 sum += in[j] * common.dct_table[i*NB_BANDS + j];
206 }
207 out[i] = sum*sqrt(2./22);
208 }
209 }
210 #endif
211
forward_transform(kiss_fft_cpx * out,const float * in)212 static void forward_transform(kiss_fft_cpx *out, const float *in) {
213 int i;
214 kiss_fft_cpx x[WINDOW_SIZE];
215 kiss_fft_cpx y[WINDOW_SIZE];
216 check_init();
217 for (i=0;i<WINDOW_SIZE;i++) {
218 x[i].r = in[i];
219 x[i].i = 0;
220 }
221 opus_fft(common.kfft, x, y, 0);
222 for (i=0;i<FREQ_SIZE;i++) {
223 out[i] = y[i];
224 }
225 }
226
inverse_transform(float * out,const kiss_fft_cpx * in)227 static void inverse_transform(float *out, const kiss_fft_cpx *in) {
228 int i;
229 kiss_fft_cpx x[WINDOW_SIZE];
230 kiss_fft_cpx y[WINDOW_SIZE];
231 check_init();
232 for (i=0;i<FREQ_SIZE;i++) {
233 x[i] = in[i];
234 }
235 for (;i<WINDOW_SIZE;i++) {
236 x[i].r = x[WINDOW_SIZE - i].r;
237 x[i].i = -x[WINDOW_SIZE - i].i;
238 }
239 opus_fft(common.kfft, x, y, 0);
240 /* output in reverse order for IFFT. */
241 out[0] = WINDOW_SIZE*y[0].r;
242 for (i=1;i<WINDOW_SIZE;i++) {
243 out[i] = WINDOW_SIZE*y[WINDOW_SIZE - i].r;
244 }
245 }
246
apply_window(float * x)247 static void apply_window(float *x) {
248 int i;
249 check_init();
250 for (i=0;i<FRAME_SIZE;i++) {
251 x[i] *= common.half_window[i];
252 x[WINDOW_SIZE - 1 - i] *= common.half_window[i];
253 }
254 }
255
rnnoise_get_size()256 int rnnoise_get_size() {
257 return sizeof(DenoiseState);
258 }
259
rnnoise_get_frame_size()260 int rnnoise_get_frame_size() {
261 return FRAME_SIZE;
262 }
263
rnnoise_init(DenoiseState * st,RNNModel * model)264 int rnnoise_init(DenoiseState *st, RNNModel *model) {
265 memset(st, 0, sizeof(*st));
266 if (model)
267 st->rnn.model = model;
268 else
269 st->rnn.model = &rnnoise_model_orig;
270 st->rnn.vad_gru_state = calloc(sizeof(float), st->rnn.model->vad_gru_size);
271 st->rnn.noise_gru_state = calloc(sizeof(float), st->rnn.model->noise_gru_size);
272 st->rnn.denoise_gru_state = calloc(sizeof(float), st->rnn.model->denoise_gru_size);
273 return 0;
274 }
275
rnnoise_create(RNNModel * model)276 DenoiseState *rnnoise_create(RNNModel *model) {
277 DenoiseState *st;
278 st = malloc(rnnoise_get_size());
279 rnnoise_init(st, model);
280 return st;
281 }
282
rnnoise_destroy(DenoiseState * st)283 void rnnoise_destroy(DenoiseState *st) {
284 free(st->rnn.vad_gru_state);
285 free(st->rnn.noise_gru_state);
286 free(st->rnn.denoise_gru_state);
287 free(st);
288 }
289
290 #if TRAINING
291 int lowpass = FREQ_SIZE;
292 int band_lp = NB_BANDS;
293 #endif
294
frame_analysis(DenoiseState * st,kiss_fft_cpx * X,float * Ex,const float * in)295 static void frame_analysis(DenoiseState *st, kiss_fft_cpx *X, float *Ex, const float *in) {
296 int i;
297 float x[WINDOW_SIZE];
298 RNN_COPY(x, st->analysis_mem, FRAME_SIZE);
299 for (i=0;i<FRAME_SIZE;i++) x[FRAME_SIZE + i] = in[i];
300 RNN_COPY(st->analysis_mem, in, FRAME_SIZE);
301 apply_window(x);
302 forward_transform(X, x);
303 #if TRAINING
304 for (i=lowpass;i<FREQ_SIZE;i++)
305 X[i].r = X[i].i = 0;
306 #endif
307 compute_band_energy(Ex, X);
308 }
309
compute_frame_features(DenoiseState * st,kiss_fft_cpx * X,kiss_fft_cpx * P,float * Ex,float * Ep,float * Exp,float * features,const float * in)310 static int compute_frame_features(DenoiseState *st, kiss_fft_cpx *X, kiss_fft_cpx *P,
311 float *Ex, float *Ep, float *Exp, float *features, const float *in) {
312 int i;
313 float E = 0;
314 float *ceps_0, *ceps_1, *ceps_2;
315 float spec_variability = 0;
316 float Ly[NB_BANDS];
317 float p[WINDOW_SIZE];
318 float pitch_buf[PITCH_BUF_SIZE>>1];
319 int pitch_index;
320 float gain;
321 float *(pre[1]);
322 float tmp[NB_BANDS];
323 float follow, logMax;
324 frame_analysis(st, X, Ex, in);
325 RNN_MOVE(st->pitch_buf, &st->pitch_buf[FRAME_SIZE], PITCH_BUF_SIZE-FRAME_SIZE);
326 RNN_COPY(&st->pitch_buf[PITCH_BUF_SIZE-FRAME_SIZE], in, FRAME_SIZE);
327 pre[0] = &st->pitch_buf[0];
328 pitch_downsample(pre, pitch_buf, PITCH_BUF_SIZE, 1);
329 pitch_search(pitch_buf+(PITCH_MAX_PERIOD>>1), pitch_buf, PITCH_FRAME_SIZE,
330 PITCH_MAX_PERIOD-3*PITCH_MIN_PERIOD, &pitch_index);
331 pitch_index = PITCH_MAX_PERIOD-pitch_index;
332
333 gain = remove_doubling(pitch_buf, PITCH_MAX_PERIOD, PITCH_MIN_PERIOD,
334 PITCH_FRAME_SIZE, &pitch_index, st->last_period, st->last_gain);
335 st->last_period = pitch_index;
336 st->last_gain = gain;
337 for (i=0;i<WINDOW_SIZE;i++)
338 p[i] = st->pitch_buf[PITCH_BUF_SIZE-WINDOW_SIZE-pitch_index+i];
339 apply_window(p);
340 forward_transform(P, p);
341 compute_band_energy(Ep, P);
342 compute_band_corr(Exp, X, P);
343 for (i=0;i<NB_BANDS;i++) Exp[i] = Exp[i]/sqrt(.001+Ex[i]*Ep[i]);
344 dct(tmp, Exp);
345 for (i=0;i<NB_DELTA_CEPS;i++) features[NB_BANDS+2*NB_DELTA_CEPS+i] = tmp[i];
346 features[NB_BANDS+2*NB_DELTA_CEPS] -= 1.3;
347 features[NB_BANDS+2*NB_DELTA_CEPS+1] -= 0.9;
348 features[NB_BANDS+3*NB_DELTA_CEPS] = .01*(pitch_index-300);
349 logMax = -2;
350 follow = -2;
351 for (i=0;i<NB_BANDS;i++) {
352 Ly[i] = log10(1e-2+Ex[i]);
353 Ly[i] = MAX16(logMax-7, MAX16(follow-1.5, Ly[i]));
354 logMax = MAX16(logMax, Ly[i]);
355 follow = MAX16(follow-1.5, Ly[i]);
356 E += Ex[i];
357 }
358 if (!TRAINING && E < 0.04) {
359 /* If there's no audio, avoid messing up the state. */
360 RNN_CLEAR(features, NB_FEATURES);
361 return 1;
362 }
363 dct(features, Ly);
364 features[0] -= 12;
365 features[1] -= 4;
366 ceps_0 = st->cepstral_mem[st->memid];
367 ceps_1 = (st->memid < 1) ? st->cepstral_mem[CEPS_MEM+st->memid-1] : st->cepstral_mem[st->memid-1];
368 ceps_2 = (st->memid < 2) ? st->cepstral_mem[CEPS_MEM+st->memid-2] : st->cepstral_mem[st->memid-2];
369 for (i=0;i<NB_BANDS;i++) ceps_0[i] = features[i];
370 st->memid++;
371 for (i=0;i<NB_DELTA_CEPS;i++) {
372 features[i] = ceps_0[i] + ceps_1[i] + ceps_2[i];
373 features[NB_BANDS+i] = ceps_0[i] - ceps_2[i];
374 features[NB_BANDS+NB_DELTA_CEPS+i] = ceps_0[i] - 2*ceps_1[i] + ceps_2[i];
375 }
376 /* Spectral variability features. */
377 if (st->memid == CEPS_MEM) st->memid = 0;
378 for (i=0;i<CEPS_MEM;i++)
379 {
380 int j;
381 float mindist = 1e15f;
382 for (j=0;j<CEPS_MEM;j++)
383 {
384 int k;
385 float dist=0;
386 for (k=0;k<NB_BANDS;k++)
387 {
388 float tmp;
389 tmp = st->cepstral_mem[i][k] - st->cepstral_mem[j][k];
390 dist += tmp*tmp;
391 }
392 if (j!=i)
393 mindist = MIN32(mindist, dist);
394 }
395 spec_variability += mindist;
396 }
397 features[NB_BANDS+3*NB_DELTA_CEPS+1] = spec_variability/CEPS_MEM-2.1;
398 return TRAINING && E < 0.1;
399 }
400
frame_synthesis(DenoiseState * st,float * out,const kiss_fft_cpx * y)401 static void frame_synthesis(DenoiseState *st, float *out, const kiss_fft_cpx *y) {
402 float x[WINDOW_SIZE];
403 int i;
404 inverse_transform(x, y);
405 apply_window(x);
406 for (i=0;i<FRAME_SIZE;i++) out[i] = x[i] + st->synthesis_mem[i];
407 RNN_COPY(st->synthesis_mem, &x[FRAME_SIZE], FRAME_SIZE);
408 }
409
biquad(float * y,float mem[2],const float * x,const float * b,const float * a,int N)410 static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) {
411 int i;
412 for (i=0;i<N;i++) {
413 float xi, yi;
414 xi = x[i];
415 yi = x[i] + mem[0];
416 mem[0] = mem[1] + (b[0]*(double)xi - a[0]*(double)yi);
417 mem[1] = (b[1]*(double)xi - a[1]*(double)yi);
418 y[i] = yi;
419 }
420 }
421
pitch_filter(kiss_fft_cpx * X,const kiss_fft_cpx * P,const float * Ex,const float * Ep,const float * Exp,const float * g)422 void pitch_filter(kiss_fft_cpx *X, const kiss_fft_cpx *P, const float *Ex, const float *Ep,
423 const float *Exp, const float *g) {
424 int i;
425 float r[NB_BANDS];
426 float rf[FREQ_SIZE] = {0};
427 for (i=0;i<NB_BANDS;i++) {
428 #if 0
429 if (Exp[i]>g[i]) r[i] = 1;
430 else r[i] = Exp[i]*(1-g[i])/(.001 + g[i]*(1-Exp[i]));
431 r[i] = MIN16(1, MAX16(0, r[i]));
432 #else
433 if (Exp[i]>g[i]) r[i] = 1;
434 else r[i] = SQUARE(Exp[i])*(1-SQUARE(g[i]))/(.001 + SQUARE(g[i])*(1-SQUARE(Exp[i])));
435 r[i] = sqrt(MIN16(1, MAX16(0, r[i])));
436 #endif
437 r[i] *= sqrt(Ex[i]/(1e-8+Ep[i]));
438 }
439 interp_band_gain(rf, r);
440 for (i=0;i<FREQ_SIZE;i++) {
441 X[i].r += rf[i]*P[i].r;
442 X[i].i += rf[i]*P[i].i;
443 }
444 float newE[NB_BANDS];
445 compute_band_energy(newE, X);
446 float norm[NB_BANDS];
447 float normf[FREQ_SIZE]={0};
448 for (i=0;i<NB_BANDS;i++) {
449 norm[i] = sqrt(Ex[i]/(1e-8+newE[i]));
450 }
451 interp_band_gain(normf, norm);
452 for (i=0;i<FREQ_SIZE;i++) {
453 X[i].r *= normf[i];
454 X[i].i *= normf[i];
455 }
456 }
457
rnnoise_process_frame(DenoiseState * st,float * out,const float * in)458 float rnnoise_process_frame(DenoiseState *st, float *out, const float *in) {
459 int i;
460 kiss_fft_cpx X[FREQ_SIZE];
461 kiss_fft_cpx P[WINDOW_SIZE];
462 float x[FRAME_SIZE];
463 float Ex[NB_BANDS], Ep[NB_BANDS];
464 float Exp[NB_BANDS];
465 float features[NB_FEATURES];
466 float g[NB_BANDS];
467 float gf[FREQ_SIZE]={1};
468 float vad_prob = 0;
469 int silence;
470 static const float a_hp[2] = {-1.99599, 0.99600};
471 static const float b_hp[2] = {-2, 1};
472 biquad(x, st->mem_hp_x, in, b_hp, a_hp, FRAME_SIZE);
473 silence = compute_frame_features(st, X, P, Ex, Ep, Exp, features, x);
474
475 if (!silence) {
476 compute_rnn(&st->rnn, g, &vad_prob, features);
477 pitch_filter(X, P, Ex, Ep, Exp, g);
478 for (i=0;i<NB_BANDS;i++) {
479 float alpha = .6f;
480 g[i] = MAX16(g[i], alpha*st->lastg[i]);
481 st->lastg[i] = g[i];
482 }
483 interp_band_gain(gf, g);
484 #if 1
485 for (i=0;i<FREQ_SIZE;i++) {
486 X[i].r *= gf[i];
487 X[i].i *= gf[i];
488 }
489 #endif
490 }
491
492 frame_synthesis(st, out, X);
493 return vad_prob;
494 }
495
496 #if TRAINING
497
uni_rand()498 static float uni_rand() {
499 return rand()/(double)RAND_MAX-.5;
500 }
501
rand_resp(float * a,float * b)502 static void rand_resp(float *a, float *b) {
503 a[0] = .75*uni_rand();
504 a[1] = .75*uni_rand();
505 b[0] = .75*uni_rand();
506 b[1] = .75*uni_rand();
507 }
508
main(int argc,char ** argv)509 int main(int argc, char **argv) {
510 int i;
511 int count=0;
512 static const float a_hp[2] = {-1.99599, 0.99600};
513 static const float b_hp[2] = {-2, 1};
514 float a_noise[2] = {0};
515 float b_noise[2] = {0};
516 float a_sig[2] = {0};
517 float b_sig[2] = {0};
518 float mem_hp_x[2]={0};
519 float mem_hp_n[2]={0};
520 float mem_resp_x[2]={0};
521 float mem_resp_n[2]={0};
522 float x[FRAME_SIZE];
523 float n[FRAME_SIZE];
524 float xn[FRAME_SIZE];
525 int vad_cnt=0;
526 int gain_change_count=0;
527 float speech_gain = 1, noise_gain = 1;
528 FILE *f1, *f2;
529 int maxCount;
530 DenoiseState *st;
531 DenoiseState *noise_state;
532 DenoiseState *noisy;
533 st = rnnoise_create(NULL);
534 noise_state = rnnoise_create(NULL);
535 noisy = rnnoise_create(NULL);
536 if (argc!=4) {
537 fprintf(stderr, "usage: %s <speech> <noise> <count>\n", argv[0]);
538 return 1;
539 }
540 f1 = fopen(argv[1], "r");
541 f2 = fopen(argv[2], "r");
542 maxCount = atoi(argv[3]);
543 for(i=0;i<150;i++) {
544 short tmp[FRAME_SIZE];
545 fread(tmp, sizeof(short), FRAME_SIZE, f2);
546 }
547 while (1) {
548 kiss_fft_cpx X[FREQ_SIZE], Y[FREQ_SIZE], N[FREQ_SIZE], P[WINDOW_SIZE];
549 float Ex[NB_BANDS], Ey[NB_BANDS], En[NB_BANDS], Ep[NB_BANDS];
550 float Exp[NB_BANDS];
551 float Ln[NB_BANDS];
552 float features[NB_FEATURES];
553 float g[NB_BANDS];
554 short tmp[FRAME_SIZE];
555 float vad=0;
556 float E=0;
557 if (count==maxCount) break;
558 if ((count%1000)==0) fprintf(stderr, "%d\r", count);
559 if (++gain_change_count > 2821) {
560 speech_gain = pow(10., (-40+(rand()%60))/20.);
561 noise_gain = pow(10., (-30+(rand()%50))/20.);
562 if (rand()%10==0) noise_gain = 0;
563 noise_gain *= speech_gain;
564 if (rand()%10==0) speech_gain = 0;
565 gain_change_count = 0;
566 rand_resp(a_noise, b_noise);
567 rand_resp(a_sig, b_sig);
568 lowpass = FREQ_SIZE * 3000./24000. * pow(50., rand()/(double)RAND_MAX);
569 for (i=0;i<NB_BANDS;i++) {
570 if (eband5ms[i]<<FRAME_SIZE_SHIFT > lowpass) {
571 band_lp = i;
572 break;
573 }
574 }
575 }
576 if (speech_gain != 0) {
577 fread(tmp, sizeof(short), FRAME_SIZE, f1);
578 if (feof(f1)) {
579 rewind(f1);
580 fread(tmp, sizeof(short), FRAME_SIZE, f1);
581 }
582 for (i=0;i<FRAME_SIZE;i++) x[i] = speech_gain*tmp[i];
583 for (i=0;i<FRAME_SIZE;i++) E += tmp[i]*(float)tmp[i];
584 } else {
585 for (i=0;i<FRAME_SIZE;i++) x[i] = 0;
586 E = 0;
587 }
588 if (noise_gain!=0) {
589 fread(tmp, sizeof(short), FRAME_SIZE, f2);
590 if (feof(f2)) {
591 rewind(f2);
592 fread(tmp, sizeof(short), FRAME_SIZE, f2);
593 }
594 for (i=0;i<FRAME_SIZE;i++) n[i] = noise_gain*tmp[i];
595 } else {
596 for (i=0;i<FRAME_SIZE;i++) n[i] = 0;
597 }
598 biquad(x, mem_hp_x, x, b_hp, a_hp, FRAME_SIZE);
599 biquad(x, mem_resp_x, x, b_sig, a_sig, FRAME_SIZE);
600 biquad(n, mem_hp_n, n, b_hp, a_hp, FRAME_SIZE);
601 biquad(n, mem_resp_n, n, b_noise, a_noise, FRAME_SIZE);
602 for (i=0;i<FRAME_SIZE;i++) xn[i] = x[i] + n[i];
603 if (E > 1e9f) {
604 vad_cnt=0;
605 } else if (E > 1e8f) {
606 vad_cnt -= 5;
607 } else if (E > 1e7f) {
608 vad_cnt++;
609 } else {
610 vad_cnt+=2;
611 }
612 if (vad_cnt < 0) vad_cnt = 0;
613 if (vad_cnt > 15) vad_cnt = 15;
614
615 if (vad_cnt >= 10) vad = 0;
616 else if (vad_cnt > 0) vad = 0.5f;
617 else vad = 1.f;
618
619 frame_analysis(st, Y, Ey, x);
620 frame_analysis(noise_state, N, En, n);
621 for (i=0;i<NB_BANDS;i++) Ln[i] = log10(1e-2+En[i]);
622 int silence = compute_frame_features(noisy, X, P, Ex, Ep, Exp, features, xn);
623 pitch_filter(X, P, Ex, Ep, Exp, g);
624 //printf("%f %d\n", noisy->last_gain, noisy->last_period);
625 for (i=0;i<NB_BANDS;i++) {
626 g[i] = sqrt((Ey[i]+1e-3)/(Ex[i]+1e-3));
627 if (g[i] > 1) g[i] = 1;
628 if (silence || i > band_lp) g[i] = -1;
629 if (Ey[i] < 5e-2 && Ex[i] < 5e-2) g[i] = -1;
630 if (vad==0 && noise_gain==0) g[i] = -1;
631 }
632 count++;
633 #if 1
634 fwrite(features, sizeof(float), NB_FEATURES, stdout);
635 fwrite(g, sizeof(float), NB_BANDS, stdout);
636 fwrite(Ln, sizeof(float), NB_BANDS, stdout);
637 fwrite(&vad, sizeof(float), 1, stdout);
638 #endif
639 }
640 fprintf(stderr, "matrix size: %d x %d\n", count, NB_FEATURES + 2*NB_BANDS + 1);
641 fclose(f1);
642 fclose(f2);
643 return 0;
644 }
645
646 #endif
647