1 /* Copyright (c) 2011 Xiph.Org Foundation
2 Written by Jean-Marc Valin */
3 /*
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
19 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include "kiss_fft.h"
33 #include "celt.h"
34 #include "modes.h"
35 #include "arch.h"
36 #include "quant_bands.h"
37 #include <stdio.h>
38 #include "analysis.h"
39 #include "mlp.h"
40 #include "stack_alloc.h"
41
42 #ifndef M_PI
43 #define M_PI 3.141592653
44 #endif
45
46 static const float dct_table[128] = {
47 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
48 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
49 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
50 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
51 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
52 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
53 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
54 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
55 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
56 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
57 0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
58 -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
59 0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
60 -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
61 0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
62 0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
63 };
64
65 static const float analysis_window[240] = {
66 0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
67 0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
68 0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
69 0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
70 0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
71 0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
72 0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
73 0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
74 0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
75 0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
76 0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
77 0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
78 0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
79 0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
80 0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
81 0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
82 0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
83 0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
84 0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
85 0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
86 0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
87 0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
88 0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
89 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
90 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
91 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
92 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
93 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
94 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
95 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
96 };
97
98 static const int tbands[NB_TBANDS+1] = {
99 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
100 };
101
102 static const int extra_bands[NB_TOT_BANDS+1] = {
103 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
104 };
105
106 /*static const float tweight[NB_TBANDS+1] = {
107 .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
108 };*/
109
110 #define NB_TONAL_SKIP_BANDS 9
111
112 #define cA 0.43157974f
113 #define cB 0.67848403f
114 #define cC 0.08595542f
115 #define cE ((float)M_PI/2)
fast_atan2f(float y,float x)116 static OPUS_INLINE float fast_atan2f(float y, float x) {
117 float x2, y2;
118 /* Should avoid underflow on the values we'll get */
119 if (ABS16(x)+ABS16(y)<1e-9f)
120 {
121 x*=1e12f;
122 y*=1e12f;
123 }
124 x2 = x*x;
125 y2 = y*y;
126 if(x2<y2){
127 float den = (y2 + cB*x2) * (y2 + cC*x2);
128 if (den!=0)
129 return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
130 else
131 return (y<0 ? -cE : cE);
132 }else{
133 float den = (x2 + cB*y2) * (x2 + cC*y2);
134 if (den!=0)
135 return x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
136 else
137 return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
138 }
139 }
140
tonality_analysis_init(TonalityAnalysisState * tonal)141 void tonality_analysis_init(TonalityAnalysisState *tonal)
142 {
143 /* Initialize reusable fields. */
144 tonal->arch = opus_select_arch();
145 /* Clear remaining fields. */
146 tonality_analysis_reset(tonal);
147 }
148
tonality_analysis_reset(TonalityAnalysisState * tonal)149 void tonality_analysis_reset(TonalityAnalysisState *tonal)
150 {
151 /* Clear non-reusable fields. */
152 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
153 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
154 }
155
tonality_get_info(TonalityAnalysisState * tonal,AnalysisInfo * info_out,int len)156 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
157 {
158 int pos;
159 int curr_lookahead;
160 float psum;
161 int i;
162
163 pos = tonal->read_pos;
164 curr_lookahead = tonal->write_pos-tonal->read_pos;
165 if (curr_lookahead<0)
166 curr_lookahead += DETECT_SIZE;
167
168 if (len > 480 && pos != tonal->write_pos)
169 {
170 pos++;
171 if (pos==DETECT_SIZE)
172 pos=0;
173 }
174 if (pos == tonal->write_pos)
175 pos--;
176 if (pos<0)
177 pos = DETECT_SIZE-1;
178 OPUS_COPY(info_out, &tonal->info[pos], 1);
179 tonal->read_subframe += len/120;
180 while (tonal->read_subframe>=4)
181 {
182 tonal->read_subframe -= 4;
183 tonal->read_pos++;
184 }
185 if (tonal->read_pos>=DETECT_SIZE)
186 tonal->read_pos-=DETECT_SIZE;
187
188 /* Compensate for the delay in the features themselves.
189 FIXME: Need a better estimate the 10 I just made up */
190 curr_lookahead = IMAX(curr_lookahead-10, 0);
191
192 psum=0;
193 /* Summing the probability of transition patterns that involve music at
194 time (DETECT_SIZE-curr_lookahead-1) */
195 for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
196 psum += tonal->pmusic[i];
197 for (;i<DETECT_SIZE;i++)
198 psum += tonal->pspeech[i];
199 psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
200 /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
201
202 info_out->music_prob = psum;
203 }
204
tonality_analysis(TonalityAnalysisState * tonal,const CELTMode * celt_mode,const void * x,int len,int offset,int c1,int c2,int C,int lsb_depth,downmix_func downmix)205 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
206 {
207 int i, b;
208 const kiss_fft_state *kfft;
209 VARDECL(kiss_fft_cpx, in);
210 VARDECL(kiss_fft_cpx, out);
211 int N = 480, N2=240;
212 float * OPUS_RESTRICT A = tonal->angle;
213 float * OPUS_RESTRICT dA = tonal->d_angle;
214 float * OPUS_RESTRICT d2A = tonal->d2_angle;
215 VARDECL(float, tonality);
216 VARDECL(float, noisiness);
217 float band_tonality[NB_TBANDS];
218 float logE[NB_TBANDS];
219 float BFCC[8];
220 float features[25];
221 float frame_tonality;
222 float max_frame_tonality;
223 /*float tw_sum=0;*/
224 float frame_noisiness;
225 const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
226 float slope=0;
227 float frame_stationarity;
228 float relativeE;
229 float frame_probs[2];
230 float alpha, alphaE, alphaE2;
231 float frame_loudness;
232 float bandwidth_mask;
233 int bandwidth=0;
234 float maxE = 0;
235 float noise_floor;
236 int remaining;
237 AnalysisInfo *info;
238 SAVE_STACK;
239
240 tonal->last_transition++;
241 alpha = 1.f/IMIN(20, 1+tonal->count);
242 alphaE = 1.f/IMIN(50, 1+tonal->count);
243 alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
244
245 if (tonal->count<4)
246 tonal->music_prob = .5;
247 kfft = celt_mode->mdct.kfft[0];
248 if (tonal->count==0)
249 tonal->mem_fill = 240;
250 downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
251 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
252 {
253 tonal->mem_fill += len;
254 /* Don't have enough to update the analysis */
255 RESTORE_STACK;
256 return;
257 }
258 info = &tonal->info[tonal->write_pos++];
259 if (tonal->write_pos>=DETECT_SIZE)
260 tonal->write_pos-=DETECT_SIZE;
261
262 ALLOC(in, 480, kiss_fft_cpx);
263 ALLOC(out, 480, kiss_fft_cpx);
264 ALLOC(tonality, 240, float);
265 ALLOC(noisiness, 240, float);
266 for (i=0;i<N2;i++)
267 {
268 float w = analysis_window[i];
269 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
270 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
271 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
272 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
273 }
274 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
275 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
276 downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
277 tonal->mem_fill = 240 + remaining;
278 opus_fft(kfft, in, out, tonal->arch);
279 #ifndef FIXED_POINT
280 /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
281 if (celt_isnan(out[0].r))
282 {
283 info->valid = 0;
284 RESTORE_STACK;
285 return;
286 }
287 #endif
288
289 for (i=1;i<N2;i++)
290 {
291 float X1r, X2r, X1i, X2i;
292 float angle, d_angle, d2_angle;
293 float angle2, d_angle2, d2_angle2;
294 float mod1, mod2, avg_mod;
295 X1r = (float)out[i].r+out[N-i].r;
296 X1i = (float)out[i].i-out[N-i].i;
297 X2r = (float)out[i].i+out[N-i].i;
298 X2i = (float)out[N-i].r-out[i].r;
299
300 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
301 d_angle = angle - A[i];
302 d2_angle = d_angle - dA[i];
303
304 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
305 d_angle2 = angle2 - angle;
306 d2_angle2 = d_angle2 - d_angle;
307
308 mod1 = d2_angle - (float)floor(.5+d2_angle);
309 noisiness[i] = ABS16(mod1);
310 mod1 *= mod1;
311 mod1 *= mod1;
312
313 mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
314 noisiness[i] += ABS16(mod2);
315 mod2 *= mod2;
316 mod2 *= mod2;
317
318 avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
319 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
320
321 A[i] = angle2;
322 dA[i] = d_angle2;
323 d2A[i] = mod2;
324 }
325
326 frame_tonality = 0;
327 max_frame_tonality = 0;
328 /*tw_sum = 0;*/
329 info->activity = 0;
330 frame_noisiness = 0;
331 frame_stationarity = 0;
332 if (!tonal->count)
333 {
334 for (b=0;b<NB_TBANDS;b++)
335 {
336 tonal->lowE[b] = 1e10;
337 tonal->highE[b] = -1e10;
338 }
339 }
340 relativeE = 0;
341 frame_loudness = 0;
342 for (b=0;b<NB_TBANDS;b++)
343 {
344 float E=0, tE=0, nE=0;
345 float L1, L2;
346 float stationarity;
347 for (i=tbands[b];i<tbands[b+1];i++)
348 {
349 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
350 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
351 #ifdef FIXED_POINT
352 /* FIXME: It's probably best to change the BFCC filter initial state instead */
353 binE *= 5.55e-17f;
354 #endif
355 E += binE;
356 tE += binE*tonality[i];
357 nE += binE*2.f*(.5f-noisiness[i]);
358 }
359 #ifndef FIXED_POINT
360 /* Check for extreme band energies that could cause NaNs later. */
361 if (!(E<1e9f) || celt_isnan(E))
362 {
363 info->valid = 0;
364 RESTORE_STACK;
365 return;
366 }
367 #endif
368
369 tonal->E[tonal->E_count][b] = E;
370 frame_noisiness += nE/(1e-15f+E);
371
372 frame_loudness += (float)sqrt(E+1e-10f);
373 logE[b] = (float)log(E+1e-10f);
374 tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
375 tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
376 if (tonal->highE[b] < tonal->lowE[b]+1.f)
377 {
378 tonal->highE[b]+=.5f;
379 tonal->lowE[b]-=.5f;
380 }
381 relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]);
382
383 L1=L2=0;
384 for (i=0;i<NB_FRAMES;i++)
385 {
386 L1 += (float)sqrt(tonal->E[i][b]);
387 L2 += tonal->E[i][b];
388 }
389
390 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
391 stationarity *= stationarity;
392 stationarity *= stationarity;
393 frame_stationarity += stationarity;
394 /*band_tonality[b] = tE/(1e-15+E)*/;
395 band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
396 #if 0
397 if (b>=NB_TONAL_SKIP_BANDS)
398 {
399 frame_tonality += tweight[b]*band_tonality[b];
400 tw_sum += tweight[b];
401 }
402 #else
403 frame_tonality += band_tonality[b];
404 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
405 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
406 #endif
407 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
408 slope += band_tonality[b]*(b-8);
409 /*printf("%f %f ", band_tonality[b], stationarity);*/
410 tonal->prev_band_tonality[b] = band_tonality[b];
411 }
412
413 bandwidth_mask = 0;
414 bandwidth = 0;
415 maxE = 0;
416 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
417 #ifdef FIXED_POINT
418 noise_floor *= 1<<(15+SIG_SHIFT);
419 #endif
420 noise_floor *= noise_floor;
421 for (b=0;b<NB_TOT_BANDS;b++)
422 {
423 float E=0;
424 int band_start, band_end;
425 /* Keep a margin of 300 Hz for aliasing */
426 band_start = extra_bands[b];
427 band_end = extra_bands[b+1];
428 for (i=band_start;i<band_end;i++)
429 {
430 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
431 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
432 E += binE;
433 }
434 maxE = MAX32(maxE, E);
435 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
436 E = MAX32(E, tonal->meanE[b]);
437 /* Use a simple follower with 13 dB/Bark slope for spreading function */
438 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
439 /* Consider the band "active" only if all these conditions are met:
440 1) less than 10 dB below the simple follower
441 2) less than 90 dB below the peak band (maximal masking possible considering
442 both the ATH and the loudness-dependent slope of the spreading function)
443 3) above the PCM quantization noise floor
444 */
445 if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
446 bandwidth = b;
447 }
448 if (tonal->count<=2)
449 bandwidth = 20;
450 frame_loudness = 20*(float)log10(frame_loudness);
451 tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
452 tonal->lowECount *= (1-alphaE);
453 if (frame_loudness < tonal->Etracker-30)
454 tonal->lowECount += alphaE;
455
456 for (i=0;i<8;i++)
457 {
458 float sum=0;
459 for (b=0;b<16;b++)
460 sum += dct_table[i*16+b]*logE[b];
461 BFCC[i] = sum;
462 }
463
464 frame_stationarity /= NB_TBANDS;
465 relativeE /= NB_TBANDS;
466 if (tonal->count<10)
467 relativeE = .5;
468 frame_noisiness /= NB_TBANDS;
469 #if 1
470 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
471 #else
472 info->activity = .5*(1+frame_noisiness-frame_stationarity);
473 #endif
474 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
475 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
476 tonal->prev_tonality = frame_tonality;
477
478 slope /= 8*8;
479 info->tonality_slope = slope;
480
481 tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
482 tonal->count++;
483 info->tonality = frame_tonality;
484
485 for (i=0;i<4;i++)
486 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
487
488 for (i=0;i<4;i++)
489 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
490
491 for (i=0;i<4;i++)
492 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
493 for (i=0;i<3;i++)
494 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
495
496 if (tonal->count > 5)
497 {
498 for (i=0;i<9;i++)
499 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
500 }
501
502 for (i=0;i<8;i++)
503 {
504 tonal->mem[i+24] = tonal->mem[i+16];
505 tonal->mem[i+16] = tonal->mem[i+8];
506 tonal->mem[i+8] = tonal->mem[i];
507 tonal->mem[i] = BFCC[i];
508 }
509 for (i=0;i<9;i++)
510 features[11+i] = (float)sqrt(tonal->std[i]);
511 features[20] = info->tonality;
512 features[21] = info->activity;
513 features[22] = frame_stationarity;
514 features[23] = info->tonality_slope;
515 features[24] = tonal->lowECount;
516
517 #ifndef DISABLE_FLOAT_API
518 mlp_process(&net, features, frame_probs);
519 frame_probs[0] = .5f*(frame_probs[0]+1);
520 /* Curve fitting between the MLP probability and the actual probability */
521 frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
522 /* Probability of active audio (as opposed to silence) */
523 frame_probs[1] = .5f*frame_probs[1]+.5f;
524 /* Consider that silence has a 50-50 probability. */
525 frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
526
527 /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
528 {
529 /* Probability of state transition */
530 float tau;
531 /* Represents independence of the MLP probabilities, where
532 beta=1 means fully independent. */
533 float beta;
534 /* Denormalized probability of speech (p0) and music (p1) after update */
535 float p0, p1;
536 /* Probabilities for "all speech" and "all music" */
537 float s0, m0;
538 /* Probability sum for renormalisation */
539 float psum;
540 /* Instantaneous probability of speech and music, with beta pre-applied. */
541 float speech0;
542 float music0;
543 float p, q;
544
545 /* One transition every 3 minutes of active audio */
546 tau = .00005f*frame_probs[1];
547 /* Adapt beta based on how "unexpected" the new prob is */
548 p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
549 q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
550 beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
551 /* p0 and p1 are the probabilities of speech and music at this frame
552 using only information from previous frame and applying the
553 state transition model */
554 p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau;
555 p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
556 /* We apply the current probability with exponent beta to work around
557 the fact that the probability estimates aren't independent. */
558 p0 *= (float)pow(1-frame_probs[0], beta);
559 p1 *= (float)pow(frame_probs[0], beta);
560 /* Normalise the probabilities to get the Marokv probability of music. */
561 tonal->music_prob = p1/(p0+p1);
562 info->music_prob = tonal->music_prob;
563
564 /* This chunk of code deals with delayed decision. */
565 psum=1e-20f;
566 /* Instantaneous probability of speech and music, with beta pre-applied. */
567 speech0 = (float)pow(1-frame_probs[0], beta);
568 music0 = (float)pow(frame_probs[0], beta);
569 if (tonal->count==1)
570 {
571 tonal->pspeech[0]=.5;
572 tonal->pmusic [0]=.5;
573 }
574 /* Updated probability of having only speech (s0) or only music (m0),
575 before considering the new observation. */
576 s0 = tonal->pspeech[0] + tonal->pspeech[1];
577 m0 = tonal->pmusic [0] + tonal->pmusic [1];
578 /* Updates s0 and m0 with instantaneous probability. */
579 tonal->pspeech[0] = s0*(1-tau)*speech0;
580 tonal->pmusic [0] = m0*(1-tau)*music0;
581 /* Propagate the transition probabilities */
582 for (i=1;i<DETECT_SIZE-1;i++)
583 {
584 tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
585 tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
586 }
587 /* Probability that the latest frame is speech, when all the previous ones were music. */
588 tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
589 /* Probability that the latest frame is music, when all the previous ones were speech. */
590 tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
591
592 /* Renormalise probabilities to 1 */
593 for (i=0;i<DETECT_SIZE;i++)
594 psum += tonal->pspeech[i] + tonal->pmusic[i];
595 psum = 1.f/psum;
596 for (i=0;i<DETECT_SIZE;i++)
597 {
598 tonal->pspeech[i] *= psum;
599 tonal->pmusic [i] *= psum;
600 }
601 psum = tonal->pmusic[0];
602 for (i=1;i<DETECT_SIZE;i++)
603 psum += tonal->pspeech[i];
604
605 /* Estimate our confidence in the speech/music decisions */
606 if (frame_probs[1]>.75)
607 {
608 if (tonal->music_prob>.9)
609 {
610 float adapt;
611 adapt = 1.f/(++tonal->music_confidence_count);
612 tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
613 tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
614 }
615 if (tonal->music_prob<.1)
616 {
617 float adapt;
618 adapt = 1.f/(++tonal->speech_confidence_count);
619 tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
620 tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
621 }
622 } else {
623 if (tonal->music_confidence_count==0)
624 tonal->music_confidence = .9f;
625 if (tonal->speech_confidence_count==0)
626 tonal->speech_confidence = .1f;
627 }
628 }
629 if (tonal->last_music != (tonal->music_prob>.5f))
630 tonal->last_transition=0;
631 tonal->last_music = tonal->music_prob>.5f;
632 #else
633 info->music_prob = 0;
634 #endif
635 /*for (i=0;i<25;i++)
636 printf("%f ", features[i]);
637 printf("\n");*/
638
639 info->bandwidth = bandwidth;
640 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
641 info->noisiness = frame_noisiness;
642 info->valid = 1;
643 RESTORE_STACK;
644 }
645
run_analysis(TonalityAnalysisState * analysis,const CELTMode * celt_mode,const void * analysis_pcm,int analysis_frame_size,int frame_size,int c1,int c2,int C,opus_int32 Fs,int lsb_depth,downmix_func downmix,AnalysisInfo * analysis_info)646 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
647 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
648 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
649 {
650 int offset;
651 int pcm_len;
652
653 if (analysis_pcm != NULL)
654 {
655 /* Avoid overflow/wrap-around of the analysis buffer */
656 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);
657
658 pcm_len = analysis_frame_size - analysis->analysis_offset;
659 offset = analysis->analysis_offset;
660 do {
661 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
662 offset += 480;
663 pcm_len -= 480;
664 } while (pcm_len>0);
665 analysis->analysis_offset = analysis_frame_size;
666
667 analysis->analysis_offset -= frame_size;
668 }
669
670 analysis_info->valid = 0;
671 tonality_get_info(analysis, analysis_info, frame_size);
672 }
673