1 /* Copyright (c) 2011 Xiph.Org Foundation
2 Written by Jean-Marc Valin */
3 /*
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
19 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include "kiss_fft.h"
33 #include "celt.h"
34 #include "modes.h"
35 #include "arch.h"
36 #include "quant_bands.h"
37 #include <stdio.h>
38 #include "analysis.h"
39 #include "mlp.h"
40 #include "stack_alloc.h"
41
42 extern const MLP net;
43
44 #ifndef M_PI
45 #define M_PI 3.141592653
46 #endif
47
48 static const float dct_table[128] = {
49 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
50 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
51 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
52 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
53 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
54 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
55 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
56 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
57 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
58 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
59 0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
60 -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
61 0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
62 -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
63 0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
64 0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
65 };
66
67 static const float analysis_window[240] = {
68 0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
69 0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
70 0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
71 0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
72 0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
73 0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
74 0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
75 0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
76 0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
77 0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
78 0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
79 0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
80 0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
81 0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
82 0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
83 0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
84 0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
85 0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
86 0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
87 0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
88 0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
89 0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
90 0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
91 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
92 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
93 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
94 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
95 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
96 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
97 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
98 };
99
100 static const int tbands[NB_TBANDS+1] = {
101 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
102 };
103
104 static const int extra_bands[NB_TOT_BANDS+1] = {
105 1, 2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
106 };
107
108 /*static const float tweight[NB_TBANDS+1] = {
109 .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
110 };*/
111
112 #define NB_TONAL_SKIP_BANDS 9
113
114 #define cA 0.43157974f
115 #define cB 0.67848403f
116 #define cC 0.08595542f
117 #define cE ((float)M_PI/2)
fast_atan2f(float y,float x)118 static OPUS_INLINE float fast_atan2f(float y, float x) {
119 float x2, y2;
120 /* Should avoid underflow on the values we'll get */
121 if (ABS16(x)+ABS16(y)<1e-9f)
122 {
123 x*=1e12f;
124 y*=1e12f;
125 }
126 x2 = x*x;
127 y2 = y*y;
128 if(x2<y2){
129 float den = (y2 + cB*x2) * (y2 + cC*x2);
130 if (den!=0)
131 return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
132 else
133 return (y<0 ? -cE : cE);
134 }else{
135 float den = (x2 + cB*y2) * (x2 + cC*y2);
136 if (den!=0)
137 return x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
138 else
139 return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
140 }
141 }
142
tonality_get_info(TonalityAnalysisState * tonal,AnalysisInfo * info_out,int len)143 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
144 {
145 int pos;
146 int curr_lookahead;
147 float psum;
148 int i;
149
150 pos = tonal->read_pos;
151 curr_lookahead = tonal->write_pos-tonal->read_pos;
152 if (curr_lookahead<0)
153 curr_lookahead += DETECT_SIZE;
154
155 if (len > 480 && pos != tonal->write_pos)
156 {
157 pos++;
158 if (pos==DETECT_SIZE)
159 pos=0;
160 }
161 if (pos == tonal->write_pos)
162 pos--;
163 if (pos<0)
164 pos = DETECT_SIZE-1;
165 OPUS_COPY(info_out, &tonal->info[pos], 1);
166 tonal->read_subframe += len/120;
167 while (tonal->read_subframe>=4)
168 {
169 tonal->read_subframe -= 4;
170 tonal->read_pos++;
171 }
172 if (tonal->read_pos>=DETECT_SIZE)
173 tonal->read_pos-=DETECT_SIZE;
174
175 /* Compensate for the delay in the features themselves.
176 FIXME: Need a better estimate the 10 I just made up */
177 curr_lookahead = IMAX(curr_lookahead-10, 0);
178
179 psum=0;
180 /* Summing the probability of transition patterns that involve music at
181 time (DETECT_SIZE-curr_lookahead-1) */
182 for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
183 psum += tonal->pmusic[i];
184 for (;i<DETECT_SIZE;i++)
185 psum += tonal->pspeech[i];
186 psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
187 /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
188
189 info_out->music_prob = psum;
190 }
191
tonality_analysis(TonalityAnalysisState * tonal,AnalysisInfo * info_out,const CELTMode * celt_mode,const void * x,int len,int offset,int c1,int c2,int C,int lsb_depth,downmix_func downmix)192 void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
193 {
194 int i, b;
195 const kiss_fft_state *kfft;
196 VARDECL(kiss_fft_cpx, in);
197 VARDECL(kiss_fft_cpx, out);
198 int N = 480, N2=240;
199 float * OPUS_RESTRICT A = tonal->angle;
200 float * OPUS_RESTRICT dA = tonal->d_angle;
201 float * OPUS_RESTRICT d2A = tonal->d2_angle;
202 VARDECL(float, tonality);
203 VARDECL(float, noisiness);
204 float band_tonality[NB_TBANDS];
205 float logE[NB_TBANDS];
206 float BFCC[8];
207 float features[25];
208 float frame_tonality;
209 float max_frame_tonality;
210 /*float tw_sum=0;*/
211 float frame_noisiness;
212 const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
213 float slope=0;
214 float frame_stationarity;
215 float relativeE;
216 float frame_probs[2];
217 float alpha, alphaE, alphaE2;
218 float frame_loudness;
219 float bandwidth_mask;
220 int bandwidth=0;
221 float maxE = 0;
222 float noise_floor;
223 int remaining;
224 AnalysisInfo *info;
225 SAVE_STACK;
226
227 tonal->last_transition++;
228 alpha = 1.f/IMIN(20, 1+tonal->count);
229 alphaE = 1.f/IMIN(50, 1+tonal->count);
230 alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
231
232 if (tonal->count<4)
233 tonal->music_prob = .5;
234 kfft = celt_mode->mdct.kfft[0];
235 if (tonal->count==0)
236 tonal->mem_fill = 240;
237 downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
238 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
239 {
240 tonal->mem_fill += len;
241 /* Don't have enough to update the analysis */
242 RESTORE_STACK;
243 return;
244 }
245 info = &tonal->info[tonal->write_pos++];
246 if (tonal->write_pos>=DETECT_SIZE)
247 tonal->write_pos-=DETECT_SIZE;
248
249 ALLOC(in, 480, kiss_fft_cpx);
250 ALLOC(out, 480, kiss_fft_cpx);
251 ALLOC(tonality, 240, float);
252 ALLOC(noisiness, 240, float);
253 for (i=0;i<N2;i++)
254 {
255 float w = analysis_window[i];
256 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
257 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
258 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
259 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
260 }
261 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
262 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
263 downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
264 tonal->mem_fill = 240 + remaining;
265 opus_fft(kfft, in, out);
266
267 for (i=1;i<N2;i++)
268 {
269 float X1r, X2r, X1i, X2i;
270 float angle, d_angle, d2_angle;
271 float angle2, d_angle2, d2_angle2;
272 float mod1, mod2, avg_mod;
273 X1r = (float)out[i].r+out[N-i].r;
274 X1i = (float)out[i].i-out[N-i].i;
275 X2r = (float)out[i].i+out[N-i].i;
276 X2i = (float)out[N-i].r-out[i].r;
277
278 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
279 d_angle = angle - A[i];
280 d2_angle = d_angle - dA[i];
281
282 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
283 d_angle2 = angle2 - angle;
284 d2_angle2 = d_angle2 - d_angle;
285
286 mod1 = d2_angle - (float)floor(.5+d2_angle);
287 noisiness[i] = ABS16(mod1);
288 mod1 *= mod1;
289 mod1 *= mod1;
290
291 mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
292 noisiness[i] += ABS16(mod2);
293 mod2 *= mod2;
294 mod2 *= mod2;
295
296 avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
297 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
298
299 A[i] = angle2;
300 dA[i] = d_angle2;
301 d2A[i] = mod2;
302 }
303
304 frame_tonality = 0;
305 max_frame_tonality = 0;
306 /*tw_sum = 0;*/
307 info->activity = 0;
308 frame_noisiness = 0;
309 frame_stationarity = 0;
310 if (!tonal->count)
311 {
312 for (b=0;b<NB_TBANDS;b++)
313 {
314 tonal->lowE[b] = 1e10;
315 tonal->highE[b] = -1e10;
316 }
317 }
318 relativeE = 0;
319 frame_loudness = 0;
320 for (b=0;b<NB_TBANDS;b++)
321 {
322 float E=0, tE=0, nE=0;
323 float L1, L2;
324 float stationarity;
325 for (i=tbands[b];i<tbands[b+1];i++)
326 {
327 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
328 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
329 #ifdef FIXED_POINT
330 /* FIXME: It's probably best to change the BFCC filter initial state instead */
331 binE *= 5.55e-17f;
332 #endif
333 E += binE;
334 tE += binE*tonality[i];
335 nE += binE*2.f*(.5f-noisiness[i]);
336 }
337 tonal->E[tonal->E_count][b] = E;
338 frame_noisiness += nE/(1e-15f+E);
339
340 frame_loudness += (float)sqrt(E+1e-10f);
341 logE[b] = (float)log(E+1e-10f);
342 tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
343 tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
344 if (tonal->highE[b] < tonal->lowE[b]+1.f)
345 {
346 tonal->highE[b]+=.5f;
347 tonal->lowE[b]-=.5f;
348 }
349 relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]);
350
351 L1=L2=0;
352 for (i=0;i<NB_FRAMES;i++)
353 {
354 L1 += (float)sqrt(tonal->E[i][b]);
355 L2 += tonal->E[i][b];
356 }
357
358 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
359 stationarity *= stationarity;
360 stationarity *= stationarity;
361 frame_stationarity += stationarity;
362 /*band_tonality[b] = tE/(1e-15+E)*/;
363 band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
364 #if 0
365 if (b>=NB_TONAL_SKIP_BANDS)
366 {
367 frame_tonality += tweight[b]*band_tonality[b];
368 tw_sum += tweight[b];
369 }
370 #else
371 frame_tonality += band_tonality[b];
372 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
373 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
374 #endif
375 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
376 slope += band_tonality[b]*(b-8);
377 /*printf("%f %f ", band_tonality[b], stationarity);*/
378 tonal->prev_band_tonality[b] = band_tonality[b];
379 }
380
381 bandwidth_mask = 0;
382 bandwidth = 0;
383 maxE = 0;
384 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
385 #ifdef FIXED_POINT
386 noise_floor *= 1<<(15+SIG_SHIFT);
387 #endif
388 noise_floor *= noise_floor;
389 for (b=0;b<NB_TOT_BANDS;b++)
390 {
391 float E=0;
392 int band_start, band_end;
393 /* Keep a margin of 300 Hz for aliasing */
394 band_start = extra_bands[b];
395 band_end = extra_bands[b+1];
396 for (i=band_start;i<band_end;i++)
397 {
398 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
399 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
400 E += binE;
401 }
402 maxE = MAX32(maxE, E);
403 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
404 E = MAX32(E, tonal->meanE[b]);
405 /* Use a simple follower with 13 dB/Bark slope for spreading function */
406 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
407 /* Consider the band "active" only if all these conditions are met:
408 1) less than 10 dB below the simple follower
409 2) less than 90 dB below the peak band (maximal masking possible considering
410 both the ATH and the loudness-dependent slope of the spreading function)
411 3) above the PCM quantization noise floor
412 */
413 if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
414 bandwidth = b;
415 }
416 if (tonal->count<=2)
417 bandwidth = 20;
418 frame_loudness = 20*(float)log10(frame_loudness);
419 tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
420 tonal->lowECount *= (1-alphaE);
421 if (frame_loudness < tonal->Etracker-30)
422 tonal->lowECount += alphaE;
423
424 for (i=0;i<8;i++)
425 {
426 float sum=0;
427 for (b=0;b<16;b++)
428 sum += dct_table[i*16+b]*logE[b];
429 BFCC[i] = sum;
430 }
431
432 frame_stationarity /= NB_TBANDS;
433 relativeE /= NB_TBANDS;
434 if (tonal->count<10)
435 relativeE = .5;
436 frame_noisiness /= NB_TBANDS;
437 #if 1
438 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
439 #else
440 info->activity = .5*(1+frame_noisiness-frame_stationarity);
441 #endif
442 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
443 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
444 tonal->prev_tonality = frame_tonality;
445
446 slope /= 8*8;
447 info->tonality_slope = slope;
448
449 tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
450 tonal->count++;
451 info->tonality = frame_tonality;
452
453 for (i=0;i<4;i++)
454 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
455
456 for (i=0;i<4;i++)
457 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
458
459 for (i=0;i<4;i++)
460 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
461 for (i=0;i<3;i++)
462 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
463
464 if (tonal->count > 5)
465 {
466 for (i=0;i<9;i++)
467 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
468 }
469
470 for (i=0;i<8;i++)
471 {
472 tonal->mem[i+24] = tonal->mem[i+16];
473 tonal->mem[i+16] = tonal->mem[i+8];
474 tonal->mem[i+8] = tonal->mem[i];
475 tonal->mem[i] = BFCC[i];
476 }
477 for (i=0;i<9;i++)
478 features[11+i] = (float)sqrt(tonal->std[i]);
479 features[20] = info->tonality;
480 features[21] = info->activity;
481 features[22] = frame_stationarity;
482 features[23] = info->tonality_slope;
483 features[24] = tonal->lowECount;
484
485 #ifndef DISABLE_FLOAT_API
486 mlp_process(&net, features, frame_probs);
487 frame_probs[0] = .5f*(frame_probs[0]+1);
488 /* Curve fitting between the MLP probability and the actual probability */
489 frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
490 /* Probability of active audio (as opposed to silence) */
491 frame_probs[1] = .5f*frame_probs[1]+.5f;
492 /* Consider that silence has a 50-50 probability. */
493 frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
494
495 /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
496 {
497 /* Probability of state transition */
498 float tau;
499 /* Represents independence of the MLP probabilities, where
500 beta=1 means fully independent. */
501 float beta;
502 /* Denormalized probability of speech (p0) and music (p1) after update */
503 float p0, p1;
504 /* Probabilities for "all speech" and "all music" */
505 float s0, m0;
506 /* Probability sum for renormalisation */
507 float psum;
508 /* Instantaneous probability of speech and music, with beta pre-applied. */
509 float speech0;
510 float music0;
511
512 /* One transition every 3 minutes of active audio */
513 tau = .00005f*frame_probs[1];
514 beta = .05f;
515 if (1) {
516 /* Adapt beta based on how "unexpected" the new prob is */
517 float p, q;
518 p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
519 q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
520 beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
521 }
522 /* p0 and p1 are the probabilities of speech and music at this frame
523 using only information from previous frame and applying the
524 state transition model */
525 p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau;
526 p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
527 /* We apply the current probability with exponent beta to work around
528 the fact that the probability estimates aren't independent. */
529 p0 *= (float)pow(1-frame_probs[0], beta);
530 p1 *= (float)pow(frame_probs[0], beta);
531 /* Normalise the probabilities to get the Marokv probability of music. */
532 tonal->music_prob = p1/(p0+p1);
533 info->music_prob = tonal->music_prob;
534
535 /* This chunk of code deals with delayed decision. */
536 psum=1e-20f;
537 /* Instantaneous probability of speech and music, with beta pre-applied. */
538 speech0 = (float)pow(1-frame_probs[0], beta);
539 music0 = (float)pow(frame_probs[0], beta);
540 if (tonal->count==1)
541 {
542 tonal->pspeech[0]=.5;
543 tonal->pmusic [0]=.5;
544 }
545 /* Updated probability of having only speech (s0) or only music (m0),
546 before considering the new observation. */
547 s0 = tonal->pspeech[0] + tonal->pspeech[1];
548 m0 = tonal->pmusic [0] + tonal->pmusic [1];
549 /* Updates s0 and m0 with instantaneous probability. */
550 tonal->pspeech[0] = s0*(1-tau)*speech0;
551 tonal->pmusic [0] = m0*(1-tau)*music0;
552 /* Propagate the transition probabilities */
553 for (i=1;i<DETECT_SIZE-1;i++)
554 {
555 tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
556 tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
557 }
558 /* Probability that the latest frame is speech, when all the previous ones were music. */
559 tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
560 /* Probability that the latest frame is music, when all the previous ones were speech. */
561 tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
562
563 /* Renormalise probabilities to 1 */
564 for (i=0;i<DETECT_SIZE;i++)
565 psum += tonal->pspeech[i] + tonal->pmusic[i];
566 psum = 1.f/psum;
567 for (i=0;i<DETECT_SIZE;i++)
568 {
569 tonal->pspeech[i] *= psum;
570 tonal->pmusic [i] *= psum;
571 }
572 psum = tonal->pmusic[0];
573 for (i=1;i<DETECT_SIZE;i++)
574 psum += tonal->pspeech[i];
575
576 /* Estimate our confidence in the speech/music decisions */
577 if (frame_probs[1]>.75)
578 {
579 if (tonal->music_prob>.9)
580 {
581 float adapt;
582 adapt = 1.f/(++tonal->music_confidence_count);
583 tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
584 tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
585 }
586 if (tonal->music_prob<.1)
587 {
588 float adapt;
589 adapt = 1.f/(++tonal->speech_confidence_count);
590 tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
591 tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
592 }
593 } else {
594 if (tonal->music_confidence_count==0)
595 tonal->music_confidence = .9f;
596 if (tonal->speech_confidence_count==0)
597 tonal->speech_confidence = .1f;
598 }
599 }
600 if (tonal->last_music != (tonal->music_prob>.5f))
601 tonal->last_transition=0;
602 tonal->last_music = tonal->music_prob>.5f;
603 #else
604 info->music_prob = 0;
605 #endif
606 /*for (i=0;i<25;i++)
607 printf("%f ", features[i]);
608 printf("\n");*/
609
610 info->bandwidth = bandwidth;
611 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
612 info->noisiness = frame_noisiness;
613 info->valid = 1;
614 if (info_out!=NULL)
615 OPUS_COPY(info_out, info, 1);
616 RESTORE_STACK;
617 }
618
run_analysis(TonalityAnalysisState * analysis,const CELTMode * celt_mode,const void * analysis_pcm,int analysis_frame_size,int frame_size,int c1,int c2,int C,opus_int32 Fs,int lsb_depth,downmix_func downmix,AnalysisInfo * analysis_info)619 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
620 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
621 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
622 {
623 int offset;
624 int pcm_len;
625
626 if (analysis_pcm != NULL)
627 {
628 /* Avoid overflow/wrap-around of the analysis buffer */
629 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);
630
631 pcm_len = analysis_frame_size - analysis->analysis_offset;
632 offset = analysis->analysis_offset;
633 do {
634 tonality_analysis(analysis, NULL, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
635 offset += 480;
636 pcm_len -= 480;
637 } while (pcm_len>0);
638 analysis->analysis_offset = analysis_frame_size;
639
640 analysis->analysis_offset -= frame_size;
641 }
642
643 analysis_info->valid = 0;
644 tonality_get_info(analysis, analysis_info, frame_size);
645 }
646