• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright (c) 2011 Xiph.Org Foundation
2    Written by Jean-Marc Valin */
3 /*
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7 
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10 
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14 
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27 
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31 
32 #include "kiss_fft.h"
33 #include "celt.h"
34 #include "modes.h"
35 #include "arch.h"
36 #include "quant_bands.h"
37 #include <stdio.h>
38 #include "analysis.h"
39 #include "mlp.h"
40 #include "stack_alloc.h"
41 
42 extern const MLP net;
43 
44 #ifndef M_PI
45 #define M_PI 3.141592653
46 #endif
47 
48 static const float dct_table[128] = {
49         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
50         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
51         0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
52        -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
53         0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
54        -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
55         0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
56         0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
57         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
58         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
59         0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
60        -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
61         0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
62        -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
63         0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
64         0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
65 };
66 
67 static const float analysis_window[240] = {
68       0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
69       0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
70       0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
71       0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
72       0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
73       0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
74       0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
75       0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
76       0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
77       0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
78       0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
79       0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
80       0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
81       0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
82       0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
83       0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
84       0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
85       0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
86       0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
87       0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
88       0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
89       0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
90       0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
91       0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
92       0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
93       0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
94       0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
95       0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
96       0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
97       0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
98 };
99 
100 static const int tbands[NB_TBANDS+1] = {
101        2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
102 };
103 
104 static const int extra_bands[NB_TOT_BANDS+1] = {
105       1, 2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
106 };
107 
108 /*static const float tweight[NB_TBANDS+1] = {
109       .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
110 };*/
111 
112 #define NB_TONAL_SKIP_BANDS 9
113 
114 #define cA 0.43157974f
115 #define cB 0.67848403f
116 #define cC 0.08595542f
117 #define cE ((float)M_PI/2)
fast_atan2f(float y,float x)118 static inline float fast_atan2f(float y, float x) {
119    float x2, y2;
120    /* Should avoid underflow on the values we'll get */
121    if (ABS16(x)+ABS16(y)<1e-9f)
122    {
123       x*=1e12f;
124       y*=1e12f;
125    }
126    x2 = x*x;
127    y2 = y*y;
128    if(x2<y2){
129       float den = (y2 + cB*x2) * (y2 + cC*x2);
130       if (den!=0)
131          return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
132       else
133          return (y<0 ? -cE : cE);
134    }else{
135       float den = (x2 + cB*y2) * (x2 + cC*y2);
136       if (den!=0)
137          return  x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
138       else
139          return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
140    }
141 }
142 
tonality_get_info(TonalityAnalysisState * tonal,AnalysisInfo * info_out,int len)143 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
144 {
145    int pos;
146    int curr_lookahead;
147    float psum;
148    int i;
149 
150    pos = tonal->read_pos;
151    curr_lookahead = tonal->write_pos-tonal->read_pos;
152    if (curr_lookahead<0)
153       curr_lookahead += DETECT_SIZE;
154 
155    if (len > 480 && pos != tonal->write_pos)
156    {
157       pos++;
158       if (pos==DETECT_SIZE)
159          pos=0;
160    }
161    if (pos == tonal->write_pos)
162       pos--;
163    if (pos<0)
164       pos = DETECT_SIZE-1;
165    OPUS_COPY(info_out, &tonal->info[pos], 1);
166    tonal->read_subframe += len/120;
167    while (tonal->read_subframe>=4)
168    {
169       tonal->read_subframe -= 4;
170       tonal->read_pos++;
171    }
172    if (tonal->read_pos>=DETECT_SIZE)
173       tonal->read_pos-=DETECT_SIZE;
174 
175    /* Compensate for the delay in the features themselves.
176       FIXME: Need a better estimate the 10 I just made up */
177    curr_lookahead = IMAX(curr_lookahead-10, 0);
178 
179    psum=0;
180    /* Summing the probability of transition patterns that involve music at
181       time (DETECT_SIZE-curr_lookahead-1) */
182    for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
183       psum += tonal->pmusic[i];
184    for (;i<DETECT_SIZE;i++)
185       psum += tonal->pspeech[i];
186    psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
187    /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
188 
189    info_out->music_prob = psum;
190 }
191 
tonality_analysis(TonalityAnalysisState * tonal,AnalysisInfo * info_out,const CELTMode * celt_mode,const void * x,int len,int offset,int c1,int c2,int C,int lsb_depth,downmix_func downmix)192 void tonality_analysis(TonalityAnalysisState *tonal, AnalysisInfo *info_out, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
193 {
194     int i, b;
195     const kiss_fft_state *kfft;
196     VARDECL(kiss_fft_cpx, in);
197     VARDECL(kiss_fft_cpx, out);
198     int N = 480, N2=240;
199     float * OPUS_RESTRICT A = tonal->angle;
200     float * OPUS_RESTRICT dA = tonal->d_angle;
201     float * OPUS_RESTRICT d2A = tonal->d2_angle;
202     VARDECL(float, tonality);
203     VARDECL(float, noisiness);
204     float band_tonality[NB_TBANDS];
205     float logE[NB_TBANDS];
206     float BFCC[8];
207     float features[25];
208     float frame_tonality;
209     float max_frame_tonality;
210     /*float tw_sum=0;*/
211     float frame_noisiness;
212     const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
213     float slope=0;
214     float frame_stationarity;
215     float relativeE;
216     float frame_probs[2];
217     float alpha, alphaE, alphaE2;
218     float frame_loudness;
219     float bandwidth_mask;
220     int bandwidth=0;
221     float maxE = 0;
222     float noise_floor;
223     int remaining;
224     AnalysisInfo *info;
225     SAVE_STACK;
226 
227     tonal->last_transition++;
228     alpha = 1.f/IMIN(20, 1+tonal->count);
229     alphaE = 1.f/IMIN(50, 1+tonal->count);
230     alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
231 
232     if (tonal->count<4)
233        tonal->music_prob = .5;
234     kfft = celt_mode->mdct.kfft[0];
235     if (tonal->count==0)
236        tonal->mem_fill = 240;
237     downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
238     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
239     {
240        tonal->mem_fill += len;
241        /* Don't have enough to update the analysis */
242        RESTORE_STACK;
243        return;
244     }
245     info = &tonal->info[tonal->write_pos++];
246     if (tonal->write_pos>=DETECT_SIZE)
247        tonal->write_pos-=DETECT_SIZE;
248 
249     ALLOC(in, 480, kiss_fft_cpx);
250     ALLOC(out, 480, kiss_fft_cpx);
251     ALLOC(tonality, 240, float);
252     ALLOC(noisiness, 240, float);
253     for (i=0;i<N2;i++)
254     {
255        float w = analysis_window[i];
256        in[i].r = w*tonal->inmem[i];
257        in[i].i = w*tonal->inmem[N2+i];
258        in[N-i-1].r = w*tonal->inmem[N-i-1];
259        in[N-i-1].i = w*tonal->inmem[N+N2-i-1];
260     }
261     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
262     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
263     downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
264     tonal->mem_fill = 240 + remaining;
265     opus_fft(kfft, in, out);
266 
267     for (i=1;i<N2;i++)
268     {
269        float X1r, X2r, X1i, X2i;
270        float angle, d_angle, d2_angle;
271        float angle2, d_angle2, d2_angle2;
272        float mod1, mod2, avg_mod;
273        X1r = out[i].r+out[N-i].r;
274        X1i = out[i].i-out[N-i].i;
275        X2r = out[i].i+out[N-i].i;
276        X2i = out[N-i].r-out[i].r;
277 
278        angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
279        d_angle = angle - A[i];
280        d2_angle = d_angle - dA[i];
281 
282        angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
283        d_angle2 = angle2 - angle;
284        d2_angle2 = d_angle2 - d_angle;
285 
286        mod1 = d2_angle - (float)floor(.5+d2_angle);
287        noisiness[i] = ABS16(mod1);
288        mod1 *= mod1;
289        mod1 *= mod1;
290 
291        mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
292        noisiness[i] += ABS16(mod2);
293        mod2 *= mod2;
294        mod2 *= mod2;
295 
296        avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
297        tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
298 
299        A[i] = angle2;
300        dA[i] = d_angle2;
301        d2A[i] = mod2;
302     }
303 
304     frame_tonality = 0;
305     max_frame_tonality = 0;
306     /*tw_sum = 0;*/
307     info->activity = 0;
308     frame_noisiness = 0;
309     frame_stationarity = 0;
310     if (!tonal->count)
311     {
312        for (b=0;b<NB_TBANDS;b++)
313        {
314           tonal->lowE[b] = 1e10;
315           tonal->highE[b] = -1e10;
316        }
317     }
318     relativeE = 0;
319     frame_loudness = 0;
320     bandwidth_mask = 0;
321     for (b=0;b<NB_TBANDS;b++)
322     {
323        float E=0, tE=0, nE=0;
324        float L1, L2;
325        float stationarity;
326        for (i=tbands[b];i<tbands[b+1];i++)
327        {
328           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
329                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
330 #ifdef FIXED_POINT
331           /* FIXME: It's probably best to change the BFCC filter initial state instead */
332           binE *= 5.55e-17f;
333 #endif
334           E += binE;
335           tE += binE*tonality[i];
336           nE += binE*2.f*(.5f-noisiness[i]);
337        }
338        tonal->E[tonal->E_count][b] = E;
339        frame_noisiness += nE/(1e-15f+E);
340 
341        frame_loudness += sqrt(E+1e-10f);
342        logE[b] = (float)log(E+1e-10f);
343        tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
344        tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
345        if (tonal->highE[b] < tonal->lowE[b]+1.f)
346        {
347           tonal->highE[b]+=.5f;
348           tonal->lowE[b]-=.5f;
349        }
350        relativeE += (logE[b]-tonal->lowE[b])/(1e-15+tonal->highE[b]-tonal->lowE[b]);
351 
352        L1=L2=0;
353        for (i=0;i<NB_FRAMES;i++)
354        {
355           L1 += sqrt(tonal->E[i][b]);
356           L2 += tonal->E[i][b];
357        }
358 
359        stationarity = MIN16(0.99f,L1/sqrt(1e-15+NB_FRAMES*L2));
360        stationarity *= stationarity;
361        stationarity *= stationarity;
362        frame_stationarity += stationarity;
363        /*band_tonality[b] = tE/(1e-15+E)*/;
364        band_tonality[b] = MAX16(tE/(1e-15+E), stationarity*tonal->prev_band_tonality[b]);
365 #if 0
366        if (b>=NB_TONAL_SKIP_BANDS)
367        {
368           frame_tonality += tweight[b]*band_tonality[b];
369           tw_sum += tweight[b];
370        }
371 #else
372        frame_tonality += band_tonality[b];
373        if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
374           frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
375 #endif
376        max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
377        slope += band_tonality[b]*(b-8);
378        /*printf("%f %f ", band_tonality[b], stationarity);*/
379        tonal->prev_band_tonality[b] = band_tonality[b];
380     }
381 
382     bandwidth_mask = 0;
383     bandwidth = 0;
384     maxE = 0;
385     noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
386 #ifdef FIXED_POINT
387     noise_floor *= 1<<(15+SIG_SHIFT);
388 #endif
389     noise_floor *= noise_floor;
390     for (b=0;b<NB_TOT_BANDS;b++)
391     {
392        float E=0;
393        int band_start, band_end;
394        /* Keep a margin of 300 Hz for aliasing */
395        band_start = extra_bands[b];
396        band_end = extra_bands[b+1];
397        for (i=band_start;i<band_end;i++)
398        {
399           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
400                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
401           E += binE;
402        }
403        maxE = MAX32(maxE, E);
404        tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
405        E = MAX32(E, tonal->meanE[b]);
406        /* Use a simple follower with 13 dB/Bark slope for spreading function */
407        bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
408        /* Consider the band "active" only if all these conditions are met:
409           1) less than 10 dB below the simple follower
410           2) less than 90 dB below the peak band (maximal masking possible considering
411              both the ATH and the loudness-dependent slope of the spreading function)
412           3) above the PCM quantization noise floor
413        */
414        if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
415           bandwidth = b;
416     }
417     if (tonal->count<=2)
418        bandwidth = 20;
419     frame_loudness = 20*(float)log10(frame_loudness);
420     tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
421     tonal->lowECount *= (1-alphaE);
422     if (frame_loudness < tonal->Etracker-30)
423        tonal->lowECount += alphaE;
424 
425     for (i=0;i<8;i++)
426     {
427        float sum=0;
428        for (b=0;b<16;b++)
429           sum += dct_table[i*16+b]*logE[b];
430        BFCC[i] = sum;
431     }
432 
433     frame_stationarity /= NB_TBANDS;
434     relativeE /= NB_TBANDS;
435     if (tonal->count<10)
436        relativeE = .5;
437     frame_noisiness /= NB_TBANDS;
438 #if 1
439     info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
440 #else
441     info->activity = .5*(1+frame_noisiness-frame_stationarity);
442 #endif
443     frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
444     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
445     tonal->prev_tonality = frame_tonality;
446 
447     slope /= 8*8;
448     info->tonality_slope = slope;
449 
450     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
451     tonal->count++;
452     info->tonality = frame_tonality;
453 
454     for (i=0;i<4;i++)
455        features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
456 
457     for (i=0;i<4;i++)
458        tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
459 
460     for (i=0;i<4;i++)
461         features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
462     for (i=0;i<3;i++)
463         features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
464 
465     if (tonal->count > 5)
466     {
467        for (i=0;i<9;i++)
468           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
469     }
470 
471     for (i=0;i<8;i++)
472     {
473        tonal->mem[i+24] = tonal->mem[i+16];
474        tonal->mem[i+16] = tonal->mem[i+8];
475        tonal->mem[i+8] = tonal->mem[i];
476        tonal->mem[i] = BFCC[i];
477     }
478     for (i=0;i<9;i++)
479        features[11+i] = sqrt(tonal->std[i]);
480     features[20] = info->tonality;
481     features[21] = info->activity;
482     features[22] = frame_stationarity;
483     features[23] = info->tonality_slope;
484     features[24] = tonal->lowECount;
485 
486 #ifndef DISABLE_FLOAT_API
487     mlp_process(&net, features, frame_probs);
488     frame_probs[0] = .5f*(frame_probs[0]+1);
489     /* Curve fitting between the MLP probability and the actual probability */
490     frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
491     /* Probability of active audio (as opposed to silence) */
492     frame_probs[1] = .5f*frame_probs[1]+.5f;
493     /* Consider that silence has a 50-50 probability. */
494     frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
495 
496     /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
497     {
498        /* Probability of state transition */
499        float tau;
500        /* Represents independence of the MLP probabilities, where
501           beta=1 means fully independent. */
502        float beta;
503        /* Denormalized probability of speech (p0) and music (p1) after update */
504        float p0, p1;
505        /* Probabilities for "all speech" and "all music" */
506        float s0, m0;
507        /* Probability sum for renormalisation */
508        float psum;
509        /* Instantaneous probability of speech and music, with beta pre-applied. */
510        float speech0;
511        float music0;
512 
513        /* One transition every 3 minutes of active audio */
514        tau = .00005f*frame_probs[1];
515        beta = .05f;
516        if (1) {
517           /* Adapt beta based on how "unexpected" the new prob is */
518           float p, q;
519           p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
520           q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
521           beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
522        }
523        /* p0 and p1 are the probabilities of speech and music at this frame
524           using only information from previous frame and applying the
525           state transition model */
526        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
527        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
528        /* We apply the current probability with exponent beta to work around
529           the fact that the probability estimates aren't independent. */
530        p0 *= (float)pow(1-frame_probs[0], beta);
531        p1 *= (float)pow(frame_probs[0], beta);
532        /* Normalise the probabilities to get the Marokv probability of music. */
533        tonal->music_prob = p1/(p0+p1);
534        info->music_prob = tonal->music_prob;
535 
536        /* This chunk of code deals with delayed decision. */
537        psum=1e-20f;
538        /* Instantaneous probability of speech and music, with beta pre-applied. */
539        speech0 = (float)pow(1-frame_probs[0], beta);
540        music0  = (float)pow(frame_probs[0], beta);
541        if (tonal->count==1)
542        {
543           tonal->pspeech[0]=.5;
544           tonal->pmusic [0]=.5;
545        }
546        /* Updated probability of having only speech (s0) or only music (m0),
547           before considering the new observation. */
548        s0 = tonal->pspeech[0] + tonal->pspeech[1];
549        m0 = tonal->pmusic [0] + tonal->pmusic [1];
550        /* Updates s0 and m0 with instantaneous probability. */
551        tonal->pspeech[0] = s0*(1-tau)*speech0;
552        tonal->pmusic [0] = m0*(1-tau)*music0;
553        /* Propagate the transition probabilities */
554        for (i=1;i<DETECT_SIZE-1;i++)
555        {
556           tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
557           tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
558        }
559        /* Probability that the latest frame is speech, when all the previous ones were music. */
560        tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
561        /* Probability that the latest frame is music, when all the previous ones were speech. */
562        tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
563 
564        /* Renormalise probabilities to 1 */
565        for (i=0;i<DETECT_SIZE;i++)
566           psum += tonal->pspeech[i] + tonal->pmusic[i];
567        psum = 1.f/psum;
568        for (i=0;i<DETECT_SIZE;i++)
569        {
570           tonal->pspeech[i] *= psum;
571           tonal->pmusic [i] *= psum;
572        }
573        psum = tonal->pmusic[0];
574        for (i=1;i<DETECT_SIZE;i++)
575           psum += tonal->pspeech[i];
576 
577        /* Estimate our confidence in the speech/music decisions */
578        if (frame_probs[1]>.75)
579        {
580           if (tonal->music_prob>.9)
581           {
582              float adapt;
583              adapt = 1.f/(++tonal->music_confidence_count);
584              tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
585              tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
586           }
587           if (tonal->music_prob<.1)
588           {
589              float adapt;
590              adapt = 1.f/(++tonal->speech_confidence_count);
591              tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
592              tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
593           }
594        } else {
595           if (tonal->music_confidence_count==0)
596              tonal->music_confidence = .9f;
597           if (tonal->speech_confidence_count==0)
598              tonal->speech_confidence = .1f;
599        }
600        psum = MAX16(tonal->speech_confidence, MIN16(tonal->music_confidence, psum));
601     }
602     if (tonal->last_music != (tonal->music_prob>.5f))
603        tonal->last_transition=0;
604     tonal->last_music = tonal->music_prob>.5f;
605 #else
606     info->music_prob = 0;
607 #endif
608     /*for (i=0;i<25;i++)
609        printf("%f ", features[i]);
610     printf("\n");*/
611 
612     info->bandwidth = bandwidth;
613     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
614     info->noisiness = frame_noisiness;
615     info->valid = 1;
616     if (info_out!=NULL)
617        OPUS_COPY(info_out, info, 1);
618     RESTORE_STACK;
619 }
620 
run_analysis(TonalityAnalysisState * analysis,const CELTMode * celt_mode,const void * analysis_pcm,int analysis_frame_size,int frame_size,int c1,int c2,int C,opus_int32 Fs,int lsb_depth,downmix_func downmix,AnalysisInfo * analysis_info)621 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
622                  int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
623                  int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
624 {
625    int offset;
626    int pcm_len;
627 
628    if (analysis_pcm != NULL)
629    {
630       /* Avoid overflow/wrap-around of the analysis buffer */
631       analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);
632 
633       pcm_len = analysis_frame_size - analysis->analysis_offset;
634       offset = analysis->analysis_offset;
635       do {
636          tonality_analysis(analysis, NULL, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
637          offset += 480;
638          pcm_len -= 480;
639       } while (pcm_len>0);
640       analysis->analysis_offset = analysis_frame_size;
641 
642       analysis->analysis_offset -= frame_size;
643    }
644 
645    analysis_info->valid = 0;
646    tonality_get_info(analysis, analysis_info, frame_size);
647 }
648