1 /* Copyright (c) 2011 Xiph.Org Foundation
2    Written by Jean-Marc Valin */
3 /*
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7 
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10 
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14 
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27 
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31 
32 #define ANALYSIS_C
33 
34 #ifdef MLP_TRAINING
35 #include <stdio.h>
36 #endif
37 
38 #include "mathops.h"
39 #include "kiss_fft.h"
40 #include "celt.h"
41 #include "modes.h"
42 #include "arch.h"
43 #include "quant_bands.h"
44 #include "analysis.h"
45 #include "mlp.h"
46 #include "stack_alloc.h"
47 #include "float_cast.h"
48 
49 #ifndef M_PI
50 #define M_PI 3.141592653
51 #endif
52 
53 #ifndef DISABLE_FLOAT_API
54 
55 #define TRANSITION_PENALTY 10
56 
57 static const float dct_table[128] = {
58         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
59         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
60         0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
61        -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
62         0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
63        -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
64         0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
65         0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
66         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
67         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
68         0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
69        -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
70         0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
71        -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
72         0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
73         0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
74 };
75 
76 static const float analysis_window[240] = {
77       0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
78       0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
79       0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
80       0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
81       0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
82       0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
83       0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
84       0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
85       0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
86       0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
87       0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
88       0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
89       0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
90       0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
91       0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
92       0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
93       0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
94       0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
95       0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
96       0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
97       0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
98       0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
99       0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
100       0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
101       0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
102       0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
103       0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
104       0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
105       0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
106       0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
107 };
108 
109 static const int tbands[NB_TBANDS+1] = {
110       4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240
111 };
112 
113 #define NB_TONAL_SKIP_BANDS 9
114 
silk_resampler_down2_hp(opus_val32 * S,opus_val32 * out,const opus_val32 * in,int inLen)115 static opus_val32 silk_resampler_down2_hp(
116     opus_val32                  *S,                 /* I/O  State vector [ 2 ]                                          */
117     opus_val32                  *out,               /* O    Output signal [ floor(len/2) ]                              */
118     const opus_val32            *in,                /* I    Input signal [ len ]                                        */
119     int                         inLen               /* I    Number of input samples                                     */
120 )
121 {
122     int k, len2 = inLen/2;
123     opus_val32 in32, out32, out32_hp, Y, X;
124     opus_val64 hp_ener = 0;
125     /* Internal variables and state are in Q10 format */
126     for( k = 0; k < len2; k++ ) {
127         /* Convert to Q10 */
128         in32 = in[ 2 * k ];
129 
130         /* All-pass section for even input sample */
131         Y      = SUB32( in32, S[ 0 ] );
132         X      = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y);
133         out32  = ADD32( S[ 0 ], X );
134         S[ 0 ] = ADD32( in32, X );
135         out32_hp = out32;
136         /* Convert to Q10 */
137         in32 = in[ 2 * k + 1 ];
138 
139         /* All-pass section for odd input sample, and add to output of previous section */
140         Y      = SUB32( in32, S[ 1 ] );
141         X      = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
142         out32  = ADD32( out32, S[ 1 ] );
143         out32  = ADD32( out32, X );
144         S[ 1 ] = ADD32( in32, X );
145 
146         Y      = SUB32( -in32, S[ 2 ] );
147         X      = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
148         out32_hp  = ADD32( out32_hp, S[ 2 ] );
149         out32_hp  = ADD32( out32_hp, X );
150         S[ 2 ] = ADD32( -in32, X );
151 
152         if(__builtin_add_overflow(hp_ener, out32_hp*(opus_val64)out32_hp, &hp_ener))
153         {
154            hp_ener = UINT64_MAX;
155         }
156         /* Add, convert back to int16 and store to output */
157         out[ k ] = HALF32(out32);
158     }
159 #ifdef FIXED_POINT
160     /* len2 can be up to 480, so we shift by 8 more to make it fit. */
161     hp_ener = hp_ener >> (2*SIG_SHIFT + 8);
162 #endif
163     return (opus_val32)hp_ener;
164 }
165 
downmix_and_resample(downmix_func downmix,const void * _x,opus_val32 * y,opus_val32 S[3],int subframe,int offset,int c1,int c2,int C,int Fs)166 static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs)
167 {
168    VARDECL(opus_val32, tmp);
169    opus_val32 scale;
170    int j;
171    opus_val32 ret = 0;
172    SAVE_STACK;
173 
174    if (subframe==0) return 0;
175    if (Fs == 48000)
176    {
177       subframe *= 2;
178       offset *= 2;
179    } else if (Fs == 16000) {
180       subframe = subframe*2/3;
181       offset = offset*2/3;
182    }
183    ALLOC(tmp, subframe, opus_val32);
184 
185    downmix(_x, tmp, subframe, offset, c1, c2, C);
186 #ifdef FIXED_POINT
187    scale = (1<<SIG_SHIFT);
188 #else
189    scale = 1.f/32768;
190 #endif
191    if (c2==-2)
192       scale /= C;
193    else if (c2>-1)
194       scale /= 2;
195    for (j=0;j<subframe;j++)
196       tmp[j] *= scale;
197    if (Fs == 48000)
198    {
199       ret = silk_resampler_down2_hp(S, y, tmp, subframe);
200    } else if (Fs == 24000) {
201       OPUS_COPY(y, tmp, subframe);
202    } else if (Fs == 16000) {
203       VARDECL(opus_val32, tmp3x);
204       ALLOC(tmp3x, 3*subframe, opus_val32);
205       /* Don't do this at home! This resampler is horrible and it's only (barely)
206          usable for the purpose of the analysis because we don't care about all
207          the aliasing between 8 kHz and 12 kHz. */
208       for (j=0;j<subframe;j++)
209       {
210          tmp3x[3*j] = tmp[j];
211          tmp3x[3*j+1] = tmp[j];
212          tmp3x[3*j+2] = tmp[j];
213       }
214       silk_resampler_down2_hp(S, y, tmp3x, 3*subframe);
215    }
216    RESTORE_STACK;
217    return ret;
218 }
219 
tonality_analysis_init(TonalityAnalysisState * tonal,opus_int32 Fs)220 void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs)
221 {
222   /* Initialize reusable fields. */
223   tonal->arch = opus_select_arch();
224   tonal->Fs = Fs;
225   /* Clear remaining fields. */
226   tonality_analysis_reset(tonal);
227 }
228 
tonality_analysis_reset(TonalityAnalysisState * tonal)229 void tonality_analysis_reset(TonalityAnalysisState *tonal)
230 {
231   /* Clear non-reusable fields. */
232   char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
233   OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
234 }
235 
tonality_get_info(TonalityAnalysisState * tonal,AnalysisInfo * info_out,int len)236 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
237 {
238    int pos;
239    int curr_lookahead;
240    float tonality_max;
241    float tonality_avg;
242    int tonality_count;
243    int i;
244    int pos0;
245    float prob_avg;
246    float prob_count;
247    float prob_min, prob_max;
248    float vad_prob;
249    int mpos, vpos;
250    int bandwidth_span;
251 
252    pos = tonal->read_pos;
253    curr_lookahead = tonal->write_pos-tonal->read_pos;
254    if (curr_lookahead<0)
255       curr_lookahead += DETECT_SIZE;
256 
257    tonal->read_subframe += len/(tonal->Fs/400);
258    while (tonal->read_subframe>=8)
259    {
260       tonal->read_subframe -= 8;
261       tonal->read_pos++;
262    }
263    if (tonal->read_pos>=DETECT_SIZE)
264       tonal->read_pos-=DETECT_SIZE;
265 
266    /* On long frames, look at the second analysis window rather than the first. */
267    if (len > tonal->Fs/50 && pos != tonal->write_pos)
268    {
269       pos++;
270       if (pos==DETECT_SIZE)
271          pos=0;
272    }
273    if (pos == tonal->write_pos)
274       pos--;
275    if (pos<0)
276       pos = DETECT_SIZE-1;
277    pos0 = pos;
278    OPUS_COPY(info_out, &tonal->info[pos], 1);
279    if (!info_out->valid)
280       return;
281    tonality_max = tonality_avg = info_out->tonality;
282    tonality_count = 1;
283    /* Look at the neighbouring frames and pick largest bandwidth found (to be safe). */
284    bandwidth_span = 6;
285    /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */
286    for (i=0;i<3;i++)
287    {
288       pos++;
289       if (pos==DETECT_SIZE)
290          pos = 0;
291       if (pos == tonal->write_pos)
292          break;
293       tonality_max = MAX32(tonality_max, tonal->info[pos].tonality);
294       tonality_avg += tonal->info[pos].tonality;
295       tonality_count++;
296       info_out->bandwidth = IMAX(info_out->bandwidth, tonal->info[pos].bandwidth);
297       bandwidth_span--;
298    }
299    pos = pos0;
300    /* Look back in time to see if any has a wider bandwidth than the current frame. */
301    for (i=0;i<bandwidth_span;i++)
302    {
303       pos--;
304       if (pos < 0)
305          pos = DETECT_SIZE-1;
306       if (pos == tonal->write_pos)
307          break;
308       info_out->bandwidth = IMAX(info_out->bandwidth, tonal->info[pos].bandwidth);
309    }
310    info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f);
311 
312    mpos = vpos = pos0;
313    /* If we have enough look-ahead, compensate for the ~5-frame delay in the music prob and
314       ~1 frame delay in the VAD prob. */
315    if (curr_lookahead > 15)
316    {
317       mpos += 5;
318       if (mpos>=DETECT_SIZE)
319          mpos -= DETECT_SIZE;
320       vpos += 1;
321       if (vpos>=DETECT_SIZE)
322          vpos -= DETECT_SIZE;
323    }
324 
325    /* The following calculations attempt to minimize a "badness function"
326       for the transition. When switching from speech to music, the badness
327       of switching at frame k is
328       b_k = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
329       where
330       v_i is the activity probability (VAD) at frame i,
331       p_i is the music probability at frame i
332       T is the probability threshold for switching
333       S is the penalty for switching during active audio rather than silence
334       the current frame has index i=0
335 
336       Rather than apply badness to directly decide when to switch, what we compute
337       instead is the threshold for which the optimal switching point is now. When
338       considering whether to switch now (frame 0) or at frame k, we have:
339       S*v_0 = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
340       which gives us:
341       T = ( \sum_{i=0}^{k-1} v_i*p_i + S*(v_k-v_0) ) / ( \sum_{i=0}^{k-1} v_i )
342       We take the min threshold across all positive values of k (up to the maximum
343       amount of lookahead we have) to give us the threshold for which the current
344       frame is the optimal switch point.
345 
346       The last step is that we need to consider whether we want to switch at all.
347       For that we use the average of the music probability over the entire window.
348       If the threshold is higher than that average we're not going to
349       switch, so we compute a min with the average as well. The result of all these
350       min operations is music_prob_min, which gives the threshold for switching to music
351       if we're currently encoding for speech.
352 
353       We do the exact opposite to compute music_prob_max which is used for switching
354       from music to speech.
355     */
356    prob_min = 1.f;
357    prob_max = 0.f;
358    vad_prob = tonal->info[vpos].activity_probability;
359    prob_count = MAX16(.1f, vad_prob);
360    prob_avg = MAX16(.1f, vad_prob)*tonal->info[mpos].music_prob;
361    while (1)
362    {
363       float pos_vad;
364       mpos++;
365       if (mpos==DETECT_SIZE)
366          mpos = 0;
367       if (mpos == tonal->write_pos)
368          break;
369       vpos++;
370       if (vpos==DETECT_SIZE)
371          vpos = 0;
372       if (vpos == tonal->write_pos)
373          break;
374       pos_vad = tonal->info[vpos].activity_probability;
375       prob_min = MIN16((prob_avg - TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_min);
376       prob_max = MAX16((prob_avg + TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_max);
377       prob_count += MAX16(.1f, pos_vad);
378       prob_avg += MAX16(.1f, pos_vad)*tonal->info[mpos].music_prob;
379    }
380    info_out->music_prob = prob_avg/prob_count;
381    prob_min = MIN16(prob_avg/prob_count, prob_min);
382    prob_max = MAX16(prob_avg/prob_count, prob_max);
383    prob_min = MAX16(prob_min, 0.f);
384    prob_max = MIN16(prob_max, 1.f);
385 
386    /* If we don't have enough look-ahead, do our best to make a decent decision. */
387    if (curr_lookahead < 10)
388    {
389       float pmin, pmax;
390       pmin = prob_min;
391       pmax = prob_max;
392       pos = pos0;
393       /* Look for min/max in the past. */
394       for (i=0;i<IMIN(tonal->count-1, 15);i++)
395       {
396          pos--;
397          if (pos < 0)
398             pos = DETECT_SIZE-1;
399          pmin = MIN16(pmin, tonal->info[pos].music_prob);
400          pmax = MAX16(pmax, tonal->info[pos].music_prob);
401       }
402       /* Bias against switching on active audio. */
403       pmin = MAX16(0.f, pmin - .1f*vad_prob);
404       pmax = MIN16(1.f, pmax + .1f*vad_prob);
405       prob_min += (1.f-.1f*curr_lookahead)*(pmin - prob_min);
406       prob_max += (1.f-.1f*curr_lookahead)*(pmax - prob_max);
407    }
408    info_out->music_prob_min = prob_min;
409    info_out->music_prob_max = prob_max;
410 
411    /* printf("%f %f %f %f %f\n", prob_min, prob_max, prob_avg/prob_count, vad_prob, info_out->music_prob); */
412 }
413 
414 static const float std_feature_bias[9] = {
415       5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f,
416       2.163313f, 1.260756f, 1.116868f, 1.918795f
417 };
418 
419 #define LEAKAGE_OFFSET 2.5f
420 #define LEAKAGE_SLOPE 2.f
421 
422 #ifdef FIXED_POINT
423 /* For fixed-point, the input is +/-2^15 shifted up by SIG_SHIFT, so we need to
424    compensate for that in the energy. */
425 #define SCALE_COMPENS (1.f/((opus_int32)1<<(15+SIG_SHIFT)))
426 #define SCALE_ENER(e) ((SCALE_COMPENS*SCALE_COMPENS)*(e))
427 #else
428 #define SCALE_ENER(e) (e)
429 #endif
430 
431 #ifdef FIXED_POINT
is_digital_silence32(const opus_val32 * pcm,int frame_size,int channels,int lsb_depth)432 static int is_digital_silence32(const opus_val32* pcm, int frame_size, int channels, int lsb_depth)
433 {
434    int silence = 0;
435    opus_val32 sample_max = 0;
436 #ifdef MLP_TRAINING
437    return 0;
438 #endif
439    sample_max = celt_maxabs32(pcm, frame_size*channels);
440 
441    silence = (sample_max == 0);
442    (void)lsb_depth;
443    return silence;
444 }
445 #else
446 #define is_digital_silence32(pcm, frame_size, channels, lsb_depth) is_digital_silence(pcm, frame_size, channels, lsb_depth)
447 #endif
448 
tonality_analysis(TonalityAnalysisState * tonal,const CELTMode * celt_mode,const void * x,int len,int offset,int c1,int c2,int C,int lsb_depth,downmix_func downmix)449 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
450 {
451     int i, b;
452     const kiss_fft_state *kfft;
453     VARDECL(kiss_fft_cpx, in);
454     VARDECL(kiss_fft_cpx, out);
455     int N = 480, N2=240;
456     float * OPUS_RESTRICT A = tonal->angle;
457     float * OPUS_RESTRICT dA = tonal->d_angle;
458     float * OPUS_RESTRICT d2A = tonal->d2_angle;
459     VARDECL(float, tonality);
460     VARDECL(float, noisiness);
461     float band_tonality[NB_TBANDS];
462     float logE[NB_TBANDS];
463     float BFCC[8];
464     float features[25];
465     float frame_tonality;
466     float max_frame_tonality;
467     /*float tw_sum=0;*/
468     float frame_noisiness;
469     const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
470     float slope=0;
471     float frame_stationarity;
472     float relativeE;
473     float frame_probs[2];
474     float alpha, alphaE, alphaE2;
475     float frame_loudness;
476     float bandwidth_mask;
477     int is_masked[NB_TBANDS+1];
478     int bandwidth=0;
479     float maxE = 0;
480     float noise_floor;
481     int remaining;
482     AnalysisInfo *info;
483     float hp_ener;
484     float tonality2[240];
485     float midE[8];
486     float spec_variability=0;
487     float band_log2[NB_TBANDS+1];
488     float leakage_from[NB_TBANDS+1];
489     float leakage_to[NB_TBANDS+1];
490     float layer_out[MAX_NEURONS];
491     float below_max_pitch;
492     float above_max_pitch;
493     int is_silence;
494     SAVE_STACK;
495 
496     if (!tonal->initialized)
497     {
498        tonal->mem_fill = 240;
499        tonal->initialized = 1;
500     }
501     alpha = 1.f/IMIN(10, 1+tonal->count);
502     alphaE = 1.f/IMIN(25, 1+tonal->count);
503     /* Noise floor related decay for bandwidth detection: -2.2 dB/second */
504     alphaE2 = 1.f/IMIN(100, 1+tonal->count);
505     if (tonal->count <= 1) alphaE2 = 1;
506 
507     if (tonal->Fs == 48000)
508     {
509        /* len and offset are now at 24 kHz. */
510        len/= 2;
511        offset /= 2;
512     } else if (tonal->Fs == 16000) {
513        len = 3*len/2;
514        offset = 3*offset/2;
515     }
516 
517     kfft = celt_mode->mdct.kfft[0];
518     tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x,
519           &tonal->inmem[tonal->mem_fill], tonal->downmix_state,
520           IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs);
521     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
522     {
523        tonal->mem_fill += len;
524        /* Don't have enough to update the analysis */
525        RESTORE_STACK;
526        return;
527     }
528     hp_ener = tonal->hp_ener_accum;
529     info = &tonal->info[tonal->write_pos++];
530     if (tonal->write_pos>=DETECT_SIZE)
531        tonal->write_pos-=DETECT_SIZE;
532 
533     is_silence = is_digital_silence32(tonal->inmem, ANALYSIS_BUF_SIZE, 1, lsb_depth);
534 
535     ALLOC(in, 480, kiss_fft_cpx);
536     ALLOC(out, 480, kiss_fft_cpx);
537     ALLOC(tonality, 240, float);
538     ALLOC(noisiness, 240, float);
539     for (i=0;i<N2;i++)
540     {
541        float w = analysis_window[i];
542        in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
543        in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
544        in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
545        in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
546     }
547     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
548     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
549     tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x,
550           &tonal->inmem[240], tonal->downmix_state, remaining,
551           offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);
552     tonal->mem_fill = 240 + remaining;
553     if (is_silence)
554     {
555        /* On silence, copy the previous analysis. */
556        int prev_pos = tonal->write_pos-2;
557        if (prev_pos < 0)
558           prev_pos += DETECT_SIZE;
559        OPUS_COPY(info, &tonal->info[prev_pos], 1);
560        RESTORE_STACK;
561        return;
562     }
563     opus_fft(kfft, in, out, tonal->arch);
564 #ifndef FIXED_POINT
565     /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
566     if (celt_isnan(out[0].r))
567     {
568        info->valid = 0;
569        RESTORE_STACK;
570        return;
571     }
572 #endif
573 
574     for (i=1;i<N2;i++)
575     {
576        float X1r, X2r, X1i, X2i;
577        float angle, d_angle, d2_angle;
578        float angle2, d_angle2, d2_angle2;
579        float mod1, mod2, avg_mod;
580        X1r = (float)out[i].r+out[N-i].r;
581        X1i = (float)out[i].i-out[N-i].i;
582        X2r = (float)out[i].i+out[N-i].i;
583        X2i = (float)out[N-i].r-out[i].r;
584 
585        angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
586        d_angle = angle - A[i];
587        d2_angle = d_angle - dA[i];
588 
589        angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
590        d_angle2 = angle2 - angle;
591        d2_angle2 = d_angle2 - d_angle;
592 
593        mod1 = d2_angle - (float)float2int(d2_angle);
594        noisiness[i] = ABS16(mod1);
595        mod1 *= mod1;
596        mod1 *= mod1;
597 
598        mod2 = d2_angle2 - (float)float2int(d2_angle2);
599        noisiness[i] += ABS16(mod2);
600        mod2 *= mod2;
601        mod2 *= mod2;
602 
603        avg_mod = .25f*(d2A[i]+mod1+2*mod2);
604        /* This introduces an extra delay of 2 frames in the detection. */
605        tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
606        /* No delay on this detection, but it's less reliable. */
607        tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f;
608 
609        A[i] = angle2;
610        dA[i] = d_angle2;
611        d2A[i] = mod2;
612     }
613     for (i=2;i<N2-1;i++)
614     {
615        float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1]));
616        tonality[i] = .9f*MAX32(tonality[i], tt-.1f);
617     }
618     frame_tonality = 0;
619     max_frame_tonality = 0;
620     /*tw_sum = 0;*/
621     info->activity = 0;
622     frame_noisiness = 0;
623     frame_stationarity = 0;
624     if (!tonal->count)
625     {
626        for (b=0;b<NB_TBANDS;b++)
627        {
628           tonal->lowE[b] = 1e10;
629           tonal->highE[b] = -1e10;
630        }
631     }
632     relativeE = 0;
633     frame_loudness = 0;
634     /* The energy of the very first band is special because of DC. */
635     {
636        float E = 0;
637        float X1r, X2r;
638        X1r = 2*(float)out[0].r;
639        X2r = 2*(float)out[0].i;
640        E = X1r*X1r + X2r*X2r;
641        for (i=1;i<4;i++)
642        {
643           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
644                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
645           E += binE;
646        }
647        E = SCALE_ENER(E);
648        band_log2[0] = .5f*1.442695f*(float)log(E+1e-10f);
649     }
650     for (b=0;b<NB_TBANDS;b++)
651     {
652        float E=0, tE=0, nE=0;
653        float L1, L2;
654        float stationarity;
655        for (i=tbands[b];i<tbands[b+1];i++)
656        {
657           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
658                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
659           binE = SCALE_ENER(binE);
660           E += binE;
661           tE += binE*MAX32(0, tonality[i]);
662           nE += binE*2.f*(.5f-noisiness[i]);
663        }
664 #ifndef FIXED_POINT
665        /* Check for extreme band energies that could cause NaNs later. */
666        if (!(E<1e9f) || celt_isnan(E))
667        {
668           info->valid = 0;
669           RESTORE_STACK;
670           return;
671        }
672 #endif
673 
674        tonal->E[tonal->E_count][b] = E;
675        frame_noisiness += nE/(1e-15f+E);
676 
677        frame_loudness += (float)sqrt(E+1e-10f);
678        logE[b] = (float)log(E+1e-10f);
679        band_log2[b+1] = .5f*1.442695f*(float)log(E+1e-10f);
680        tonal->logE[tonal->E_count][b] = logE[b];
681        if (tonal->count==0)
682           tonal->highE[b] = tonal->lowE[b] = logE[b];
683        if (tonal->highE[b] > tonal->lowE[b] + 7.5)
684        {
685           if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b])
686              tonal->highE[b] -= .01f;
687           else
688              tonal->lowE[b] += .01f;
689        }
690        if (logE[b] > tonal->highE[b])
691        {
692           tonal->highE[b] = logE[b];
693           tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]);
694        } else if (logE[b] < tonal->lowE[b])
695        {
696           tonal->lowE[b] = logE[b];
697           tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]);
698        }
699        relativeE += (logE[b]-tonal->lowE[b])/(1e-5f + (tonal->highE[b]-tonal->lowE[b]));
700 
701        L1=L2=0;
702        for (i=0;i<NB_FRAMES;i++)
703        {
704           L1 += (float)sqrt(tonal->E[i][b]);
705           L2 += tonal->E[i][b];
706        }
707 
708        stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
709        stationarity *= stationarity;
710        stationarity *= stationarity;
711        frame_stationarity += stationarity;
712        /*band_tonality[b] = tE/(1e-15+E)*/;
713        band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
714 #if 0
715        if (b>=NB_TONAL_SKIP_BANDS)
716        {
717           frame_tonality += tweight[b]*band_tonality[b];
718           tw_sum += tweight[b];
719        }
720 #else
721        frame_tonality += band_tonality[b];
722        if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
723           frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
724 #endif
725        max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
726        slope += band_tonality[b]*(b-8);
727        /*printf("%f %f ", band_tonality[b], stationarity);*/
728        tonal->prev_band_tonality[b] = band_tonality[b];
729     }
730 
731     leakage_from[0] = band_log2[0];
732     leakage_to[0] = band_log2[0] - LEAKAGE_OFFSET;
733     for (b=1;b<NB_TBANDS+1;b++)
734     {
735        float leak_slope = LEAKAGE_SLOPE*(tbands[b]-tbands[b-1])/4;
736        leakage_from[b] = MIN16(leakage_from[b-1]+leak_slope, band_log2[b]);
737        leakage_to[b] = MAX16(leakage_to[b-1]-leak_slope, band_log2[b]-LEAKAGE_OFFSET);
738     }
739     for (b=NB_TBANDS-2;b>=0;b--)
740     {
741        float leak_slope = LEAKAGE_SLOPE*(tbands[b+1]-tbands[b])/4;
742        leakage_from[b] = MIN16(leakage_from[b+1]+leak_slope, leakage_from[b]);
743        leakage_to[b] = MAX16(leakage_to[b+1]-leak_slope, leakage_to[b]);
744     }
745     celt_assert(NB_TBANDS+1 <= LEAK_BANDS);
746     for (b=0;b<NB_TBANDS+1;b++)
747     {
748        /* leak_boost[] is made up of two terms. The first, based on leakage_to[],
749           represents the boost needed to overcome the amount of analysis leakage
750           cause in a weaker band b by louder neighbouring bands.
751           The second, based on leakage_from[], applies to a loud band b for
752           which the quantization noise causes synthesis leakage to the weaker
753           neighbouring bands. */
754        float boost = MAX16(0, leakage_to[b] - band_log2[b]) +
755              MAX16(0, band_log2[b] - (leakage_from[b]+LEAKAGE_OFFSET));
756        info->leak_boost[b] = IMIN(255, (int)floor(.5 + 64.f*boost));
757     }
758     for (;b<LEAK_BANDS;b++) info->leak_boost[b] = 0;
759 
760     for (i=0;i<NB_FRAMES;i++)
761     {
762        int j;
763        float mindist = 1e15f;
764        for (j=0;j<NB_FRAMES;j++)
765        {
766           int k;
767           float dist=0;
768           for (k=0;k<NB_TBANDS;k++)
769           {
770              float tmp;
771              tmp = tonal->logE[i][k] - tonal->logE[j][k];
772              dist += tmp*tmp;
773           }
774           if (j!=i)
775              mindist = MIN32(mindist, dist);
776        }
777        spec_variability += mindist;
778     }
779     spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS);
780     bandwidth_mask = 0;
781     bandwidth = 0;
782     maxE = 0;
783     noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
784     noise_floor *= noise_floor;
785     below_max_pitch=0;
786     above_max_pitch=0;
787     for (b=0;b<NB_TBANDS;b++)
788     {
789        float E=0;
790        float Em;
791        int band_start, band_end;
792        /* Keep a margin of 300 Hz for aliasing */
793        band_start = tbands[b];
794        band_end = tbands[b+1];
795        for (i=band_start;i<band_end;i++)
796        {
797           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
798                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
799           E += binE;
800        }
801        E = SCALE_ENER(E);
802        maxE = MAX32(maxE, E);
803        if (band_start < 64)
804        {
805           below_max_pitch += E;
806        } else {
807           above_max_pitch += E;
808        }
809        tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
810        Em = MAX32(E, tonal->meanE[b]);
811        /* Consider the band "active" only if all these conditions are met:
812           1) less than 90 dB below the peak band (maximal masking possible considering
813              both the ATH and the loudness-dependent slope of the spreading function)
814           2) above the PCM quantization noise floor
815           We use b+1 because the first CELT band isn't included in tbands[]
816        */
817        if (E*1e9f > maxE && (Em > 3*noise_floor*(band_end-band_start) || E > noise_floor*(band_end-band_start)))
818           bandwidth = b+1;
819        /* Check if the band is masked (see below). */
820        is_masked[b] = E < (tonal->prev_bandwidth >= b+1  ? .01f : .05f)*bandwidth_mask;
821        /* Use a simple follower with 13 dB/Bark slope for spreading function. */
822        bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
823     }
824     /* Special case for the last two bands, for which we don't have spectrum but only
825        the energy above 12 kHz. The difficulty here is that the high-pass we use
826        leaks some LF energy, so we need to increase the threshold without accidentally cutting
827        off the band. */
828     if (tonal->Fs == 48000) {
829        float noise_ratio;
830        float Em;
831        float E = hp_ener*(1.f/(60*60));
832        noise_ratio = tonal->prev_bandwidth==20 ? 10.f : 30.f;
833 
834 #ifdef FIXED_POINT
835        /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */
836        E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE);
837 #endif
838        above_max_pitch += E;
839        tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
840        Em = MAX32(E, tonal->meanE[b]);
841        if (Em > 3*noise_ratio*noise_floor*160 || E > noise_ratio*noise_floor*160)
842           bandwidth = 20;
843        /* Check if the band is masked (see below). */
844        is_masked[b] = E < (tonal->prev_bandwidth == 20  ? .01f : .05f)*bandwidth_mask;
845     }
846     if (above_max_pitch > below_max_pitch)
847        info->max_pitch_ratio = below_max_pitch/above_max_pitch;
848     else
849        info->max_pitch_ratio = 1;
850     /* In some cases, resampling aliasing can create a small amount of energy in the first band
851        being cut. So if the last band is masked, we don't include it.  */
852     if (bandwidth == 20 && is_masked[NB_TBANDS])
853        bandwidth-=2;
854     else if (bandwidth > 0 && bandwidth <= NB_TBANDS && is_masked[bandwidth-1])
855        bandwidth--;
856     if (tonal->count<=2)
857        bandwidth = 20;
858     frame_loudness = 20*(float)log10(frame_loudness);
859     tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness);
860     tonal->lowECount *= (1-alphaE);
861     if (frame_loudness < tonal->Etracker-30)
862        tonal->lowECount += alphaE;
863 
864     for (i=0;i<8;i++)
865     {
866        float sum=0;
867        for (b=0;b<16;b++)
868           sum += dct_table[i*16+b]*logE[b];
869        BFCC[i] = sum;
870     }
871     for (i=0;i<8;i++)
872     {
873        float sum=0;
874        for (b=0;b<16;b++)
875           sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]);
876        midE[i] = sum;
877     }
878 
879     frame_stationarity /= NB_TBANDS;
880     relativeE /= NB_TBANDS;
881     if (tonal->count<10)
882        relativeE = .5f;
883     frame_noisiness /= NB_TBANDS;
884 #if 1
885     info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
886 #else
887     info->activity = .5*(1+frame_noisiness-frame_stationarity);
888 #endif
889     frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
890     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
891     tonal->prev_tonality = frame_tonality;
892 
893     slope /= 8*8;
894     info->tonality_slope = slope;
895 
896     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
897     tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX);
898     info->tonality = frame_tonality;
899 
900     for (i=0;i<4;i++)
901        features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
902 
903     for (i=0;i<4;i++)
904        tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
905 
906     for (i=0;i<4;i++)
907         features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
908     for (i=0;i<3;i++)
909         features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
910 
911     if (tonal->count > 5)
912     {
913        for (i=0;i<9;i++)
914           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
915     }
916     for (i=0;i<4;i++)
917        features[i] = BFCC[i]-midE[i];
918 
919     for (i=0;i<8;i++)
920     {
921        tonal->mem[i+24] = tonal->mem[i+16];
922        tonal->mem[i+16] = tonal->mem[i+8];
923        tonal->mem[i+8] = tonal->mem[i];
924        tonal->mem[i] = BFCC[i];
925     }
926     for (i=0;i<9;i++)
927        features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
928     features[18] = spec_variability - 0.78f;
929     features[20] = info->tonality - 0.154723f;
930     features[21] = info->activity - 0.724643f;
931     features[22] = frame_stationarity - 0.743717f;
932     features[23] = info->tonality_slope + 0.069216f;
933     features[24] = tonal->lowECount - 0.067930f;
934 
935     compute_dense(&layer0, layer_out, features);
936     compute_gru(&layer1, tonal->rnn_state, layer_out);
937     compute_dense(&layer2, frame_probs, tonal->rnn_state);
938 
939     /* Probability of speech or music vs noise */
940     info->activity_probability = frame_probs[1];
941     info->music_prob = frame_probs[0];
942 
943     /*printf("%f %f %f\n", frame_probs[0], frame_probs[1], info->music_prob);*/
944 #ifdef MLP_TRAINING
945     for (i=0;i<25;i++)
946        printf("%f ", features[i]);
947     printf("\n");
948 #endif
949 
950     info->bandwidth = bandwidth;
951     tonal->prev_bandwidth = bandwidth;
952     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
953     info->noisiness = frame_noisiness;
954     info->valid = 1;
955     RESTORE_STACK;
956 }
957 
run_analysis(TonalityAnalysisState * analysis,const CELTMode * celt_mode,const void * analysis_pcm,int analysis_frame_size,int frame_size,int c1,int c2,int C,opus_int32 Fs,int lsb_depth,downmix_func downmix,AnalysisInfo * analysis_info)958 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
959                  int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
960                  int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
961 {
962    int offset;
963    int pcm_len;
964 
965    analysis_frame_size -= analysis_frame_size&1;
966    if (analysis_pcm != NULL)
967    {
968       /* Avoid overflow/wrap-around of the analysis buffer */
969       analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size);
970 
971       pcm_len = analysis_frame_size - analysis->analysis_offset;
972       offset = analysis->analysis_offset;
973       while (pcm_len>0) {
974          tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
975          offset += Fs/50;
976          pcm_len -= Fs/50;
977       }
978       analysis->analysis_offset = analysis_frame_size;
979 
980       analysis->analysis_offset -= frame_size;
981    }
982 
983    tonality_get_info(analysis, analysis_info, frame_size);
984 }
985 
986 #endif /* DISABLE_FLOAT_API */
987