1 /* Copyright (c) 2011 Xiph.Org Foundation
2 Written by Jean-Marc Valin */
3 /*
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
19 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #define ANALYSIS_C
33
34 #include <stdio.h>
35
36 #include "mathops.h"
37 #include "kiss_fft.h"
38 #include "celt.h"
39 #include "modes.h"
40 #include "arch.h"
41 #include "quant_bands.h"
42 #include "analysis.h"
43 #include "mlp.h"
44 #include "stack_alloc.h"
45 #include "float_cast.h"
46
47 #ifndef M_PI
48 #define M_PI 3.141592653
49 #endif
50
51 #ifndef DISABLE_FLOAT_API
52
53 static const float dct_table[128] = {
54 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
55 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
56 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
57 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
58 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
59 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
60 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
61 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
62 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
63 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
64 0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
65 -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
66 0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
67 -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
68 0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
69 0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
70 };
71
72 static const float analysis_window[240] = {
73 0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
74 0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
75 0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
76 0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
77 0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
78 0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
79 0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
80 0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
81 0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
82 0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
83 0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
84 0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
85 0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
86 0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
87 0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
88 0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
89 0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
90 0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
91 0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
92 0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
93 0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
94 0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
95 0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
96 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
97 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
98 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
99 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
100 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
101 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
102 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
103 };
104
105 static const int tbands[NB_TBANDS+1] = {
106 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240
107 };
108
109 #define NB_TONAL_SKIP_BANDS 9
110
silk_resampler_down2_hp(opus_val32 * S,opus_val32 * out,const opus_val32 * in,int inLen)111 static opus_val32 silk_resampler_down2_hp(
112 opus_val32 *S, /* I/O State vector [ 2 ] */
113 opus_val32 *out, /* O Output signal [ floor(len/2) ] */
114 const opus_val32 *in, /* I Input signal [ len ] */
115 int inLen /* I Number of input samples */
116 )
117 {
118 int k, len2 = inLen/2;
119 opus_val32 in32, out32, out32_hp, Y, X;
120 opus_val64 hp_ener = 0;
121 /* Internal variables and state are in Q10 format */
122 for( k = 0; k < len2; k++ ) {
123 /* Convert to Q10 */
124 in32 = in[ 2 * k ];
125
126 /* All-pass section for even input sample */
127 Y = SUB32( in32, S[ 0 ] );
128 X = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y);
129 out32 = ADD32( S[ 0 ], X );
130 S[ 0 ] = ADD32( in32, X );
131 out32_hp = out32;
132 /* Convert to Q10 */
133 in32 = in[ 2 * k + 1 ];
134
135 /* All-pass section for odd input sample, and add to output of previous section */
136 Y = SUB32( in32, S[ 1 ] );
137 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
138 out32 = ADD32( out32, S[ 1 ] );
139 out32 = ADD32( out32, X );
140 S[ 1 ] = ADD32( in32, X );
141
142 Y = SUB32( -in32, S[ 2 ] );
143 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
144 out32_hp = ADD32( out32_hp, S[ 2 ] );
145 out32_hp = ADD32( out32_hp, X );
146 S[ 2 ] = ADD32( -in32, X );
147
148 hp_ener += out32_hp*(opus_val64)out32_hp;
149 /* Add, convert back to int16 and store to output */
150 out[ k ] = HALF32(out32);
151 }
152 #ifdef FIXED_POINT
153 /* len2 can be up to 480, so we shift by 8 more to make it fit. */
154 hp_ener = hp_ener >> (2*SIG_SHIFT + 8);
155 #endif
156 return (opus_val32)hp_ener;
157 }
158
downmix_and_resample(downmix_func downmix,const void * _x,opus_val32 * y,opus_val32 S[3],int subframe,int offset,int c1,int c2,int C,int Fs)159 static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs)
160 {
161 VARDECL(opus_val32, tmp);
162 opus_val32 scale;
163 int j;
164 opus_val32 ret = 0;
165 SAVE_STACK;
166
167 if (subframe==0) return 0;
168 if (Fs == 48000)
169 {
170 subframe *= 2;
171 offset *= 2;
172 } else if (Fs == 16000) {
173 subframe = subframe*2/3;
174 offset = offset*2/3;
175 }
176 ALLOC(tmp, subframe, opus_val32);
177
178 downmix(_x, tmp, subframe, offset, c1, c2, C);
179 #ifdef FIXED_POINT
180 scale = (1<<SIG_SHIFT);
181 #else
182 scale = 1.f/32768;
183 #endif
184 if (c2==-2)
185 scale /= C;
186 else if (c2>-1)
187 scale /= 2;
188 for (j=0;j<subframe;j++)
189 tmp[j] *= scale;
190 if (Fs == 48000)
191 {
192 ret = silk_resampler_down2_hp(S, y, tmp, subframe);
193 } else if (Fs == 24000) {
194 OPUS_COPY(y, tmp, subframe);
195 } else if (Fs == 16000) {
196 VARDECL(opus_val32, tmp3x);
197 ALLOC(tmp3x, 3*subframe, opus_val32);
198 /* Don't do this at home! This resampler is horrible and it's only (barely)
199 usable for the purpose of the analysis because we don't care about all
200 the aliasing between 8 kHz and 12 kHz. */
201 for (j=0;j<subframe;j++)
202 {
203 tmp3x[3*j] = tmp[j];
204 tmp3x[3*j+1] = tmp[j];
205 tmp3x[3*j+2] = tmp[j];
206 }
207 silk_resampler_down2_hp(S, y, tmp3x, 3*subframe);
208 }
209 RESTORE_STACK;
210 return ret;
211 }
212
tonality_analysis_init(TonalityAnalysisState * tonal,opus_int32 Fs)213 void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs)
214 {
215 /* Initialize reusable fields. */
216 tonal->arch = opus_select_arch();
217 tonal->Fs = Fs;
218 /* Clear remaining fields. */
219 tonality_analysis_reset(tonal);
220 }
221
tonality_analysis_reset(TonalityAnalysisState * tonal)222 void tonality_analysis_reset(TonalityAnalysisState *tonal)
223 {
224 /* Clear non-reusable fields. */
225 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
226 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
227 tonal->music_confidence = .9f;
228 tonal->speech_confidence = .1f;
229 }
230
tonality_get_info(TonalityAnalysisState * tonal,AnalysisInfo * info_out,int len)231 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
232 {
233 int pos;
234 int curr_lookahead;
235 float psum;
236 float tonality_max;
237 float tonality_avg;
238 int tonality_count;
239 int i;
240
241 pos = tonal->read_pos;
242 curr_lookahead = tonal->write_pos-tonal->read_pos;
243 if (curr_lookahead<0)
244 curr_lookahead += DETECT_SIZE;
245
246 /* On long frames, look at the second analysis window rather than the first. */
247 if (len > tonal->Fs/50 && pos != tonal->write_pos)
248 {
249 pos++;
250 if (pos==DETECT_SIZE)
251 pos=0;
252 }
253 if (pos == tonal->write_pos)
254 pos--;
255 if (pos<0)
256 pos = DETECT_SIZE-1;
257 OPUS_COPY(info_out, &tonal->info[pos], 1);
258 tonality_max = tonality_avg = info_out->tonality;
259 tonality_count = 1;
260 /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */
261 for (i=0;i<3;i++)
262 {
263 pos++;
264 if (pos==DETECT_SIZE)
265 pos = 0;
266 if (pos == tonal->write_pos)
267 break;
268 tonality_max = MAX32(tonality_max, tonal->info[pos].tonality);
269 tonality_avg += tonal->info[pos].tonality;
270 tonality_count++;
271 }
272 info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f);
273 tonal->read_subframe += len/(tonal->Fs/400);
274 while (tonal->read_subframe>=8)
275 {
276 tonal->read_subframe -= 8;
277 tonal->read_pos++;
278 }
279 if (tonal->read_pos>=DETECT_SIZE)
280 tonal->read_pos-=DETECT_SIZE;
281
282 /* The -1 is to compensate for the delay in the features themselves. */
283 curr_lookahead = IMAX(curr_lookahead-1, 0);
284
285 psum=0;
286 /* Summing the probability of transition patterns that involve music at
287 time (DETECT_SIZE-curr_lookahead-1) */
288 for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
289 psum += tonal->pmusic[i];
290 for (;i<DETECT_SIZE;i++)
291 psum += tonal->pspeech[i];
292 psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
293 /*printf("%f %f %f %f %f\n", psum, info_out->music_prob, info_out->vad_prob, info_out->activity_probability, info_out->tonality);*/
294
295 info_out->music_prob = psum;
296 }
297
298 static const float std_feature_bias[9] = {
299 5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f,
300 2.163313f, 1.260756f, 1.116868f, 1.918795f
301 };
302
303 #define LEAKAGE_OFFSET 2.5f
304 #define LEAKAGE_SLOPE 2.f
305
306 #ifdef FIXED_POINT
307 /* For fixed-point, the input is +/-2^15 shifted up by SIG_SHIFT, so we need to
308 compensate for that in the energy. */
309 #define SCALE_COMPENS (1.f/((opus_int32)1<<(15+SIG_SHIFT)))
310 #define SCALE_ENER(e) ((SCALE_COMPENS*SCALE_COMPENS)*(e))
311 #else
312 #define SCALE_ENER(e) (e)
313 #endif
314
tonality_analysis(TonalityAnalysisState * tonal,const CELTMode * celt_mode,const void * x,int len,int offset,int c1,int c2,int C,int lsb_depth,downmix_func downmix)315 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
316 {
317 int i, b;
318 const kiss_fft_state *kfft;
319 VARDECL(kiss_fft_cpx, in);
320 VARDECL(kiss_fft_cpx, out);
321 int N = 480, N2=240;
322 float * OPUS_RESTRICT A = tonal->angle;
323 float * OPUS_RESTRICT dA = tonal->d_angle;
324 float * OPUS_RESTRICT d2A = tonal->d2_angle;
325 VARDECL(float, tonality);
326 VARDECL(float, noisiness);
327 float band_tonality[NB_TBANDS];
328 float logE[NB_TBANDS];
329 float BFCC[8];
330 float features[25];
331 float frame_tonality;
332 float max_frame_tonality;
333 /*float tw_sum=0;*/
334 float frame_noisiness;
335 const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
336 float slope=0;
337 float frame_stationarity;
338 float relativeE;
339 float frame_probs[2];
340 float alpha, alphaE, alphaE2;
341 float frame_loudness;
342 float bandwidth_mask;
343 int bandwidth=0;
344 float maxE = 0;
345 float noise_floor;
346 int remaining;
347 AnalysisInfo *info;
348 float hp_ener;
349 float tonality2[240];
350 float midE[8];
351 float spec_variability=0;
352 float band_log2[NB_TBANDS+1];
353 float leakage_from[NB_TBANDS+1];
354 float leakage_to[NB_TBANDS+1];
355 SAVE_STACK;
356
357 alpha = 1.f/IMIN(10, 1+tonal->count);
358 alphaE = 1.f/IMIN(25, 1+tonal->count);
359 alphaE2 = 1.f/IMIN(500, 1+tonal->count);
360
361 if (tonal->Fs == 48000)
362 {
363 /* len and offset are now at 24 kHz. */
364 len/= 2;
365 offset /= 2;
366 } else if (tonal->Fs == 16000) {
367 len = 3*len/2;
368 offset = 3*offset/2;
369 }
370
371 if (tonal->count<4) {
372 if (tonal->application == OPUS_APPLICATION_VOIP)
373 tonal->music_prob = .1f;
374 else
375 tonal->music_prob = .625f;
376 }
377 kfft = celt_mode->mdct.kfft[0];
378 if (tonal->count==0)
379 tonal->mem_fill = 240;
380 tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x,
381 &tonal->inmem[tonal->mem_fill], tonal->downmix_state,
382 IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs);
383 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
384 {
385 tonal->mem_fill += len;
386 /* Don't have enough to update the analysis */
387 RESTORE_STACK;
388 return;
389 }
390 hp_ener = tonal->hp_ener_accum;
391 info = &tonal->info[tonal->write_pos++];
392 if (tonal->write_pos>=DETECT_SIZE)
393 tonal->write_pos-=DETECT_SIZE;
394
395 ALLOC(in, 480, kiss_fft_cpx);
396 ALLOC(out, 480, kiss_fft_cpx);
397 ALLOC(tonality, 240, float);
398 ALLOC(noisiness, 240, float);
399 for (i=0;i<N2;i++)
400 {
401 float w = analysis_window[i];
402 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
403 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
404 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
405 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
406 }
407 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
408 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
409 tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x,
410 &tonal->inmem[240], tonal->downmix_state, remaining,
411 offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);
412 tonal->mem_fill = 240 + remaining;
413 opus_fft(kfft, in, out, tonal->arch);
414 #ifndef FIXED_POINT
415 /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
416 if (celt_isnan(out[0].r))
417 {
418 info->valid = 0;
419 RESTORE_STACK;
420 return;
421 }
422 #endif
423
424 for (i=1;i<N2;i++)
425 {
426 float X1r, X2r, X1i, X2i;
427 float angle, d_angle, d2_angle;
428 float angle2, d_angle2, d2_angle2;
429 float mod1, mod2, avg_mod;
430 X1r = (float)out[i].r+out[N-i].r;
431 X1i = (float)out[i].i-out[N-i].i;
432 X2r = (float)out[i].i+out[N-i].i;
433 X2i = (float)out[N-i].r-out[i].r;
434
435 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
436 d_angle = angle - A[i];
437 d2_angle = d_angle - dA[i];
438
439 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
440 d_angle2 = angle2 - angle;
441 d2_angle2 = d_angle2 - d_angle;
442
443 mod1 = d2_angle - (float)float2int(d2_angle);
444 noisiness[i] = ABS16(mod1);
445 mod1 *= mod1;
446 mod1 *= mod1;
447
448 mod2 = d2_angle2 - (float)float2int(d2_angle2);
449 noisiness[i] += ABS16(mod2);
450 mod2 *= mod2;
451 mod2 *= mod2;
452
453 avg_mod = .25f*(d2A[i]+mod1+2*mod2);
454 /* This introduces an extra delay of 2 frames in the detection. */
455 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
456 /* No delay on this detection, but it's less reliable. */
457 tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f;
458
459 A[i] = angle2;
460 dA[i] = d_angle2;
461 d2A[i] = mod2;
462 }
463 for (i=2;i<N2-1;i++)
464 {
465 float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1]));
466 tonality[i] = .9f*MAX32(tonality[i], tt-.1f);
467 }
468 frame_tonality = 0;
469 max_frame_tonality = 0;
470 /*tw_sum = 0;*/
471 info->activity = 0;
472 frame_noisiness = 0;
473 frame_stationarity = 0;
474 if (!tonal->count)
475 {
476 for (b=0;b<NB_TBANDS;b++)
477 {
478 tonal->lowE[b] = 1e10;
479 tonal->highE[b] = -1e10;
480 }
481 }
482 relativeE = 0;
483 frame_loudness = 0;
484 /* The energy of the very first band is special because of DC. */
485 {
486 float E = 0;
487 float X1r, X2r;
488 X1r = 2*(float)out[0].r;
489 X2r = 2*(float)out[0].i;
490 E = X1r*X1r + X2r*X2r;
491 for (i=1;i<4;i++)
492 {
493 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
494 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
495 E += binE;
496 }
497 E = SCALE_ENER(E);
498 band_log2[0] = .5f*1.442695f*(float)log(E+1e-10f);
499 }
500 for (b=0;b<NB_TBANDS;b++)
501 {
502 float E=0, tE=0, nE=0;
503 float L1, L2;
504 float stationarity;
505 for (i=tbands[b];i<tbands[b+1];i++)
506 {
507 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
508 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
509 binE = SCALE_ENER(binE);
510 E += binE;
511 tE += binE*MAX32(0, tonality[i]);
512 nE += binE*2.f*(.5f-noisiness[i]);
513 }
514 #ifndef FIXED_POINT
515 /* Check for extreme band energies that could cause NaNs later. */
516 if (!(E<1e9f) || celt_isnan(E))
517 {
518 info->valid = 0;
519 RESTORE_STACK;
520 return;
521 }
522 #endif
523
524 tonal->E[tonal->E_count][b] = E;
525 frame_noisiness += nE/(1e-15f+E);
526
527 frame_loudness += (float)sqrt(E+1e-10f);
528 logE[b] = (float)log(E+1e-10f);
529 band_log2[b+1] = .5f*1.442695f*(float)log(E+1e-10f);
530 tonal->logE[tonal->E_count][b] = logE[b];
531 if (tonal->count==0)
532 tonal->highE[b] = tonal->lowE[b] = logE[b];
533 if (tonal->highE[b] > tonal->lowE[b] + 7.5)
534 {
535 if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b])
536 tonal->highE[b] -= .01f;
537 else
538 tonal->lowE[b] += .01f;
539 }
540 if (logE[b] > tonal->highE[b])
541 {
542 tonal->highE[b] = logE[b];
543 tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]);
544 } else if (logE[b] < tonal->lowE[b])
545 {
546 tonal->lowE[b] = logE[b];
547 tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]);
548 }
549 relativeE += (logE[b]-tonal->lowE[b])/(1e-15f + (tonal->highE[b]-tonal->lowE[b]));
550
551 L1=L2=0;
552 for (i=0;i<NB_FRAMES;i++)
553 {
554 L1 += (float)sqrt(tonal->E[i][b]);
555 L2 += tonal->E[i][b];
556 }
557
558 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
559 stationarity *= stationarity;
560 stationarity *= stationarity;
561 frame_stationarity += stationarity;
562 /*band_tonality[b] = tE/(1e-15+E)*/;
563 band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
564 #if 0
565 if (b>=NB_TONAL_SKIP_BANDS)
566 {
567 frame_tonality += tweight[b]*band_tonality[b];
568 tw_sum += tweight[b];
569 }
570 #else
571 frame_tonality += band_tonality[b];
572 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
573 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
574 #endif
575 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
576 slope += band_tonality[b]*(b-8);
577 /*printf("%f %f ", band_tonality[b], stationarity);*/
578 tonal->prev_band_tonality[b] = band_tonality[b];
579 }
580
581 leakage_from[0] = band_log2[0];
582 leakage_to[0] = band_log2[0] - LEAKAGE_OFFSET;
583 for (b=1;b<NB_TBANDS+1;b++)
584 {
585 float leak_slope = LEAKAGE_SLOPE*(tbands[b]-tbands[b-1])/4;
586 leakage_from[b] = MIN16(leakage_from[b-1]+leak_slope, band_log2[b]);
587 leakage_to[b] = MAX16(leakage_to[b-1]-leak_slope, band_log2[b]-LEAKAGE_OFFSET);
588 }
589 for (b=NB_TBANDS-2;b>=0;b--)
590 {
591 float leak_slope = LEAKAGE_SLOPE*(tbands[b+1]-tbands[b])/4;
592 leakage_from[b] = MIN16(leakage_from[b+1]+leak_slope, leakage_from[b]);
593 leakage_to[b] = MAX16(leakage_to[b+1]-leak_slope, leakage_to[b]);
594 }
595 celt_assert(NB_TBANDS+1 <= LEAK_BANDS);
596 for (b=0;b<NB_TBANDS+1;b++)
597 {
598 /* leak_boost[] is made up of two terms. The first, based on leakage_to[],
599 represents the boost needed to overcome the amount of analysis leakage
600 cause in a weaker band b by louder neighbouring bands.
601 The second, based on leakage_from[], applies to a loud band b for
602 which the quantization noise causes synthesis leakage to the weaker
603 neighbouring bands. */
604 float boost = MAX16(0, leakage_to[b] - band_log2[b]) +
605 MAX16(0, band_log2[b] - (leakage_from[b]+LEAKAGE_OFFSET));
606 info->leak_boost[b] = IMIN(255, (int)floor(.5 + 64.f*boost));
607 }
608 for (;b<LEAK_BANDS;b++) info->leak_boost[b] = 0;
609
610 for (i=0;i<NB_FRAMES;i++)
611 {
612 int j;
613 float mindist = 1e15f;
614 for (j=0;j<NB_FRAMES;j++)
615 {
616 int k;
617 float dist=0;
618 for (k=0;k<NB_TBANDS;k++)
619 {
620 float tmp;
621 tmp = tonal->logE[i][k] - tonal->logE[j][k];
622 dist += tmp*tmp;
623 }
624 if (j!=i)
625 mindist = MIN32(mindist, dist);
626 }
627 spec_variability += mindist;
628 }
629 spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS);
630 bandwidth_mask = 0;
631 bandwidth = 0;
632 maxE = 0;
633 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
634 noise_floor *= noise_floor;
635 for (b=0;b<NB_TBANDS;b++)
636 {
637 float E=0;
638 int band_start, band_end;
639 /* Keep a margin of 300 Hz for aliasing */
640 band_start = tbands[b];
641 band_end = tbands[b+1];
642 for (i=band_start;i<band_end;i++)
643 {
644 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
645 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
646 E += binE;
647 }
648 E = SCALE_ENER(E);
649 maxE = MAX32(maxE, E);
650 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
651 E = MAX32(E, tonal->meanE[b]);
652 /* Use a simple follower with 13 dB/Bark slope for spreading function */
653 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
654 /* Consider the band "active" only if all these conditions are met:
655 1) less than 10 dB below the simple follower
656 2) less than 90 dB below the peak band (maximal masking possible considering
657 both the ATH and the loudness-dependent slope of the spreading function)
658 3) above the PCM quantization noise floor
659 We use b+1 because the first CELT band isn't included in tbands[]
660 */
661 if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
662 bandwidth = b+1;
663 }
664 /* Special case for the last two bands, for which we don't have spectrum but only
665 the energy above 12 kHz. */
666 if (tonal->Fs == 48000) {
667 float ratio;
668 float E = hp_ener*(1.f/(240*240));
669 ratio = tonal->prev_bandwidth==20 ? 0.03f : 0.07f;
670 #ifdef FIXED_POINT
671 /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */
672 E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE);
673 #endif
674 maxE = MAX32(maxE, E);
675 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
676 E = MAX32(E, tonal->meanE[b]);
677 /* Use a simple follower with 13 dB/Bark slope for spreading function */
678 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
679 if (E>ratio*bandwidth_mask && E*1e9f > maxE && E > noise_floor*160)
680 bandwidth = 20;
681 /* This detector is unreliable, so if the bandwidth is close to SWB, assume it's FB. */
682 if (bandwidth >= 17)
683 bandwidth = 20;
684 }
685 if (tonal->count<=2)
686 bandwidth = 20;
687 frame_loudness = 20*(float)log10(frame_loudness);
688 tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness);
689 tonal->lowECount *= (1-alphaE);
690 if (frame_loudness < tonal->Etracker-30)
691 tonal->lowECount += alphaE;
692
693 for (i=0;i<8;i++)
694 {
695 float sum=0;
696 for (b=0;b<16;b++)
697 sum += dct_table[i*16+b]*logE[b];
698 BFCC[i] = sum;
699 }
700 for (i=0;i<8;i++)
701 {
702 float sum=0;
703 for (b=0;b<16;b++)
704 sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]);
705 midE[i] = sum;
706 }
707
708 frame_stationarity /= NB_TBANDS;
709 relativeE /= NB_TBANDS;
710 if (tonal->count<10)
711 relativeE = .5f;
712 frame_noisiness /= NB_TBANDS;
713 #if 1
714 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
715 #else
716 info->activity = .5*(1+frame_noisiness-frame_stationarity);
717 #endif
718 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
719 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
720 tonal->prev_tonality = frame_tonality;
721
722 slope /= 8*8;
723 info->tonality_slope = slope;
724
725 tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
726 tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX);
727 info->tonality = frame_tonality;
728
729 for (i=0;i<4;i++)
730 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
731
732 for (i=0;i<4;i++)
733 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
734
735 for (i=0;i<4;i++)
736 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
737 for (i=0;i<3;i++)
738 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
739
740 if (tonal->count > 5)
741 {
742 for (i=0;i<9;i++)
743 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
744 }
745 for (i=0;i<4;i++)
746 features[i] = BFCC[i]-midE[i];
747
748 for (i=0;i<8;i++)
749 {
750 tonal->mem[i+24] = tonal->mem[i+16];
751 tonal->mem[i+16] = tonal->mem[i+8];
752 tonal->mem[i+8] = tonal->mem[i];
753 tonal->mem[i] = BFCC[i];
754 }
755 for (i=0;i<9;i++)
756 features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
757 features[18] = spec_variability - 0.78f;
758 features[20] = info->tonality - 0.154723f;
759 features[21] = info->activity - 0.724643f;
760 features[22] = frame_stationarity - 0.743717f;
761 features[23] = info->tonality_slope + 0.069216f;
762 features[24] = tonal->lowECount - 0.067930f;
763
764 mlp_process(&net, features, frame_probs);
765 frame_probs[0] = .5f*(frame_probs[0]+1);
766 /* Curve fitting between the MLP probability and the actual probability */
767 /*frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);*/
768 /* Probability of active audio (as opposed to silence) */
769 frame_probs[1] = .5f*frame_probs[1]+.5f;
770 frame_probs[1] *= frame_probs[1];
771
772 /* Probability of speech or music vs noise */
773 info->activity_probability = frame_probs[1];
774
775 /*printf("%f %f\n", frame_probs[0], frame_probs[1]);*/
776 {
777 /* Probability of state transition */
778 float tau;
779 /* Represents independence of the MLP probabilities, where
780 beta=1 means fully independent. */
781 float beta;
782 /* Denormalized probability of speech (p0) and music (p1) after update */
783 float p0, p1;
784 /* Probabilities for "all speech" and "all music" */
785 float s0, m0;
786 /* Probability sum for renormalisation */
787 float psum;
788 /* Instantaneous probability of speech and music, with beta pre-applied. */
789 float speech0;
790 float music0;
791 float p, q;
792
793 /* More silence transitions for speech than for music. */
794 tau = .001f*tonal->music_prob + .01f*(1-tonal->music_prob);
795 p = MAX16(.05f,MIN16(.95f,frame_probs[1]));
796 q = MAX16(.05f,MIN16(.95f,tonal->vad_prob));
797 beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
798 /* p0 and p1 are the probabilities of speech and music at this frame
799 using only information from previous frame and applying the
800 state transition model */
801 p0 = (1-tonal->vad_prob)*(1-tau) + tonal->vad_prob *tau;
802 p1 = tonal->vad_prob *(1-tau) + (1-tonal->vad_prob)*tau;
803 /* We apply the current probability with exponent beta to work around
804 the fact that the probability estimates aren't independent. */
805 p0 *= (float)pow(1-frame_probs[1], beta);
806 p1 *= (float)pow(frame_probs[1], beta);
807 /* Normalise the probabilities to get the Marokv probability of music. */
808 tonal->vad_prob = p1/(p0+p1);
809 info->vad_prob = tonal->vad_prob;
810 /* Consider that silence has a 50-50 probability of being speech or music. */
811 frame_probs[0] = tonal->vad_prob*frame_probs[0] + (1-tonal->vad_prob)*.5f;
812
813 /* One transition every 3 minutes of active audio */
814 tau = .0001f;
815 /* Adapt beta based on how "unexpected" the new prob is */
816 p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
817 q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
818 beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
819 /* p0 and p1 are the probabilities of speech and music at this frame
820 using only information from previous frame and applying the
821 state transition model */
822 p0 = (1-tonal->music_prob)*(1-tau) + tonal->music_prob *tau;
823 p1 = tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
824 /* We apply the current probability with exponent beta to work around
825 the fact that the probability estimates aren't independent. */
826 p0 *= (float)pow(1-frame_probs[0], beta);
827 p1 *= (float)pow(frame_probs[0], beta);
828 /* Normalise the probabilities to get the Marokv probability of music. */
829 tonal->music_prob = p1/(p0+p1);
830 info->music_prob = tonal->music_prob;
831
832 /*printf("%f %f %f %f\n", frame_probs[0], frame_probs[1], tonal->music_prob, tonal->vad_prob);*/
833 /* This chunk of code deals with delayed decision. */
834 psum=1e-20f;
835 /* Instantaneous probability of speech and music, with beta pre-applied. */
836 speech0 = (float)pow(1-frame_probs[0], beta);
837 music0 = (float)pow(frame_probs[0], beta);
838 if (tonal->count==1)
839 {
840 if (tonal->application == OPUS_APPLICATION_VOIP)
841 tonal->pmusic[0] = .1f;
842 else
843 tonal->pmusic[0] = .625f;
844 tonal->pspeech[0] = 1-tonal->pmusic[0];
845 }
846 /* Updated probability of having only speech (s0) or only music (m0),
847 before considering the new observation. */
848 s0 = tonal->pspeech[0] + tonal->pspeech[1];
849 m0 = tonal->pmusic [0] + tonal->pmusic [1];
850 /* Updates s0 and m0 with instantaneous probability. */
851 tonal->pspeech[0] = s0*(1-tau)*speech0;
852 tonal->pmusic [0] = m0*(1-tau)*music0;
853 /* Propagate the transition probabilities */
854 for (i=1;i<DETECT_SIZE-1;i++)
855 {
856 tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
857 tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
858 }
859 /* Probability that the latest frame is speech, when all the previous ones were music. */
860 tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
861 /* Probability that the latest frame is music, when all the previous ones were speech. */
862 tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
863
864 /* Renormalise probabilities to 1 */
865 for (i=0;i<DETECT_SIZE;i++)
866 psum += tonal->pspeech[i] + tonal->pmusic[i];
867 psum = 1.f/psum;
868 for (i=0;i<DETECT_SIZE;i++)
869 {
870 tonal->pspeech[i] *= psum;
871 tonal->pmusic [i] *= psum;
872 }
873 psum = tonal->pmusic[0];
874 for (i=1;i<DETECT_SIZE;i++)
875 psum += tonal->pspeech[i];
876
877 /* Estimate our confidence in the speech/music decisions */
878 if (frame_probs[1]>.75)
879 {
880 if (tonal->music_prob>.9)
881 {
882 float adapt;
883 adapt = 1.f/(++tonal->music_confidence_count);
884 tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
885 tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
886 }
887 if (tonal->music_prob<.1)
888 {
889 float adapt;
890 adapt = 1.f/(++tonal->speech_confidence_count);
891 tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
892 tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
893 }
894 }
895 }
896 tonal->last_music = tonal->music_prob>.5f;
897 #ifdef MLP_TRAINING
898 for (i=0;i<25;i++)
899 printf("%f ", features[i]);
900 printf("\n");
901 #endif
902
903 info->bandwidth = bandwidth;
904 tonal->prev_bandwidth = bandwidth;
905 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
906 info->noisiness = frame_noisiness;
907 info->valid = 1;
908 RESTORE_STACK;
909 }
910
run_analysis(TonalityAnalysisState * analysis,const CELTMode * celt_mode,const void * analysis_pcm,int analysis_frame_size,int frame_size,int c1,int c2,int C,opus_int32 Fs,int lsb_depth,downmix_func downmix,AnalysisInfo * analysis_info)911 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
912 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
913 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
914 {
915 int offset;
916 int pcm_len;
917
918 analysis_frame_size -= analysis_frame_size&1;
919 if (analysis_pcm != NULL)
920 {
921 /* Avoid overflow/wrap-around of the analysis buffer */
922 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size);
923
924 pcm_len = analysis_frame_size - analysis->analysis_offset;
925 offset = analysis->analysis_offset;
926 while (pcm_len>0) {
927 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
928 offset += Fs/50;
929 pcm_len -= Fs/50;
930 }
931 analysis->analysis_offset = analysis_frame_size;
932
933 analysis->analysis_offset -= frame_size;
934 }
935
936 analysis_info->valid = 0;
937 tonality_get_info(analysis, analysis_info, frame_size);
938 }
939
940 #endif /* DISABLE_FLOAT_API */
941