1 /* Copyright (c) 2011 Xiph.Org Foundation
2 Written by Jean-Marc Valin */
3 /*
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
19 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #define ANALYSIS_C
33
34 #include <stdio.h>
35
36 #include "mathops.h"
37 #include "kiss_fft.h"
38 #include "celt.h"
39 #include "modes.h"
40 #include "arch.h"
41 #include "quant_bands.h"
42 #include "analysis.h"
43 #include "mlp.h"
44 #include "stack_alloc.h"
45 #include "float_cast.h"
46
47 #ifndef M_PI
48 #define M_PI 3.141592653
49 #endif
50
51 #ifndef DISABLE_FLOAT_API
52
53 #define TRANSITION_PENALTY 10
54
55 static const float dct_table[128] = {
56 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
57 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
58 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
59 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
60 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
61 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
62 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
63 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
64 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
65 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
66 0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
67 -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
68 0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
69 -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
70 0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
71 0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
72 };
73
74 static const float analysis_window[240] = {
75 0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
76 0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
77 0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
78 0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
79 0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
80 0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
81 0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
82 0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
83 0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
84 0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
85 0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
86 0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
87 0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
88 0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
89 0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
90 0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
91 0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
92 0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
93 0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
94 0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
95 0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
96 0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
97 0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
98 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
99 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
100 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
101 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
102 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
103 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
104 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
105 };
106
107 static const int tbands[NB_TBANDS+1] = {
108 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240
109 };
110
111 #define NB_TONAL_SKIP_BANDS 9
112
silk_resampler_down2_hp(opus_val32 * S,opus_val32 * out,const opus_val32 * in,int inLen)113 static opus_val32 silk_resampler_down2_hp(
114 opus_val32 *S, /* I/O State vector [ 2 ] */
115 opus_val32 *out, /* O Output signal [ floor(len/2) ] */
116 const opus_val32 *in, /* I Input signal [ len ] */
117 int inLen /* I Number of input samples */
118 )
119 {
120 int k, len2 = inLen/2;
121 opus_val32 in32, out32, out32_hp, Y, X;
122 opus_val64 hp_ener = 0;
123 /* Internal variables and state are in Q10 format */
124 for( k = 0; k < len2; k++ ) {
125 /* Convert to Q10 */
126 in32 = in[ 2 * k ];
127
128 /* All-pass section for even input sample */
129 Y = SUB32( in32, S[ 0 ] );
130 X = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y);
131 out32 = ADD32( S[ 0 ], X );
132 S[ 0 ] = ADD32( in32, X );
133 out32_hp = out32;
134 /* Convert to Q10 */
135 in32 = in[ 2 * k + 1 ];
136
137 /* All-pass section for odd input sample, and add to output of previous section */
138 Y = SUB32( in32, S[ 1 ] );
139 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
140 out32 = ADD32( out32, S[ 1 ] );
141 out32 = ADD32( out32, X );
142 S[ 1 ] = ADD32( in32, X );
143
144 Y = SUB32( -in32, S[ 2 ] );
145 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
146 out32_hp = ADD32( out32_hp, S[ 2 ] );
147 out32_hp = ADD32( out32_hp, X );
148 S[ 2 ] = ADD32( -in32, X );
149
150 hp_ener += out32_hp*(opus_val64)out32_hp;
151 /* Add, convert back to int16 and store to output */
152 out[ k ] = HALF32(out32);
153 }
154 #ifdef FIXED_POINT
155 /* len2 can be up to 480, so we shift by 8 more to make it fit. */
156 hp_ener = hp_ener >> (2*SIG_SHIFT + 8);
157 #endif
158 return (opus_val32)hp_ener;
159 }
160
downmix_and_resample(downmix_func downmix,const void * _x,opus_val32 * y,opus_val32 S[3],int subframe,int offset,int c1,int c2,int C,int Fs)161 static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs)
162 {
163 VARDECL(opus_val32, tmp);
164 opus_val32 scale;
165 int j;
166 opus_val32 ret = 0;
167 SAVE_STACK;
168
169 if (subframe==0) return 0;
170 if (Fs == 48000)
171 {
172 subframe *= 2;
173 offset *= 2;
174 } else if (Fs == 16000) {
175 subframe = subframe*2/3;
176 offset = offset*2/3;
177 }
178 ALLOC(tmp, subframe, opus_val32);
179
180 downmix(_x, tmp, subframe, offset, c1, c2, C);
181 #ifdef FIXED_POINT
182 scale = (1<<SIG_SHIFT);
183 #else
184 scale = 1.f/32768;
185 #endif
186 if (c2==-2)
187 scale /= C;
188 else if (c2>-1)
189 scale /= 2;
190 for (j=0;j<subframe;j++)
191 tmp[j] *= scale;
192 if (Fs == 48000)
193 {
194 ret = silk_resampler_down2_hp(S, y, tmp, subframe);
195 } else if (Fs == 24000) {
196 OPUS_COPY(y, tmp, subframe);
197 } else if (Fs == 16000) {
198 VARDECL(opus_val32, tmp3x);
199 ALLOC(tmp3x, 3*subframe, opus_val32);
200 /* Don't do this at home! This resampler is horrible and it's only (barely)
201 usable for the purpose of the analysis because we don't care about all
202 the aliasing between 8 kHz and 12 kHz. */
203 for (j=0;j<subframe;j++)
204 {
205 tmp3x[3*j] = tmp[j];
206 tmp3x[3*j+1] = tmp[j];
207 tmp3x[3*j+2] = tmp[j];
208 }
209 silk_resampler_down2_hp(S, y, tmp3x, 3*subframe);
210 }
211 RESTORE_STACK;
212 return ret;
213 }
214
tonality_analysis_init(TonalityAnalysisState * tonal,opus_int32 Fs)215 void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs)
216 {
217 /* Initialize reusable fields. */
218 tonal->arch = opus_select_arch();
219 tonal->Fs = Fs;
220 /* Clear remaining fields. */
221 tonality_analysis_reset(tonal);
222 }
223
tonality_analysis_reset(TonalityAnalysisState * tonal)224 void tonality_analysis_reset(TonalityAnalysisState *tonal)
225 {
226 /* Clear non-reusable fields. */
227 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
228 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
229 }
230
tonality_get_info(TonalityAnalysisState * tonal,AnalysisInfo * info_out,int len)231 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
232 {
233 int pos;
234 int curr_lookahead;
235 float tonality_max;
236 float tonality_avg;
237 int tonality_count;
238 int i;
239 int pos0;
240 float prob_avg;
241 float prob_count;
242 float prob_min, prob_max;
243 float vad_prob;
244 int mpos, vpos;
245 int bandwidth_span;
246
247 pos = tonal->read_pos;
248 curr_lookahead = tonal->write_pos-tonal->read_pos;
249 if (curr_lookahead<0)
250 curr_lookahead += DETECT_SIZE;
251
252 /* On long frames, look at the second analysis window rather than the first. */
253 if (len > tonal->Fs/50 && pos != tonal->write_pos)
254 {
255 pos++;
256 if (pos==DETECT_SIZE)
257 pos=0;
258 }
259 if (pos == tonal->write_pos)
260 pos--;
261 if (pos<0)
262 pos = DETECT_SIZE-1;
263 pos0 = pos;
264 OPUS_COPY(info_out, &tonal->info[pos], 1);
265 tonality_max = tonality_avg = info_out->tonality;
266 tonality_count = 1;
267 /* Look at the neighbouring frames and pick largest bandwidth found (to be safe). */
268 bandwidth_span = 6;
269 /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */
270 for (i=0;i<3;i++)
271 {
272 pos++;
273 if (pos==DETECT_SIZE)
274 pos = 0;
275 if (pos == tonal->write_pos)
276 break;
277 tonality_max = MAX32(tonality_max, tonal->info[pos].tonality);
278 tonality_avg += tonal->info[pos].tonality;
279 tonality_count++;
280 info_out->bandwidth = IMAX(info_out->bandwidth, tonal->info[pos].bandwidth);
281 bandwidth_span--;
282 }
283 pos = pos0;
284 /* Look back in time to see if any has a wider bandwidth than the current frame. */
285 for (i=0;i<bandwidth_span;i++)
286 {
287 pos--;
288 if (pos < 0)
289 pos = DETECT_SIZE-1;
290 if (pos == tonal->write_pos)
291 break;
292 info_out->bandwidth = IMAX(info_out->bandwidth, tonal->info[pos].bandwidth);
293 }
294 info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f);
295
296 mpos = vpos = pos0;
297 /* If we have enough look-ahead, compensate for the ~5-frame delay in the music prob and
298 ~1 frame delay in the VAD prob. */
299 if (curr_lookahead > 15)
300 {
301 mpos += 5;
302 if (mpos>=DETECT_SIZE)
303 mpos -= DETECT_SIZE;
304 vpos += 1;
305 if (vpos>=DETECT_SIZE)
306 vpos -= DETECT_SIZE;
307 }
308
309 /* The following calculations attempt to minimize a "badness function"
310 for the transition. When switching from speech to music, the badness
311 of switching at frame k is
312 b_k = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
313 where
314 v_i is the activity probability (VAD) at frame i,
315 p_i is the music probability at frame i
316 T is the probability threshold for switching
317 S is the penalty for switching during active audio rather than silence
318 the current frame has index i=0
319
320 Rather than apply badness to directly decide when to switch, what we compute
321 instead is the threshold for which the optimal switching point is now. When
322 considering whether to switch now (frame 0) or at frame k, we have:
323 S*v_0 = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
324 which gives us:
325 T = ( \sum_{i=0}^{k-1} v_i*p_i + S*(v_k-v_0) ) / ( \sum_{i=0}^{k-1} v_i )
326 We take the min threshold across all positive values of k (up to the maximum
327 amount of lookahead we have) to give us the threshold for which the current
328 frame is the optimal switch point.
329
330 The last step is that we need to consider whether we want to switch at all.
331 For that we use the average of the music probability over the entire window.
332 If the threshold is higher than that average we're not going to
333 switch, so we compute a min with the average as well. The result of all these
334 min operations is music_prob_min, which gives the threshold for switching to music
335 if we're currently encoding for speech.
336
337 We do the exact opposite to compute music_prob_max which is used for switching
338 from music to speech.
339 */
340 prob_min = 1.f;
341 prob_max = 0.f;
342 vad_prob = tonal->info[vpos].activity_probability;
343 prob_count = MAX16(.1f, vad_prob);
344 prob_avg = MAX16(.1f, vad_prob)*tonal->info[mpos].music_prob;
345 while (1)
346 {
347 float pos_vad;
348 mpos++;
349 if (mpos==DETECT_SIZE)
350 mpos = 0;
351 if (mpos == tonal->write_pos)
352 break;
353 vpos++;
354 if (vpos==DETECT_SIZE)
355 vpos = 0;
356 if (vpos == tonal->write_pos)
357 break;
358 pos_vad = tonal->info[vpos].activity_probability;
359 prob_min = MIN16((prob_avg - TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_min);
360 prob_max = MAX16((prob_avg + TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_max);
361 prob_count += MAX16(.1f, pos_vad);
362 prob_avg += MAX16(.1f, pos_vad)*tonal->info[mpos].music_prob;
363 }
364 info_out->music_prob = prob_avg/prob_count;
365 prob_min = MIN16(prob_avg/prob_count, prob_min);
366 prob_max = MAX16(prob_avg/prob_count, prob_max);
367 prob_min = MAX16(prob_min, 0.f);
368 prob_max = MIN16(prob_max, 1.f);
369
370 /* If we don't have enough look-ahead, do our best to make a decent decision. */
371 if (curr_lookahead < 10)
372 {
373 float pmin, pmax;
374 pmin = prob_min;
375 pmax = prob_max;
376 pos = pos0;
377 /* Look for min/max in the past. */
378 for (i=0;i<IMIN(tonal->count-1, 15);i++)
379 {
380 pos--;
381 if (pos < 0)
382 pos = DETECT_SIZE-1;
383 pmin = MIN16(pmin, tonal->info[pos].music_prob);
384 pmax = MAX16(pmax, tonal->info[pos].music_prob);
385 }
386 /* Bias against switching on active audio. */
387 pmin = MAX16(0.f, pmin - .1f*vad_prob);
388 pmax = MIN16(1.f, pmax + .1f*vad_prob);
389 prob_min += (1.f-.1f*curr_lookahead)*(pmin - prob_min);
390 prob_max += (1.f-.1f*curr_lookahead)*(pmax - prob_max);
391 }
392 info_out->music_prob_min = prob_min;
393 info_out->music_prob_max = prob_max;
394
395 /* printf("%f %f %f %f %f\n", prob_min, prob_max, prob_avg/prob_count, vad_prob, info_out->music_prob); */
396 tonal->read_subframe += len/(tonal->Fs/400);
397 while (tonal->read_subframe>=8)
398 {
399 tonal->read_subframe -= 8;
400 tonal->read_pos++;
401 }
402 if (tonal->read_pos>=DETECT_SIZE)
403 tonal->read_pos-=DETECT_SIZE;
404 }
405
406 static const float std_feature_bias[9] = {
407 5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f,
408 2.163313f, 1.260756f, 1.116868f, 1.918795f
409 };
410
411 #define LEAKAGE_OFFSET 2.5f
412 #define LEAKAGE_SLOPE 2.f
413
414 #ifdef FIXED_POINT
415 /* For fixed-point, the input is +/-2^15 shifted up by SIG_SHIFT, so we need to
416 compensate for that in the energy. */
417 #define SCALE_COMPENS (1.f/((opus_int32)1<<(15+SIG_SHIFT)))
418 #define SCALE_ENER(e) ((SCALE_COMPENS*SCALE_COMPENS)*(e))
419 #else
420 #define SCALE_ENER(e) (e)
421 #endif
422
tonality_analysis(TonalityAnalysisState * tonal,const CELTMode * celt_mode,const void * x,int len,int offset,int c1,int c2,int C,int lsb_depth,downmix_func downmix)423 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
424 {
425 int i, b;
426 const kiss_fft_state *kfft;
427 VARDECL(kiss_fft_cpx, in);
428 VARDECL(kiss_fft_cpx, out);
429 int N = 480, N2=240;
430 float * OPUS_RESTRICT A = tonal->angle;
431 float * OPUS_RESTRICT dA = tonal->d_angle;
432 float * OPUS_RESTRICT d2A = tonal->d2_angle;
433 VARDECL(float, tonality);
434 VARDECL(float, noisiness);
435 float band_tonality[NB_TBANDS];
436 float logE[NB_TBANDS];
437 float BFCC[8];
438 float features[25];
439 float frame_tonality;
440 float max_frame_tonality;
441 /*float tw_sum=0;*/
442 float frame_noisiness;
443 const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
444 float slope=0;
445 float frame_stationarity;
446 float relativeE;
447 float frame_probs[2];
448 float alpha, alphaE, alphaE2;
449 float frame_loudness;
450 float bandwidth_mask;
451 int is_masked[NB_TBANDS+1];
452 int bandwidth=0;
453 float maxE = 0;
454 float noise_floor;
455 int remaining;
456 AnalysisInfo *info;
457 float hp_ener;
458 float tonality2[240];
459 float midE[8];
460 float spec_variability=0;
461 float band_log2[NB_TBANDS+1];
462 float leakage_from[NB_TBANDS+1];
463 float leakage_to[NB_TBANDS+1];
464 float layer_out[MAX_NEURONS];
465 float below_max_pitch;
466 float above_max_pitch;
467 SAVE_STACK;
468
469 alpha = 1.f/IMIN(10, 1+tonal->count);
470 alphaE = 1.f/IMIN(25, 1+tonal->count);
471 /* Noise floor related decay for bandwidth detection: -2.2 dB/second */
472 alphaE2 = 1.f/IMIN(100, 1+tonal->count);
473 if (tonal->count <= 1) alphaE2 = 1;
474
475 if (tonal->Fs == 48000)
476 {
477 /* len and offset are now at 24 kHz. */
478 len/= 2;
479 offset /= 2;
480 } else if (tonal->Fs == 16000) {
481 len = 3*len/2;
482 offset = 3*offset/2;
483 }
484
485 kfft = celt_mode->mdct.kfft[0];
486 if (tonal->count==0)
487 tonal->mem_fill = 240;
488 tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x,
489 &tonal->inmem[tonal->mem_fill], tonal->downmix_state,
490 IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs);
491 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
492 {
493 tonal->mem_fill += len;
494 /* Don't have enough to update the analysis */
495 RESTORE_STACK;
496 return;
497 }
498 hp_ener = tonal->hp_ener_accum;
499 info = &tonal->info[tonal->write_pos++];
500 if (tonal->write_pos>=DETECT_SIZE)
501 tonal->write_pos-=DETECT_SIZE;
502
503 ALLOC(in, 480, kiss_fft_cpx);
504 ALLOC(out, 480, kiss_fft_cpx);
505 ALLOC(tonality, 240, float);
506 ALLOC(noisiness, 240, float);
507 for (i=0;i<N2;i++)
508 {
509 float w = analysis_window[i];
510 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
511 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
512 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
513 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
514 }
515 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
516 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
517 tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x,
518 &tonal->inmem[240], tonal->downmix_state, remaining,
519 offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);
520 tonal->mem_fill = 240 + remaining;
521 opus_fft(kfft, in, out, tonal->arch);
522 #ifndef FIXED_POINT
523 /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
524 if (celt_isnan(out[0].r))
525 {
526 info->valid = 0;
527 RESTORE_STACK;
528 return;
529 }
530 #endif
531
532 for (i=1;i<N2;i++)
533 {
534 float X1r, X2r, X1i, X2i;
535 float angle, d_angle, d2_angle;
536 float angle2, d_angle2, d2_angle2;
537 float mod1, mod2, avg_mod;
538 X1r = (float)out[i].r+out[N-i].r;
539 X1i = (float)out[i].i-out[N-i].i;
540 X2r = (float)out[i].i+out[N-i].i;
541 X2i = (float)out[N-i].r-out[i].r;
542
543 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
544 d_angle = angle - A[i];
545 d2_angle = d_angle - dA[i];
546
547 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
548 d_angle2 = angle2 - angle;
549 d2_angle2 = d_angle2 - d_angle;
550
551 mod1 = d2_angle - (float)float2int(d2_angle);
552 noisiness[i] = ABS16(mod1);
553 mod1 *= mod1;
554 mod1 *= mod1;
555
556 mod2 = d2_angle2 - (float)float2int(d2_angle2);
557 noisiness[i] += ABS16(mod2);
558 mod2 *= mod2;
559 mod2 *= mod2;
560
561 avg_mod = .25f*(d2A[i]+mod1+2*mod2);
562 /* This introduces an extra delay of 2 frames in the detection. */
563 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
564 /* No delay on this detection, but it's less reliable. */
565 tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f;
566
567 A[i] = angle2;
568 dA[i] = d_angle2;
569 d2A[i] = mod2;
570 }
571 for (i=2;i<N2-1;i++)
572 {
573 float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1]));
574 tonality[i] = .9f*MAX32(tonality[i], tt-.1f);
575 }
576 frame_tonality = 0;
577 max_frame_tonality = 0;
578 /*tw_sum = 0;*/
579 info->activity = 0;
580 frame_noisiness = 0;
581 frame_stationarity = 0;
582 if (!tonal->count)
583 {
584 for (b=0;b<NB_TBANDS;b++)
585 {
586 tonal->lowE[b] = 1e10;
587 tonal->highE[b] = -1e10;
588 }
589 }
590 relativeE = 0;
591 frame_loudness = 0;
592 /* The energy of the very first band is special because of DC. */
593 {
594 float E = 0;
595 float X1r, X2r;
596 X1r = 2*(float)out[0].r;
597 X2r = 2*(float)out[0].i;
598 E = X1r*X1r + X2r*X2r;
599 for (i=1;i<4;i++)
600 {
601 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
602 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
603 E += binE;
604 }
605 E = SCALE_ENER(E);
606 band_log2[0] = .5f*1.442695f*(float)log(E+1e-10f);
607 }
608 for (b=0;b<NB_TBANDS;b++)
609 {
610 float E=0, tE=0, nE=0;
611 float L1, L2;
612 float stationarity;
613 for (i=tbands[b];i<tbands[b+1];i++)
614 {
615 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
616 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
617 binE = SCALE_ENER(binE);
618 E += binE;
619 tE += binE*MAX32(0, tonality[i]);
620 nE += binE*2.f*(.5f-noisiness[i]);
621 }
622 #ifndef FIXED_POINT
623 /* Check for extreme band energies that could cause NaNs later. */
624 if (!(E<1e9f) || celt_isnan(E))
625 {
626 info->valid = 0;
627 RESTORE_STACK;
628 return;
629 }
630 #endif
631
632 tonal->E[tonal->E_count][b] = E;
633 frame_noisiness += nE/(1e-15f+E);
634
635 frame_loudness += (float)sqrt(E+1e-10f);
636 logE[b] = (float)log(E+1e-10f);
637 band_log2[b+1] = .5f*1.442695f*(float)log(E+1e-10f);
638 tonal->logE[tonal->E_count][b] = logE[b];
639 if (tonal->count==0)
640 tonal->highE[b] = tonal->lowE[b] = logE[b];
641 if (tonal->highE[b] > tonal->lowE[b] + 7.5)
642 {
643 if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b])
644 tonal->highE[b] -= .01f;
645 else
646 tonal->lowE[b] += .01f;
647 }
648 if (logE[b] > tonal->highE[b])
649 {
650 tonal->highE[b] = logE[b];
651 tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]);
652 } else if (logE[b] < tonal->lowE[b])
653 {
654 tonal->lowE[b] = logE[b];
655 tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]);
656 }
657 relativeE += (logE[b]-tonal->lowE[b])/(1e-15f + (tonal->highE[b]-tonal->lowE[b]));
658
659 L1=L2=0;
660 for (i=0;i<NB_FRAMES;i++)
661 {
662 L1 += (float)sqrt(tonal->E[i][b]);
663 L2 += tonal->E[i][b];
664 }
665
666 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
667 stationarity *= stationarity;
668 stationarity *= stationarity;
669 frame_stationarity += stationarity;
670 /*band_tonality[b] = tE/(1e-15+E)*/;
671 band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
672 #if 0
673 if (b>=NB_TONAL_SKIP_BANDS)
674 {
675 frame_tonality += tweight[b]*band_tonality[b];
676 tw_sum += tweight[b];
677 }
678 #else
679 frame_tonality += band_tonality[b];
680 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
681 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
682 #endif
683 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
684 slope += band_tonality[b]*(b-8);
685 /*printf("%f %f ", band_tonality[b], stationarity);*/
686 tonal->prev_band_tonality[b] = band_tonality[b];
687 }
688
689 leakage_from[0] = band_log2[0];
690 leakage_to[0] = band_log2[0] - LEAKAGE_OFFSET;
691 for (b=1;b<NB_TBANDS+1;b++)
692 {
693 float leak_slope = LEAKAGE_SLOPE*(tbands[b]-tbands[b-1])/4;
694 leakage_from[b] = MIN16(leakage_from[b-1]+leak_slope, band_log2[b]);
695 leakage_to[b] = MAX16(leakage_to[b-1]-leak_slope, band_log2[b]-LEAKAGE_OFFSET);
696 }
697 for (b=NB_TBANDS-2;b>=0;b--)
698 {
699 float leak_slope = LEAKAGE_SLOPE*(tbands[b+1]-tbands[b])/4;
700 leakage_from[b] = MIN16(leakage_from[b+1]+leak_slope, leakage_from[b]);
701 leakage_to[b] = MAX16(leakage_to[b+1]-leak_slope, leakage_to[b]);
702 }
703 celt_assert(NB_TBANDS+1 <= LEAK_BANDS);
704 for (b=0;b<NB_TBANDS+1;b++)
705 {
706 /* leak_boost[] is made up of two terms. The first, based on leakage_to[],
707 represents the boost needed to overcome the amount of analysis leakage
708 cause in a weaker band b by louder neighbouring bands.
709 The second, based on leakage_from[], applies to a loud band b for
710 which the quantization noise causes synthesis leakage to the weaker
711 neighbouring bands. */
712 float boost = MAX16(0, leakage_to[b] - band_log2[b]) +
713 MAX16(0, band_log2[b] - (leakage_from[b]+LEAKAGE_OFFSET));
714 info->leak_boost[b] = IMIN(255, (int)floor(.5 + 64.f*boost));
715 }
716 for (;b<LEAK_BANDS;b++) info->leak_boost[b] = 0;
717
718 for (i=0;i<NB_FRAMES;i++)
719 {
720 int j;
721 float mindist = 1e15f;
722 for (j=0;j<NB_FRAMES;j++)
723 {
724 int k;
725 float dist=0;
726 for (k=0;k<NB_TBANDS;k++)
727 {
728 float tmp;
729 tmp = tonal->logE[i][k] - tonal->logE[j][k];
730 dist += tmp*tmp;
731 }
732 if (j!=i)
733 mindist = MIN32(mindist, dist);
734 }
735 spec_variability += mindist;
736 }
737 spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS);
738 bandwidth_mask = 0;
739 bandwidth = 0;
740 maxE = 0;
741 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
742 noise_floor *= noise_floor;
743 below_max_pitch=0;
744 above_max_pitch=0;
745 for (b=0;b<NB_TBANDS;b++)
746 {
747 float E=0;
748 float Em;
749 int band_start, band_end;
750 /* Keep a margin of 300 Hz for aliasing */
751 band_start = tbands[b];
752 band_end = tbands[b+1];
753 for (i=band_start;i<band_end;i++)
754 {
755 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
756 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
757 E += binE;
758 }
759 E = SCALE_ENER(E);
760 maxE = MAX32(maxE, E);
761 if (band_start < 64)
762 {
763 below_max_pitch += E;
764 } else {
765 above_max_pitch += E;
766 }
767 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
768 Em = MAX32(E, tonal->meanE[b]);
769 /* Consider the band "active" only if all these conditions are met:
770 1) less than 90 dB below the peak band (maximal masking possible considering
771 both the ATH and the loudness-dependent slope of the spreading function)
772 2) above the PCM quantization noise floor
773 We use b+1 because the first CELT band isn't included in tbands[]
774 */
775 if (E*1e9f > maxE && (Em > 3*noise_floor*(band_end-band_start) || E > noise_floor*(band_end-band_start)))
776 bandwidth = b+1;
777 /* Check if the band is masked (see below). */
778 is_masked[b] = E < (tonal->prev_bandwidth >= b+1 ? .01f : .05f)*bandwidth_mask;
779 /* Use a simple follower with 13 dB/Bark slope for spreading function. */
780 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
781 }
782 /* Special case for the last two bands, for which we don't have spectrum but only
783 the energy above 12 kHz. The difficulty here is that the high-pass we use
784 leaks some LF energy, so we need to increase the threshold without accidentally cutting
785 off the band. */
786 if (tonal->Fs == 48000) {
787 float noise_ratio;
788 float Em;
789 float E = hp_ener*(1.f/(60*60));
790 noise_ratio = tonal->prev_bandwidth==20 ? 10.f : 30.f;
791
792 #ifdef FIXED_POINT
793 /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */
794 E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE);
795 #endif
796 above_max_pitch += E;
797 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
798 Em = MAX32(E, tonal->meanE[b]);
799 if (Em > 3*noise_ratio*noise_floor*160 || E > noise_ratio*noise_floor*160)
800 bandwidth = 20;
801 /* Check if the band is masked (see below). */
802 is_masked[b] = E < (tonal->prev_bandwidth == 20 ? .01f : .05f)*bandwidth_mask;
803 }
804 if (above_max_pitch > below_max_pitch)
805 info->max_pitch_ratio = below_max_pitch/above_max_pitch;
806 else
807 info->max_pitch_ratio = 1;
808 /* In some cases, resampling aliasing can create a small amount of energy in the first band
809 being cut. So if the last band is masked, we don't include it. */
810 if (bandwidth == 20 && is_masked[NB_TBANDS])
811 bandwidth-=2;
812 else if (bandwidth > 0 && bandwidth <= NB_TBANDS && is_masked[bandwidth-1])
813 bandwidth--;
814 if (tonal->count<=2)
815 bandwidth = 20;
816 frame_loudness = 20*(float)log10(frame_loudness);
817 tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness);
818 tonal->lowECount *= (1-alphaE);
819 if (frame_loudness < tonal->Etracker-30)
820 tonal->lowECount += alphaE;
821
822 for (i=0;i<8;i++)
823 {
824 float sum=0;
825 for (b=0;b<16;b++)
826 sum += dct_table[i*16+b]*logE[b];
827 BFCC[i] = sum;
828 }
829 for (i=0;i<8;i++)
830 {
831 float sum=0;
832 for (b=0;b<16;b++)
833 sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]);
834 midE[i] = sum;
835 }
836
837 frame_stationarity /= NB_TBANDS;
838 relativeE /= NB_TBANDS;
839 if (tonal->count<10)
840 relativeE = .5f;
841 frame_noisiness /= NB_TBANDS;
842 #if 1
843 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
844 #else
845 info->activity = .5*(1+frame_noisiness-frame_stationarity);
846 #endif
847 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
848 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
849 tonal->prev_tonality = frame_tonality;
850
851 slope /= 8*8;
852 info->tonality_slope = slope;
853
854 tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
855 tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX);
856 info->tonality = frame_tonality;
857
858 for (i=0;i<4;i++)
859 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
860
861 for (i=0;i<4;i++)
862 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
863
864 for (i=0;i<4;i++)
865 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
866 for (i=0;i<3;i++)
867 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
868
869 if (tonal->count > 5)
870 {
871 for (i=0;i<9;i++)
872 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
873 }
874 for (i=0;i<4;i++)
875 features[i] = BFCC[i]-midE[i];
876
877 for (i=0;i<8;i++)
878 {
879 tonal->mem[i+24] = tonal->mem[i+16];
880 tonal->mem[i+16] = tonal->mem[i+8];
881 tonal->mem[i+8] = tonal->mem[i];
882 tonal->mem[i] = BFCC[i];
883 }
884 for (i=0;i<9;i++)
885 features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
886 features[18] = spec_variability - 0.78f;
887 features[20] = info->tonality - 0.154723f;
888 features[21] = info->activity - 0.724643f;
889 features[22] = frame_stationarity - 0.743717f;
890 features[23] = info->tonality_slope + 0.069216f;
891 features[24] = tonal->lowECount - 0.067930f;
892
893 compute_dense(&layer0, layer_out, features);
894 compute_gru(&layer1, tonal->rnn_state, layer_out);
895 compute_dense(&layer2, frame_probs, tonal->rnn_state);
896
897 /* Probability of speech or music vs noise */
898 info->activity_probability = frame_probs[1];
899 info->music_prob = frame_probs[0];
900
901 /*printf("%f %f %f\n", frame_probs[0], frame_probs[1], info->music_prob);*/
902 #ifdef MLP_TRAINING
903 for (i=0;i<25;i++)
904 printf("%f ", features[i]);
905 printf("\n");
906 #endif
907
908 info->bandwidth = bandwidth;
909 tonal->prev_bandwidth = bandwidth;
910 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
911 info->noisiness = frame_noisiness;
912 info->valid = 1;
913 RESTORE_STACK;
914 }
915
run_analysis(TonalityAnalysisState * analysis,const CELTMode * celt_mode,const void * analysis_pcm,int analysis_frame_size,int frame_size,int c1,int c2,int C,opus_int32 Fs,int lsb_depth,downmix_func downmix,AnalysisInfo * analysis_info)916 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
917 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
918 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
919 {
920 int offset;
921 int pcm_len;
922
923 analysis_frame_size -= analysis_frame_size&1;
924 if (analysis_pcm != NULL)
925 {
926 /* Avoid overflow/wrap-around of the analysis buffer */
927 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size);
928
929 pcm_len = analysis_frame_size - analysis->analysis_offset;
930 offset = analysis->analysis_offset;
931 while (pcm_len>0) {
932 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
933 offset += Fs/50;
934 pcm_len -= Fs/50;
935 }
936 analysis->analysis_offset = analysis_frame_size;
937
938 analysis->analysis_offset -= frame_size;
939 }
940
941 analysis_info->valid = 0;
942 tonality_get_info(analysis, analysis_info, frame_size);
943 }
944
945 #endif /* DISABLE_FLOAT_API */
946