1 /* Copyright (c) 2011 Xiph.Org Foundation
2 Written by Jean-Marc Valin */
3 /*
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
19 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #define ANALYSIS_C
33
34 #ifdef MLP_TRAINING
35 #include <stdio.h>
36 #endif
37
38 #include "mathops.h"
39 #include "kiss_fft.h"
40 #include "celt.h"
41 #include "modes.h"
42 #include "arch.h"
43 #include "quant_bands.h"
44 #include "analysis.h"
45 #include "mlp.h"
46 #include "stack_alloc.h"
47 #include "float_cast.h"
48
49 #ifndef M_PI
50 #define M_PI 3.141592653
51 #endif
52
53 #ifndef DISABLE_FLOAT_API
54
55 #define TRANSITION_PENALTY 10
56
57 static const float dct_table[128] = {
58 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
59 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
60 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
61 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
62 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
63 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
64 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
65 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
66 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
67 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
68 0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
69 -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
70 0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
71 -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
72 0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
73 0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
74 };
75
76 static const float analysis_window[240] = {
77 0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
78 0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
79 0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
80 0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
81 0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
82 0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
83 0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
84 0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
85 0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
86 0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
87 0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
88 0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
89 0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
90 0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
91 0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
92 0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
93 0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
94 0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
95 0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
96 0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
97 0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
98 0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
99 0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
100 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
101 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
102 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
103 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
104 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
105 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
106 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
107 };
108
109 static const int tbands[NB_TBANDS+1] = {
110 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240
111 };
112
113 #define NB_TONAL_SKIP_BANDS 9
114
silk_resampler_down2_hp(opus_val32 * S,opus_val32 * out,const opus_val32 * in,int inLen)115 static opus_val32 silk_resampler_down2_hp(
116 opus_val32 *S, /* I/O State vector [ 2 ] */
117 opus_val32 *out, /* O Output signal [ floor(len/2) ] */
118 const opus_val32 *in, /* I Input signal [ len ] */
119 int inLen /* I Number of input samples */
120 )
121 {
122 int k, len2 = inLen/2;
123 opus_val32 in32, out32, out32_hp, Y, X;
124 opus_val64 hp_ener = 0;
125 /* Internal variables and state are in Q10 format */
126 for( k = 0; k < len2; k++ ) {
127 /* Convert to Q10 */
128 in32 = in[ 2 * k ];
129
130 /* All-pass section for even input sample */
131 Y = SUB32( in32, S[ 0 ] );
132 X = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y);
133 out32 = ADD32( S[ 0 ], X );
134 S[ 0 ] = ADD32( in32, X );
135 out32_hp = out32;
136 /* Convert to Q10 */
137 in32 = in[ 2 * k + 1 ];
138
139 /* All-pass section for odd input sample, and add to output of previous section */
140 Y = SUB32( in32, S[ 1 ] );
141 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
142 out32 = ADD32( out32, S[ 1 ] );
143 out32 = ADD32( out32, X );
144 S[ 1 ] = ADD32( in32, X );
145
146 Y = SUB32( -in32, S[ 2 ] );
147 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
148 out32_hp = ADD32( out32_hp, S[ 2 ] );
149 out32_hp = ADD32( out32_hp, X );
150 S[ 2 ] = ADD32( -in32, X );
151
152 hp_ener += out32_hp*(opus_val64)out32_hp;
153 /* Add, convert back to int16 and store to output */
154 out[ k ] = HALF32(out32);
155 }
156 #ifdef FIXED_POINT
157 /* len2 can be up to 480, so we shift by 8 more to make it fit. */
158 hp_ener = hp_ener >> (2*SIG_SHIFT + 8);
159 #endif
160 return (opus_val32)hp_ener;
161 }
162
downmix_and_resample(downmix_func downmix,const void * _x,opus_val32 * y,opus_val32 S[3],int subframe,int offset,int c1,int c2,int C,int Fs)163 static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs)
164 {
165 VARDECL(opus_val32, tmp);
166 opus_val32 scale;
167 int j;
168 opus_val32 ret = 0;
169 SAVE_STACK;
170
171 if (subframe==0) return 0;
172 if (Fs == 48000)
173 {
174 subframe *= 2;
175 offset *= 2;
176 } else if (Fs == 16000) {
177 subframe = subframe*2/3;
178 offset = offset*2/3;
179 }
180 ALLOC(tmp, subframe, opus_val32);
181
182 downmix(_x, tmp, subframe, offset, c1, c2, C);
183 #ifdef FIXED_POINT
184 scale = (1<<SIG_SHIFT);
185 #else
186 scale = 1.f/32768;
187 #endif
188 if (c2==-2)
189 scale /= C;
190 else if (c2>-1)
191 scale /= 2;
192 for (j=0;j<subframe;j++)
193 tmp[j] *= scale;
194 if (Fs == 48000)
195 {
196 ret = silk_resampler_down2_hp(S, y, tmp, subframe);
197 } else if (Fs == 24000) {
198 OPUS_COPY(y, tmp, subframe);
199 } else if (Fs == 16000) {
200 VARDECL(opus_val32, tmp3x);
201 ALLOC(tmp3x, 3*subframe, opus_val32);
202 /* Don't do this at home! This resampler is horrible and it's only (barely)
203 usable for the purpose of the analysis because we don't care about all
204 the aliasing between 8 kHz and 12 kHz. */
205 for (j=0;j<subframe;j++)
206 {
207 tmp3x[3*j] = tmp[j];
208 tmp3x[3*j+1] = tmp[j];
209 tmp3x[3*j+2] = tmp[j];
210 }
211 silk_resampler_down2_hp(S, y, tmp3x, 3*subframe);
212 }
213 RESTORE_STACK;
214 return ret;
215 }
216
tonality_analysis_init(TonalityAnalysisState * tonal,opus_int32 Fs)217 void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs)
218 {
219 /* Initialize reusable fields. */
220 tonal->arch = opus_select_arch();
221 tonal->Fs = Fs;
222 /* Clear remaining fields. */
223 tonality_analysis_reset(tonal);
224 }
225
tonality_analysis_reset(TonalityAnalysisState * tonal)226 void tonality_analysis_reset(TonalityAnalysisState *tonal)
227 {
228 /* Clear non-reusable fields. */
229 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
230 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
231 }
232
tonality_get_info(TonalityAnalysisState * tonal,AnalysisInfo * info_out,int len)233 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
234 {
235 int pos;
236 int curr_lookahead;
237 float tonality_max;
238 float tonality_avg;
239 int tonality_count;
240 int i;
241 int pos0;
242 float prob_avg;
243 float prob_count;
244 float prob_min, prob_max;
245 float vad_prob;
246 int mpos, vpos;
247 int bandwidth_span;
248
249 pos = tonal->read_pos;
250 curr_lookahead = tonal->write_pos-tonal->read_pos;
251 if (curr_lookahead<0)
252 curr_lookahead += DETECT_SIZE;
253
254 tonal->read_subframe += len/(tonal->Fs/400);
255 while (tonal->read_subframe>=8)
256 {
257 tonal->read_subframe -= 8;
258 tonal->read_pos++;
259 }
260 if (tonal->read_pos>=DETECT_SIZE)
261 tonal->read_pos-=DETECT_SIZE;
262
263 /* On long frames, look at the second analysis window rather than the first. */
264 if (len > tonal->Fs/50 && pos != tonal->write_pos)
265 {
266 pos++;
267 if (pos==DETECT_SIZE)
268 pos=0;
269 }
270 if (pos == tonal->write_pos)
271 pos--;
272 if (pos<0)
273 pos = DETECT_SIZE-1;
274 pos0 = pos;
275 OPUS_COPY(info_out, &tonal->info[pos], 1);
276 if (!info_out->valid)
277 return;
278 tonality_max = tonality_avg = info_out->tonality;
279 tonality_count = 1;
280 /* Look at the neighbouring frames and pick largest bandwidth found (to be safe). */
281 bandwidth_span = 6;
282 /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */
283 for (i=0;i<3;i++)
284 {
285 pos++;
286 if (pos==DETECT_SIZE)
287 pos = 0;
288 if (pos == tonal->write_pos)
289 break;
290 tonality_max = MAX32(tonality_max, tonal->info[pos].tonality);
291 tonality_avg += tonal->info[pos].tonality;
292 tonality_count++;
293 info_out->bandwidth = IMAX(info_out->bandwidth, tonal->info[pos].bandwidth);
294 bandwidth_span--;
295 }
296 pos = pos0;
297 /* Look back in time to see if any has a wider bandwidth than the current frame. */
298 for (i=0;i<bandwidth_span;i++)
299 {
300 pos--;
301 if (pos < 0)
302 pos = DETECT_SIZE-1;
303 if (pos == tonal->write_pos)
304 break;
305 info_out->bandwidth = IMAX(info_out->bandwidth, tonal->info[pos].bandwidth);
306 }
307 info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f);
308
309 mpos = vpos = pos0;
310 /* If we have enough look-ahead, compensate for the ~5-frame delay in the music prob and
311 ~1 frame delay in the VAD prob. */
312 if (curr_lookahead > 15)
313 {
314 mpos += 5;
315 if (mpos>=DETECT_SIZE)
316 mpos -= DETECT_SIZE;
317 vpos += 1;
318 if (vpos>=DETECT_SIZE)
319 vpos -= DETECT_SIZE;
320 }
321
322 /* The following calculations attempt to minimize a "badness function"
323 for the transition. When switching from speech to music, the badness
324 of switching at frame k is
325 b_k = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
326 where
327 v_i is the activity probability (VAD) at frame i,
328 p_i is the music probability at frame i
329 T is the probability threshold for switching
330 S is the penalty for switching during active audio rather than silence
331 the current frame has index i=0
332
333 Rather than apply badness to directly decide when to switch, what we compute
334 instead is the threshold for which the optimal switching point is now. When
335 considering whether to switch now (frame 0) or at frame k, we have:
336 S*v_0 = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
337 which gives us:
338 T = ( \sum_{i=0}^{k-1} v_i*p_i + S*(v_k-v_0) ) / ( \sum_{i=0}^{k-1} v_i )
339 We take the min threshold across all positive values of k (up to the maximum
340 amount of lookahead we have) to give us the threshold for which the current
341 frame is the optimal switch point.
342
343 The last step is that we need to consider whether we want to switch at all.
344 For that we use the average of the music probability over the entire window.
345 If the threshold is higher than that average we're not going to
346 switch, so we compute a min with the average as well. The result of all these
347 min operations is music_prob_min, which gives the threshold for switching to music
348 if we're currently encoding for speech.
349
350 We do the exact opposite to compute music_prob_max which is used for switching
351 from music to speech.
352 */
353 prob_min = 1.f;
354 prob_max = 0.f;
355 vad_prob = tonal->info[vpos].activity_probability;
356 prob_count = MAX16(.1f, vad_prob);
357 prob_avg = MAX16(.1f, vad_prob)*tonal->info[mpos].music_prob;
358 while (1)
359 {
360 float pos_vad;
361 mpos++;
362 if (mpos==DETECT_SIZE)
363 mpos = 0;
364 if (mpos == tonal->write_pos)
365 break;
366 vpos++;
367 if (vpos==DETECT_SIZE)
368 vpos = 0;
369 if (vpos == tonal->write_pos)
370 break;
371 pos_vad = tonal->info[vpos].activity_probability;
372 prob_min = MIN16((prob_avg - TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_min);
373 prob_max = MAX16((prob_avg + TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_max);
374 prob_count += MAX16(.1f, pos_vad);
375 prob_avg += MAX16(.1f, pos_vad)*tonal->info[mpos].music_prob;
376 }
377 info_out->music_prob = prob_avg/prob_count;
378 prob_min = MIN16(prob_avg/prob_count, prob_min);
379 prob_max = MAX16(prob_avg/prob_count, prob_max);
380 prob_min = MAX16(prob_min, 0.f);
381 prob_max = MIN16(prob_max, 1.f);
382
383 /* If we don't have enough look-ahead, do our best to make a decent decision. */
384 if (curr_lookahead < 10)
385 {
386 float pmin, pmax;
387 pmin = prob_min;
388 pmax = prob_max;
389 pos = pos0;
390 /* Look for min/max in the past. */
391 for (i=0;i<IMIN(tonal->count-1, 15);i++)
392 {
393 pos--;
394 if (pos < 0)
395 pos = DETECT_SIZE-1;
396 pmin = MIN16(pmin, tonal->info[pos].music_prob);
397 pmax = MAX16(pmax, tonal->info[pos].music_prob);
398 }
399 /* Bias against switching on active audio. */
400 pmin = MAX16(0.f, pmin - .1f*vad_prob);
401 pmax = MIN16(1.f, pmax + .1f*vad_prob);
402 prob_min += (1.f-.1f*curr_lookahead)*(pmin - prob_min);
403 prob_max += (1.f-.1f*curr_lookahead)*(pmax - prob_max);
404 }
405 info_out->music_prob_min = prob_min;
406 info_out->music_prob_max = prob_max;
407
408 /* printf("%f %f %f %f %f\n", prob_min, prob_max, prob_avg/prob_count, vad_prob, info_out->music_prob); */
409 }
410
411 static const float std_feature_bias[9] = {
412 5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f,
413 2.163313f, 1.260756f, 1.116868f, 1.918795f
414 };
415
416 #define LEAKAGE_OFFSET 2.5f
417 #define LEAKAGE_SLOPE 2.f
418
419 #ifdef FIXED_POINT
420 /* For fixed-point, the input is +/-2^15 shifted up by SIG_SHIFT, so we need to
421 compensate for that in the energy. */
422 #define SCALE_COMPENS (1.f/((opus_int32)1<<(15+SIG_SHIFT)))
423 #define SCALE_ENER(e) ((SCALE_COMPENS*SCALE_COMPENS)*(e))
424 #else
425 #define SCALE_ENER(e) (e)
426 #endif
427
428 #ifdef FIXED_POINT
is_digital_silence32(const opus_val32 * pcm,int frame_size,int channels,int lsb_depth)429 static int is_digital_silence32(const opus_val32* pcm, int frame_size, int channels, int lsb_depth)
430 {
431 int silence = 0;
432 opus_val32 sample_max = 0;
433 #ifdef MLP_TRAINING
434 return 0;
435 #endif
436 sample_max = celt_maxabs32(pcm, frame_size*channels);
437
438 silence = (sample_max == 0);
439 (void)lsb_depth;
440 return silence;
441 }
442 #else
443 #define is_digital_silence32(pcm, frame_size, channels, lsb_depth) is_digital_silence(pcm, frame_size, channels, lsb_depth)
444 #endif
445
tonality_analysis(TonalityAnalysisState * tonal,const CELTMode * celt_mode,const void * x,int len,int offset,int c1,int c2,int C,int lsb_depth,downmix_func downmix)446 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
447 {
448 int i, b;
449 const kiss_fft_state *kfft;
450 VARDECL(kiss_fft_cpx, in);
451 VARDECL(kiss_fft_cpx, out);
452 int N = 480, N2=240;
453 float * OPUS_RESTRICT A = tonal->angle;
454 float * OPUS_RESTRICT dA = tonal->d_angle;
455 float * OPUS_RESTRICT d2A = tonal->d2_angle;
456 VARDECL(float, tonality);
457 VARDECL(float, noisiness);
458 float band_tonality[NB_TBANDS];
459 float logE[NB_TBANDS];
460 float BFCC[8];
461 float features[25];
462 float frame_tonality;
463 float max_frame_tonality;
464 /*float tw_sum=0;*/
465 float frame_noisiness;
466 const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
467 float slope=0;
468 float frame_stationarity;
469 float relativeE;
470 float frame_probs[2];
471 float alpha, alphaE, alphaE2;
472 float frame_loudness;
473 float bandwidth_mask;
474 int is_masked[NB_TBANDS+1];
475 int bandwidth=0;
476 float maxE = 0;
477 float noise_floor;
478 int remaining;
479 AnalysisInfo *info;
480 float hp_ener;
481 float tonality2[240];
482 float midE[8];
483 float spec_variability=0;
484 float band_log2[NB_TBANDS+1];
485 float leakage_from[NB_TBANDS+1];
486 float leakage_to[NB_TBANDS+1];
487 float layer_out[MAX_NEURONS];
488 float below_max_pitch;
489 float above_max_pitch;
490 int is_silence;
491 SAVE_STACK;
492
493 if (!tonal->initialized)
494 {
495 tonal->mem_fill = 240;
496 tonal->initialized = 1;
497 }
498 alpha = 1.f/IMIN(10, 1+tonal->count);
499 alphaE = 1.f/IMIN(25, 1+tonal->count);
500 /* Noise floor related decay for bandwidth detection: -2.2 dB/second */
501 alphaE2 = 1.f/IMIN(100, 1+tonal->count);
502 if (tonal->count <= 1) alphaE2 = 1;
503
504 if (tonal->Fs == 48000)
505 {
506 /* len and offset are now at 24 kHz. */
507 len/= 2;
508 offset /= 2;
509 } else if (tonal->Fs == 16000) {
510 len = 3*len/2;
511 offset = 3*offset/2;
512 }
513
514 kfft = celt_mode->mdct.kfft[0];
515 tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x,
516 &tonal->inmem[tonal->mem_fill], tonal->downmix_state,
517 IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs);
518 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
519 {
520 tonal->mem_fill += len;
521 /* Don't have enough to update the analysis */
522 RESTORE_STACK;
523 return;
524 }
525 hp_ener = tonal->hp_ener_accum;
526 info = &tonal->info[tonal->write_pos++];
527 if (tonal->write_pos>=DETECT_SIZE)
528 tonal->write_pos-=DETECT_SIZE;
529
530 is_silence = is_digital_silence32(tonal->inmem, ANALYSIS_BUF_SIZE, 1, lsb_depth);
531
532 ALLOC(in, 480, kiss_fft_cpx);
533 ALLOC(out, 480, kiss_fft_cpx);
534 ALLOC(tonality, 240, float);
535 ALLOC(noisiness, 240, float);
536 for (i=0;i<N2;i++)
537 {
538 float w = analysis_window[i];
539 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
540 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
541 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
542 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
543 }
544 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
545 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
546 tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x,
547 &tonal->inmem[240], tonal->downmix_state, remaining,
548 offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);
549 tonal->mem_fill = 240 + remaining;
550 if (is_silence)
551 {
552 /* On silence, copy the previous analysis. */
553 int prev_pos = tonal->write_pos-2;
554 if (prev_pos < 0)
555 prev_pos += DETECT_SIZE;
556 OPUS_COPY(info, &tonal->info[prev_pos], 1);
557 RESTORE_STACK;
558 return;
559 }
560 opus_fft(kfft, in, out, tonal->arch);
561 #ifndef FIXED_POINT
562 /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
563 if (celt_isnan(out[0].r))
564 {
565 info->valid = 0;
566 RESTORE_STACK;
567 return;
568 }
569 #endif
570
571 for (i=1;i<N2;i++)
572 {
573 float X1r, X2r, X1i, X2i;
574 float angle, d_angle, d2_angle;
575 float angle2, d_angle2, d2_angle2;
576 float mod1, mod2, avg_mod;
577 X1r = (float)out[i].r+out[N-i].r;
578 X1i = (float)out[i].i-out[N-i].i;
579 X2r = (float)out[i].i+out[N-i].i;
580 X2i = (float)out[N-i].r-out[i].r;
581
582 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
583 d_angle = angle - A[i];
584 d2_angle = d_angle - dA[i];
585
586 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
587 d_angle2 = angle2 - angle;
588 d2_angle2 = d_angle2 - d_angle;
589
590 mod1 = d2_angle - (float)float2int(d2_angle);
591 noisiness[i] = ABS16(mod1);
592 mod1 *= mod1;
593 mod1 *= mod1;
594
595 mod2 = d2_angle2 - (float)float2int(d2_angle2);
596 noisiness[i] += ABS16(mod2);
597 mod2 *= mod2;
598 mod2 *= mod2;
599
600 avg_mod = .25f*(d2A[i]+mod1+2*mod2);
601 /* This introduces an extra delay of 2 frames in the detection. */
602 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
603 /* No delay on this detection, but it's less reliable. */
604 tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f;
605
606 A[i] = angle2;
607 dA[i] = d_angle2;
608 d2A[i] = mod2;
609 }
610 for (i=2;i<N2-1;i++)
611 {
612 float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1]));
613 tonality[i] = .9f*MAX32(tonality[i], tt-.1f);
614 }
615 frame_tonality = 0;
616 max_frame_tonality = 0;
617 /*tw_sum = 0;*/
618 info->activity = 0;
619 frame_noisiness = 0;
620 frame_stationarity = 0;
621 if (!tonal->count)
622 {
623 for (b=0;b<NB_TBANDS;b++)
624 {
625 tonal->lowE[b] = 1e10;
626 tonal->highE[b] = -1e10;
627 }
628 }
629 relativeE = 0;
630 frame_loudness = 0;
631 /* The energy of the very first band is special because of DC. */
632 {
633 float E = 0;
634 float X1r, X2r;
635 X1r = 2*(float)out[0].r;
636 X2r = 2*(float)out[0].i;
637 E = X1r*X1r + X2r*X2r;
638 for (i=1;i<4;i++)
639 {
640 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
641 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
642 E += binE;
643 }
644 E = SCALE_ENER(E);
645 band_log2[0] = .5f*1.442695f*(float)log(E+1e-10f);
646 }
647 for (b=0;b<NB_TBANDS;b++)
648 {
649 float E=0, tE=0, nE=0;
650 float L1, L2;
651 float stationarity;
652 for (i=tbands[b];i<tbands[b+1];i++)
653 {
654 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
655 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
656 binE = SCALE_ENER(binE);
657 E += binE;
658 tE += binE*MAX32(0, tonality[i]);
659 nE += binE*2.f*(.5f-noisiness[i]);
660 }
661 #ifndef FIXED_POINT
662 /* Check for extreme band energies that could cause NaNs later. */
663 if (!(E<1e9f) || celt_isnan(E))
664 {
665 info->valid = 0;
666 RESTORE_STACK;
667 return;
668 }
669 #endif
670
671 tonal->E[tonal->E_count][b] = E;
672 frame_noisiness += nE/(1e-15f+E);
673
674 frame_loudness += (float)sqrt(E+1e-10f);
675 logE[b] = (float)log(E+1e-10f);
676 band_log2[b+1] = .5f*1.442695f*(float)log(E+1e-10f);
677 tonal->logE[tonal->E_count][b] = logE[b];
678 if (tonal->count==0)
679 tonal->highE[b] = tonal->lowE[b] = logE[b];
680 if (tonal->highE[b] > tonal->lowE[b] + 7.5)
681 {
682 if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b])
683 tonal->highE[b] -= .01f;
684 else
685 tonal->lowE[b] += .01f;
686 }
687 if (logE[b] > tonal->highE[b])
688 {
689 tonal->highE[b] = logE[b];
690 tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]);
691 } else if (logE[b] < tonal->lowE[b])
692 {
693 tonal->lowE[b] = logE[b];
694 tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]);
695 }
696 relativeE += (logE[b]-tonal->lowE[b])/(1e-5f + (tonal->highE[b]-tonal->lowE[b]));
697
698 L1=L2=0;
699 for (i=0;i<NB_FRAMES;i++)
700 {
701 L1 += (float)sqrt(tonal->E[i][b]);
702 L2 += tonal->E[i][b];
703 }
704
705 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
706 stationarity *= stationarity;
707 stationarity *= stationarity;
708 frame_stationarity += stationarity;
709 /*band_tonality[b] = tE/(1e-15+E)*/;
710 band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
711 #if 0
712 if (b>=NB_TONAL_SKIP_BANDS)
713 {
714 frame_tonality += tweight[b]*band_tonality[b];
715 tw_sum += tweight[b];
716 }
717 #else
718 frame_tonality += band_tonality[b];
719 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
720 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
721 #endif
722 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
723 slope += band_tonality[b]*(b-8);
724 /*printf("%f %f ", band_tonality[b], stationarity);*/
725 tonal->prev_band_tonality[b] = band_tonality[b];
726 }
727
728 leakage_from[0] = band_log2[0];
729 leakage_to[0] = band_log2[0] - LEAKAGE_OFFSET;
730 for (b=1;b<NB_TBANDS+1;b++)
731 {
732 float leak_slope = LEAKAGE_SLOPE*(tbands[b]-tbands[b-1])/4;
733 leakage_from[b] = MIN16(leakage_from[b-1]+leak_slope, band_log2[b]);
734 leakage_to[b] = MAX16(leakage_to[b-1]-leak_slope, band_log2[b]-LEAKAGE_OFFSET);
735 }
736 for (b=NB_TBANDS-2;b>=0;b--)
737 {
738 float leak_slope = LEAKAGE_SLOPE*(tbands[b+1]-tbands[b])/4;
739 leakage_from[b] = MIN16(leakage_from[b+1]+leak_slope, leakage_from[b]);
740 leakage_to[b] = MAX16(leakage_to[b+1]-leak_slope, leakage_to[b]);
741 }
742 celt_assert(NB_TBANDS+1 <= LEAK_BANDS);
743 for (b=0;b<NB_TBANDS+1;b++)
744 {
745 /* leak_boost[] is made up of two terms. The first, based on leakage_to[],
746 represents the boost needed to overcome the amount of analysis leakage
747 cause in a weaker band b by louder neighbouring bands.
748 The second, based on leakage_from[], applies to a loud band b for
749 which the quantization noise causes synthesis leakage to the weaker
750 neighbouring bands. */
751 float boost = MAX16(0, leakage_to[b] - band_log2[b]) +
752 MAX16(0, band_log2[b] - (leakage_from[b]+LEAKAGE_OFFSET));
753 info->leak_boost[b] = IMIN(255, (int)floor(.5 + 64.f*boost));
754 }
755 for (;b<LEAK_BANDS;b++) info->leak_boost[b] = 0;
756
757 for (i=0;i<NB_FRAMES;i++)
758 {
759 int j;
760 float mindist = 1e15f;
761 for (j=0;j<NB_FRAMES;j++)
762 {
763 int k;
764 float dist=0;
765 for (k=0;k<NB_TBANDS;k++)
766 {
767 float tmp;
768 tmp = tonal->logE[i][k] - tonal->logE[j][k];
769 dist += tmp*tmp;
770 }
771 if (j!=i)
772 mindist = MIN32(mindist, dist);
773 }
774 spec_variability += mindist;
775 }
776 spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS);
777 bandwidth_mask = 0;
778 bandwidth = 0;
779 maxE = 0;
780 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
781 noise_floor *= noise_floor;
782 below_max_pitch=0;
783 above_max_pitch=0;
784 for (b=0;b<NB_TBANDS;b++)
785 {
786 float E=0;
787 float Em;
788 int band_start, band_end;
789 /* Keep a margin of 300 Hz for aliasing */
790 band_start = tbands[b];
791 band_end = tbands[b+1];
792 for (i=band_start;i<band_end;i++)
793 {
794 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
795 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
796 E += binE;
797 }
798 E = SCALE_ENER(E);
799 maxE = MAX32(maxE, E);
800 if (band_start < 64)
801 {
802 below_max_pitch += E;
803 } else {
804 above_max_pitch += E;
805 }
806 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
807 Em = MAX32(E, tonal->meanE[b]);
808 /* Consider the band "active" only if all these conditions are met:
809 1) less than 90 dB below the peak band (maximal masking possible considering
810 both the ATH and the loudness-dependent slope of the spreading function)
811 2) above the PCM quantization noise floor
812 We use b+1 because the first CELT band isn't included in tbands[]
813 */
814 if (E*1e9f > maxE && (Em > 3*noise_floor*(band_end-band_start) || E > noise_floor*(band_end-band_start)))
815 bandwidth = b+1;
816 /* Check if the band is masked (see below). */
817 is_masked[b] = E < (tonal->prev_bandwidth >= b+1 ? .01f : .05f)*bandwidth_mask;
818 /* Use a simple follower with 13 dB/Bark slope for spreading function. */
819 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
820 }
821 /* Special case for the last two bands, for which we don't have spectrum but only
822 the energy above 12 kHz. The difficulty here is that the high-pass we use
823 leaks some LF energy, so we need to increase the threshold without accidentally cutting
824 off the band. */
825 if (tonal->Fs == 48000) {
826 float noise_ratio;
827 float Em;
828 float E = hp_ener*(1.f/(60*60));
829 noise_ratio = tonal->prev_bandwidth==20 ? 10.f : 30.f;
830
831 #ifdef FIXED_POINT
832 /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */
833 E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE);
834 #endif
835 above_max_pitch += E;
836 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
837 Em = MAX32(E, tonal->meanE[b]);
838 if (Em > 3*noise_ratio*noise_floor*160 || E > noise_ratio*noise_floor*160)
839 bandwidth = 20;
840 /* Check if the band is masked (see below). */
841 is_masked[b] = E < (tonal->prev_bandwidth == 20 ? .01f : .05f)*bandwidth_mask;
842 }
843 if (above_max_pitch > below_max_pitch)
844 info->max_pitch_ratio = below_max_pitch/above_max_pitch;
845 else
846 info->max_pitch_ratio = 1;
847 /* In some cases, resampling aliasing can create a small amount of energy in the first band
848 being cut. So if the last band is masked, we don't include it. */
849 if (bandwidth == 20 && is_masked[NB_TBANDS])
850 bandwidth-=2;
851 else if (bandwidth > 0 && bandwidth <= NB_TBANDS && is_masked[bandwidth-1])
852 bandwidth--;
853 if (tonal->count<=2)
854 bandwidth = 20;
855 frame_loudness = 20*(float)log10(frame_loudness);
856 tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness);
857 tonal->lowECount *= (1-alphaE);
858 if (frame_loudness < tonal->Etracker-30)
859 tonal->lowECount += alphaE;
860
861 for (i=0;i<8;i++)
862 {
863 float sum=0;
864 for (b=0;b<16;b++)
865 sum += dct_table[i*16+b]*logE[b];
866 BFCC[i] = sum;
867 }
868 for (i=0;i<8;i++)
869 {
870 float sum=0;
871 for (b=0;b<16;b++)
872 sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]);
873 midE[i] = sum;
874 }
875
876 frame_stationarity /= NB_TBANDS;
877 relativeE /= NB_TBANDS;
878 if (tonal->count<10)
879 relativeE = .5f;
880 frame_noisiness /= NB_TBANDS;
881 #if 1
882 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
883 #else
884 info->activity = .5*(1+frame_noisiness-frame_stationarity);
885 #endif
886 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
887 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
888 tonal->prev_tonality = frame_tonality;
889
890 slope /= 8*8;
891 info->tonality_slope = slope;
892
893 tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
894 tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX);
895 info->tonality = frame_tonality;
896
897 for (i=0;i<4;i++)
898 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
899
900 for (i=0;i<4;i++)
901 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
902
903 for (i=0;i<4;i++)
904 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
905 for (i=0;i<3;i++)
906 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
907
908 if (tonal->count > 5)
909 {
910 for (i=0;i<9;i++)
911 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
912 }
913 for (i=0;i<4;i++)
914 features[i] = BFCC[i]-midE[i];
915
916 for (i=0;i<8;i++)
917 {
918 tonal->mem[i+24] = tonal->mem[i+16];
919 tonal->mem[i+16] = tonal->mem[i+8];
920 tonal->mem[i+8] = tonal->mem[i];
921 tonal->mem[i] = BFCC[i];
922 }
923 for (i=0;i<9;i++)
924 features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
925 features[18] = spec_variability - 0.78f;
926 features[20] = info->tonality - 0.154723f;
927 features[21] = info->activity - 0.724643f;
928 features[22] = frame_stationarity - 0.743717f;
929 features[23] = info->tonality_slope + 0.069216f;
930 features[24] = tonal->lowECount - 0.067930f;
931
932 compute_dense(&layer0, layer_out, features);
933 compute_gru(&layer1, tonal->rnn_state, layer_out);
934 compute_dense(&layer2, frame_probs, tonal->rnn_state);
935
936 /* Probability of speech or music vs noise */
937 info->activity_probability = frame_probs[1];
938 info->music_prob = frame_probs[0];
939
940 /*printf("%f %f %f\n", frame_probs[0], frame_probs[1], info->music_prob);*/
941 #ifdef MLP_TRAINING
942 for (i=0;i<25;i++)
943 printf("%f ", features[i]);
944 printf("\n");
945 #endif
946
947 info->bandwidth = bandwidth;
948 tonal->prev_bandwidth = bandwidth;
949 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
950 info->noisiness = frame_noisiness;
951 info->valid = 1;
952 RESTORE_STACK;
953 }
954
run_analysis(TonalityAnalysisState * analysis,const CELTMode * celt_mode,const void * analysis_pcm,int analysis_frame_size,int frame_size,int c1,int c2,int C,opus_int32 Fs,int lsb_depth,downmix_func downmix,AnalysisInfo * analysis_info)955 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
956 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
957 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
958 {
959 int offset;
960 int pcm_len;
961
962 analysis_frame_size -= analysis_frame_size&1;
963 if (analysis_pcm != NULL)
964 {
965 /* Avoid overflow/wrap-around of the analysis buffer */
966 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size);
967
968 pcm_len = analysis_frame_size - analysis->analysis_offset;
969 offset = analysis->analysis_offset;
970 while (pcm_len>0) {
971 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
972 offset += Fs/50;
973 pcm_len -= Fs/50;
974 }
975 analysis->analysis_offset = analysis_frame_size;
976
977 analysis->analysis_offset -= frame_size;
978 }
979
980 tonality_get_info(analysis, analysis_info, frame_size);
981 }
982
983 #endif /* DISABLE_FLOAT_API */
984