• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright (c) 2014-2020, Cisco Systems, INC
2    Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
3 
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7 
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10 
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14 
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27 
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31 
32 #include <xmmintrin.h>
33 #include <emmintrin.h>
34 #include <smmintrin.h>
35 
36 #include "main.h"
37 #include "stack_alloc.h"
38 
39 /* Weighting factors for tilt measure */
40 static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 };
41 
42 /***************************************/
43 /* Get the speech activity level in Q8 */
44 /***************************************/
silk_VAD_GetSA_Q8_sse4_1(silk_encoder_state * psEncC,const opus_int16 pIn[])45 opus_int silk_VAD_GetSA_Q8_sse4_1(                  /* O    Return value, 0 if success                  */
46     silk_encoder_state          *psEncC,            /* I/O  Encoder state                               */
47     const opus_int16            pIn[]               /* I    PCM input                                   */
48 )
49 {
50     opus_int   SA_Q15, pSNR_dB_Q7, input_tilt;
51     opus_int   decimated_framelength1, decimated_framelength2;
52     opus_int   decimated_framelength;
53     opus_int   dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
54     opus_int32 sumSquared, smooth_coef_Q16;
55     opus_int16 HPstateTmp;
56     VARDECL( opus_int16, X );
57     opus_int32 Xnrg[ VAD_N_BANDS ];
58     opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
59     opus_int32 speech_nrg, x_tmp;
60     opus_int   X_offset[ VAD_N_BANDS ];
61     opus_int   ret = 0;
62     silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
63 
64     SAVE_STACK;
65 
66 #ifdef OPUS_CHECK_ASM
67     silk_encoder_state psEncC_c;
68     opus_int ret_c;
69 
70     silk_memcpy( &psEncC_c, psEncC, sizeof( psEncC_c ) );
71     ret_c = silk_VAD_GetSA_Q8_c( &psEncC_c, pIn );
72 #endif
73 
74     /* Safety checks */
75     silk_assert( VAD_N_BANDS == 4 );
76     celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
77     celt_assert( psEncC->frame_length <= 512 );
78     celt_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
79 
80     /***********************/
81     /* Filter and Decimate */
82     /***********************/
83     decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
84     decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
85     decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
86     /* Decimate into 4 bands:
87        0       L      3L       L              3L                             5L
88                -      --       -              --                             --
89                8       8       2               4                              4
90 
91        [0-1 kHz| temp. |1-2 kHz|    2-4 kHz    |            4-8 kHz           |
92 
93        They're arranged to allow the minimal ( frame_length / 4 ) extra
94        scratch space during the downsampling process */
95     X_offset[ 0 ] = 0;
96     X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
97     X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
98     X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
99     ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
100 
101     /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
102     silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[  0 ],
103         X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
104 
105     /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
106     silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
107         X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
108 
109     /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
110     silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
111         X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
112 
113     /*********************************************/
114     /* HP filter on lowest band (differentiator) */
115     /*********************************************/
116     X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
117     HPstateTmp = X[ decimated_framelength - 1 ];
118     for( i = decimated_framelength - 1; i > 0; i-- ) {
119         X[ i - 1 ]  = silk_RSHIFT( X[ i - 1 ], 1 );
120         X[ i ]     -= X[ i - 1 ];
121     }
122     X[ 0 ] -= psSilk_VAD->HPstate;
123     psSilk_VAD->HPstate = HPstateTmp;
124 
125     /*************************************/
126     /* Calculate the energy in each band */
127     /*************************************/
128     for( b = 0; b < VAD_N_BANDS; b++ ) {
129         /* Find the decimated framelength in the non-uniformly divided bands */
130         decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
131 
132         /* Split length into subframe lengths */
133         dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
134         dec_subframe_offset = 0;
135 
136         /* Compute energy per sub-frame */
137         /* initialize with summed energy of last subframe */
138         Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
139         for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
140             __m128i xmm_X, xmm_acc;
141             sumSquared = 0;
142 
143             xmm_acc = _mm_setzero_si128();
144 
145             for( i = 0; i < dec_subframe_length - 7; i += 8 )
146             {
147                 xmm_X   = _mm_loadu_si128( (__m128i *)&(X[ X_offset[ b ] + i + dec_subframe_offset ] ) );
148                 xmm_X   = _mm_srai_epi16( xmm_X, 3 );
149                 xmm_X   = _mm_madd_epi16( xmm_X, xmm_X );
150                 xmm_acc = _mm_add_epi32( xmm_acc, xmm_X );
151             }
152 
153             xmm_acc = _mm_add_epi32( xmm_acc, _mm_unpackhi_epi64( xmm_acc, xmm_acc ) );
154             xmm_acc = _mm_add_epi32( xmm_acc, _mm_shufflelo_epi16( xmm_acc, 0x0E ) );
155 
156             sumSquared += _mm_cvtsi128_si32( xmm_acc );
157 
158             for( ; i < dec_subframe_length; i++ ) {
159                 /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2.            */
160                 /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128)  */
161                 x_tmp = silk_RSHIFT(
162                     X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
163                 sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
164 
165                 /* Safety check */
166                 silk_assert( sumSquared >= 0 );
167             }
168 
169             /* Add/saturate summed energy of current subframe */
170             if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
171                 Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
172             } else {
173                 /* Look-ahead subframe */
174                 Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
175             }
176 
177             dec_subframe_offset += dec_subframe_length;
178         }
179         psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
180     }
181 
182     /********************/
183     /* Noise estimation */
184     /********************/
185     silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
186 
187     /***********************************************/
188     /* Signal-plus-noise to noise ratio estimation */
189     /***********************************************/
190     sumSquared = 0;
191     input_tilt = 0;
192     for( b = 0; b < VAD_N_BANDS; b++ ) {
193         speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
194         if( speech_nrg > 0 ) {
195             /* Divide, with sufficient resolution */
196             if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
197                 NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
198             } else {
199                 NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
200             }
201 
202             /* Convert to log domain */
203             SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
204 
205             /* Sum-of-squares */
206             sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 );          /* Q14 */
207 
208             /* Tilt measure */
209             if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
210                 /* Scale down SNR value for small subband speech energies */
211                 SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
212             }
213             input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
214         } else {
215             NrgToNoiseRatio_Q8[ b ] = 256;
216         }
217     }
218 
219     /* Mean-of-squares */
220     sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
221 
222     /* Root-mean-square approximation, scale to dBs, and write to output pointer */
223     pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
224 
225     /*********************************/
226     /* Speech Probability Estimation */
227     /*********************************/
228     SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
229 
230     /**************************/
231     /* Frequency Tilt Measure */
232     /**************************/
233     psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
234 
235     /**************************************************/
236     /* Scale the sigmoid output based on power levels */
237     /**************************************************/
238     speech_nrg = 0;
239     for( b = 0; b < VAD_N_BANDS; b++ ) {
240         /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
241         speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
242     }
243 
244     if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
245         speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
246     }
247     /* Power scaling */
248     if( speech_nrg <= 0 ) {
249         SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
250     } else if( speech_nrg < 16384 ) {
251         speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
252 
253         /* square-root */
254         speech_nrg = silk_SQRT_APPROX( speech_nrg );
255         SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
256     }
257 
258     /* Copy the resulting speech activity in Q8 */
259     psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
260 
261     /***********************************/
262     /* Energy Level and SNR estimation */
263     /***********************************/
264     /* Smoothing coefficient */
265     smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
266 
267     if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
268         smooth_coef_Q16 >>= 1;
269     }
270 
271     for( b = 0; b < VAD_N_BANDS; b++ ) {
272         /* compute smoothed energy-to-noise ratio per band */
273         psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
274             NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
275 
276         /* signal to noise ratio in dB per band */
277         SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
278         /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
279         psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
280     }
281 
282 #ifdef OPUS_CHECK_ASM
283     silk_assert( ret == ret_c );
284     silk_assert( !memcmp( &psEncC_c, psEncC, sizeof( psEncC_c ) ) );
285 #endif
286 
287     RESTORE_STACK;
288     return( ret );
289 }
290