/* Copyright (c) 2014, Cisco Systems, INC
   Written by XiangMingZhu WeiZhou MinPeng YanWang

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

#include "SigProc_FIX.h"
#include "define.h"
#include "tuning_parameters.h"
#include "pitch.h"
#include "celt/x86/x86cpu.h"

#define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */

#define QA 25
#define N_BITS_HEAD_ROOM 2
#define MIN_RSHIFTS -16
#define MAX_RSHIFTS (32 - QA)
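/* MIN_RSHIFTS and MAX_RSHIFTS bound the autocorrelation scaling so that C0 keeps N_BITS_HEAD_ROOM bits of headroom in 32-bit arithmetic. */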

/* Compute reflection coefficients from input signal */
void silk_burg_modified_sse4_1(
    opus_int32       *res_nrg,         /* O    Residual energy                                            */
    opus_int         *res_nrg_Q,       /* O    Residual energy Q value                                    */
    opus_int32       A_Q16[],          /* O    Prediction coefficients (length order)                     */
    const opus_int16 x[],              /* I    Input signal, length: nb_subfr * ( D + subfr_length )     */
    const opus_int32 minInvGain_Q30,   /* I    Inverse of max prediction gain                             */
    const opus_int   subfr_length,     /* I    Input signal subframe length (incl. D preceding samples)   */
    const opus_int   nb_subfr,         /* I    Number of subframes stacked in x                           */
    const opus_int   D,                /* I    Order                                                      */
    int              arch              /* I    Run-time architecture                                      */
)
{
    opus_int k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
    opus_int32 C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
    const opus_int16 *x_ptr;
    opus_int32 C_first_row[ SILK_MAX_ORDER_LPC ];
    opus_int32 C_last_row[ SILK_MAX_ORDER_LPC ];
    opus_int32 Af_QA[ SILK_MAX_ORDER_LPC ];
    opus_int32 CAf[ SILK_MAX_ORDER_LPC + 1 ];
    opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ];
    opus_int32 xcorr[ SILK_MAX_ORDER_LPC ];

    __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
    __m128i CONST1 = _mm_set1_epi32( 1 );
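    /* Vectors suffixed _3210 hold four 32-bit lanes (lane 3 .. lane 0); CONST1 provides the +1 for rounding shifts. */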

    celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );

    /* Compute autocorrelations, added over subframes */
    silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length );
    if( rshifts > MAX_RSHIFTS ) {
        C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS );
        silk_assert( C0 > 0 );
        rshifts = MAX_RSHIFTS;
    } else {
        lz = silk_CLZ32( C0 ) - 1;
        rshifts_extra = N_BITS_HEAD_ROOM - lz;
        if( rshifts_extra > 0 ) {
            rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts );
            C0 = silk_RSHIFT32( C0, rshifts_extra );
        } else {
            rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts );
            C0 = silk_LSHIFT32( C0, -rshifts_extra );
        }
        rshifts += rshifts_extra;
    }
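    /* Condition the correlation matrix by adding a small fraction of C0 (FIND_LPC_COND_FAC) plus one to the zero-lag term. */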
    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */
    silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
    if( rshifts > 0 ) {
        for( s = 0; s < nb_subfr; s++ ) {
            x_ptr = x + s * subfr_length;
            for( n = 1; n < D + 1; n++ ) {
                C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
                    silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
            }
        }
    } else {
        for( s = 0; s < nb_subfr; s++ ) {
            int i;
            opus_int32 d;
            x_ptr = x + s * subfr_length;
            celt_pitch_xcorr( x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch );
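            /* celt_pitch_xcorr() covers only the first subfr_length - D samples; add the remaining tail products for each lag. */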
            for( n = 1; n < D + 1; n++ ) {
                for( i = n + subfr_length - D, d = 0; i < subfr_length; i++ )
                    d = MAC16_16( d, x_ptr[ i ], x_ptr[ i - n ] );
                xcorr[ n - 1 ] += d;
            }
            for( n = 1; n < D + 1; n++ ) {
                C_first_row[ n - 1 ] += silk_LSHIFT32( xcorr[ n - 1 ], -rshifts );
            }
        }
    }
    silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );

    /* Initialize */
    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */

    invGain_Q30 = (opus_int32)1 << 30;
    reached_max_gain = 0;
    for( n = 0; n < D; n++ ) {
        /* Update first row of correlation matrix (without first element) */
        /* Update last row of correlation matrix (without last element, stored in reversed order) */
        /* Update C * Af */
        /* Update C * flipud(Af) (stored in reversed order) */
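        /* Two paths: when rshifts > -2 the shifted samples still fit 16-bit SMLAWB updates; more negative shifts take the 32-bit path, vectorized with SSE4.1 below. */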
        if( rshifts > -2 ) {
            for( s = 0; s < nb_subfr; s++ ) {
                x_ptr = x + s * subfr_length;
                x1 = -silk_LSHIFT32( (opus_int32)x_ptr[ n ], 16 - rshifts ); /* Q(16-rshifts) */
                x2 = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts ); /* Q(16-rshifts) */
                tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ], QA - 16 ); /* Q(QA-16) */
                tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16 ); /* Q(QA-16) */
                for( k = 0; k < n; k++ ) {
                    C_first_row[ k ] = silk_SMLAWB( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
                    C_last_row[ k ] = silk_SMLAWB( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
                    Atmp_QA = Af_QA[ k ];
                    tmp1 = silk_SMLAWB( tmp1, Atmp_QA, x_ptr[ n - k - 1 ] ); /* Q(QA-16) */
                    tmp2 = silk_SMLAWB( tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ] ); /* Q(QA-16) */
                }
                tmp1 = silk_LSHIFT32( -tmp1, 32 - QA - rshifts ); /* Q(16-rshifts) */
                tmp2 = silk_LSHIFT32( -tmp2, 32 - QA - rshifts ); /* Q(16-rshifts) */
                for( k = 0; k <= n; k++ ) {
                    CAf[ k ] = silk_SMLAWB( CAf[ k ], tmp1, x_ptr[ n - k ] ); /* Q( -rshifts ) */
                    CAb[ k ] = silk_SMLAWB( CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ] ); /* Q( -rshifts ) */
                }
            }
        } else {
            for( s = 0; s < nb_subfr; s++ ) {
                x_ptr = x + s * subfr_length;
                x1 = -silk_LSHIFT32( (opus_int32)x_ptr[ n ], -rshifts ); /* Q( -rshifts ) */
                x2 = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts ); /* Q( -rshifts ) */
                tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ], 17 ); /* Q17 */
                tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 17 ); /* Q17 */

                X1_3210 = _mm_set1_epi32( x1 );
                X2_3210 = _mm_set1_epi32( x2 );
                TMP1_3210 = _mm_setzero_si128();
                TMP2_3210 = _mm_setzero_si128();
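                /* Vectorized update, four lags per iteration; the shuffle reverses lane order so the lanes line up with x_ptr[ n - k - 1 ] .. x_ptr[ n - k - 4 ]. */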
                for( k = 0; k < n - 3; k += 4 ) {
                    PTR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 1 - 3 ] );
                    SUBFR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k ] );
                    FIRST_3210 = _mm_loadu_si128( (__m128i *)&C_first_row[ k ] );
                    PTR_3210 = _mm_shuffle_epi32( PTR_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
                    LAST_3210 = _mm_loadu_si128( (__m128i *)&C_last_row[ k ] );
                    ATMP_3210 = _mm_loadu_si128( (__m128i *)&Af_QA[ k ] );

                    T1_3210 = _mm_mullo_epi32( PTR_3210, X1_3210 );
                    T2_3210 = _mm_mullo_epi32( SUBFR_3210, X2_3210 );

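                    /* Rounding shift of Af_QA from QA (Q25) down to Q17: ( ( a >> 7 ) + 1 ) >> 1 equals silk_RSHIFT_ROUND( a, 8 ). */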
                    ATMP_3210 = _mm_srai_epi32( ATMP_3210, 7 );
                    ATMP_3210 = _mm_add_epi32( ATMP_3210, CONST1 );
                    ATMP_3210 = _mm_srai_epi32( ATMP_3210, 1 );

                    FIRST_3210 = _mm_add_epi32( FIRST_3210, T1_3210 );
                    LAST_3210 = _mm_add_epi32( LAST_3210, T2_3210 );

                    PTR_3210 = _mm_mullo_epi32( ATMP_3210, PTR_3210 );
                    SUBFR_3210 = _mm_mullo_epi32( ATMP_3210, SUBFR_3210 );

                    _mm_storeu_si128( (__m128i *)&C_first_row[ k ], FIRST_3210 );
                    _mm_storeu_si128( (__m128i *)&C_last_row[ k ], LAST_3210 );

                    TMP1_3210 = _mm_add_epi32( TMP1_3210, PTR_3210 );
                    TMP2_3210 = _mm_add_epi32( TMP2_3210, SUBFR_3210 );
                }

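                /* Horizontal sum of the four accumulator lanes: fold the upper 64 bits onto the lower, then lane 1 onto lane 0. */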
                TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_unpackhi_epi64( TMP1_3210, TMP1_3210 ) );
                TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_unpackhi_epi64( TMP2_3210, TMP2_3210 ) );
                TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_shufflelo_epi16( TMP1_3210, 0x0E ) );
                TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_shufflelo_epi16( TMP2_3210, 0x0E ) );

                tmp1 += _mm_cvtsi128_si32( TMP1_3210 );
                tmp2 += _mm_cvtsi128_si32( TMP2_3210 );

                for( ; k < n; k++ ) {
                    C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */
                    C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
                    Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */
                    tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */
                    tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */
                }

                tmp1 = -tmp1; /* Q17 */
                tmp2 = -tmp2; /* Q17 */

                {
                    __m128i xmm_tmp1, xmm_tmp2;
                    __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1;
                    __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1;

                    xmm_tmp1 = _mm_set1_epi32( tmp1 );
                    xmm_tmp2 = _mm_set1_epi32( tmp2 );

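                    /* Vectorized silk_SMLAWW: _mm_mul_epi32() forms signed 64-bit products from the even lanes; the shifts and blend below keep bits 16..47 of each product. */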
                    for( k = 0; k <= n - 3; k += 4 ) {
                        xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 3 ] );
                        xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k - 1 ] );

                        xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 1, 2, 3 ) );

                        xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32( xmm_x_ptr_n_k_x2x0, -rshifts - 1 );
                        xmm_x_ptr_sub_x2x0 = _mm_slli_epi32( xmm_x_ptr_sub_x2x0, -rshifts - 1 );

                        /* equal shift right 4 bytes, xmm_x_ptr_n_k_x3x1 = _mm_srli_si128( xmm_x_ptr_n_k_x2x0, 4 ) */
                        xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
                        xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_sub_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

                        xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32( xmm_x_ptr_n_k_x2x0, xmm_tmp1 );
                        xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32( xmm_x_ptr_n_k_x3x1, xmm_tmp1 );
                        xmm_x_ptr_sub_x2x0 = _mm_mul_epi32( xmm_x_ptr_sub_x2x0, xmm_tmp2 );
                        xmm_x_ptr_sub_x3x1 = _mm_mul_epi32( xmm_x_ptr_sub_x3x1, xmm_tmp2 );

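                        /* Take product >> 16 in each 64-bit lane, then blend the even (x2x0) and odd (x3x1) results back into one vector of four 32-bit lanes. */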
                        xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64( xmm_x_ptr_n_k_x2x0, 16 );
                        xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64( xmm_x_ptr_n_k_x3x1, 16 );
                        xmm_x_ptr_sub_x2x0 = _mm_srli_epi64( xmm_x_ptr_sub_x2x0, 16 );
                        xmm_x_ptr_sub_x3x1 = _mm_slli_epi64( xmm_x_ptr_sub_x3x1, 16 );

                        xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16( xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC );
                        xmm_x_ptr_sub_x2x0 = _mm_blend_epi16( xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC );

                        X1_3210 = _mm_loadu_si128( (__m128i *)&CAf[ k ] );
                        PTR_3210 = _mm_loadu_si128( (__m128i *)&CAb[ k ] );

                        X1_3210 = _mm_add_epi32( X1_3210, xmm_x_ptr_n_k_x2x0 );
                        PTR_3210 = _mm_add_epi32( PTR_3210, xmm_x_ptr_sub_x2x0 );

                        _mm_storeu_si128( (__m128i *)&CAf[ k ], X1_3210 );
                        _mm_storeu_si128( (__m128i *)&CAb[ k ], PTR_3210 );
                    }

                    for( ; k <= n; k++ ) {
                        CAf[ k ] = silk_SMLAWW( CAf[ k ], tmp1,
                            silk_LSHIFT32( (opus_int32)x_ptr[ n - k ], -rshifts - 1 ) ); /* Q( -rshifts ) */
                        CAb[ k ] = silk_SMLAWW( CAb[ k ], tmp2,
                            silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1 ) ); /* Q( -rshifts ) */
                    }
                }
            }
        }

        /* Calculate numerator and denominator for the next order reflection (parcor) coefficient */
        tmp1 = C_first_row[ n ]; /* Q( -rshifts ) */
        tmp2 = C_last_row[ n ]; /* Q( -rshifts ) */
        num = 0; /* Q( -rshifts ) */
        nrg = silk_ADD32( CAb[ 0 ], CAf[ 0 ] ); /* Q( 1-rshifts ) */
        for( k = 0; k < n; k++ ) {
            Atmp_QA = Af_QA[ k ];
            lz = silk_CLZ32( silk_abs( Atmp_QA ) ) - 1;
            lz = silk_min( 32 - QA, lz );
            Atmp1 = silk_LSHIFT32( Atmp_QA, lz ); /* Q( QA + lz ) */

            tmp1 = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( C_last_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */
            tmp2 = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( C_first_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */
            num = silk_ADD_LSHIFT32( num, silk_SMMUL( CAb[ n - k ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */
            nrg = silk_ADD_LSHIFT32( nrg, silk_SMMUL( silk_ADD32( CAb[ k + 1 ], CAf[ k + 1 ] ),
                Atmp1 ), 32 - QA - lz ); /* Q( 1-rshifts ) */
        }
        CAf[ n + 1 ] = tmp1; /* Q( -rshifts ) */
        CAb[ n + 1 ] = tmp2; /* Q( -rshifts ) */
        num = silk_ADD32( num, tmp2 ); /* Q( -rshifts ) */
        num = silk_LSHIFT32( -num, 1 ); /* Q( 1-rshifts ) */

        /* Calculate the next order reflection (parcor) coefficient */
        if( silk_abs( num ) < nrg ) {
            rc_Q31 = silk_DIV32_varQ( num, nrg, 31 );
        } else {
            rc_Q31 = ( num > 0 ) ? silk_int32_MAX : silk_int32_MIN;
        }

        /* Update inverse prediction gain */
        tmp1 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 );
        tmp1 = silk_LSHIFT( silk_SMMUL( invGain_Q30, tmp1 ), 2 );
        if( tmp1 <= minInvGain_Q30 ) {
            /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */
            tmp2 = ( (opus_int32)1 << 30 ) - silk_DIV32_varQ( minInvGain_Q30, invGain_Q30, 30 ); /* Q30 */
            rc_Q31 = silk_SQRT_APPROX( tmp2 ); /* Q15 */
            if( rc_Q31 > 0 ) {
                /* Newton-Raphson iteration */
                rc_Q31 = silk_RSHIFT32( rc_Q31 + silk_DIV32( tmp2, rc_Q31 ), 1 ); /* Q15 */
                rc_Q31 = silk_LSHIFT32( rc_Q31, 16 ); /* Q31 */
                if( num < 0 ) {
                    /* Ensure the adjusted reflection coefficient has the original sign */
                    rc_Q31 = -rc_Q31;
                }
            }
            invGain_Q30 = minInvGain_Q30;
            reached_max_gain = 1;
        } else {
            invGain_Q30 = tmp1;
        }

        /* Update the AR coefficients */
        for( k = 0; k < ( n + 1 ) >> 1; k++ ) {
            tmp1 = Af_QA[ k ]; /* QA */
            tmp2 = Af_QA[ n - k - 1 ]; /* QA */
            Af_QA[ k ] = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 ); /* QA */
            Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 ); /* QA */
        }
        Af_QA[ n ] = silk_RSHIFT32( rc_Q31, 31 - QA ); /* QA */

        if( reached_max_gain ) {
            /* Reached max prediction gain; set remaining coefficients to zero and exit loop */
            for( k = n + 1; k < D; k++ ) {
                Af_QA[ k ] = 0;
            }
            break;
        }

        /* Update C * Af and C * Ab */
        for( k = 0; k <= n + 1; k++ ) {
            tmp1 = CAf[ k ]; /* Q( -rshifts ) */
            tmp2 = CAb[ n - k + 1 ]; /* Q( -rshifts ) */
            CAf[ k ] = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 ); /* Q( -rshifts ) */
            CAb[ n - k + 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 ); /* Q( -rshifts ) */
        }
    }

    if( reached_max_gain ) {
        for( k = 0; k < D; k++ ) {
            /* Scale coefficients */
            A_Q16[ k ] = -silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 );
        }
        /* Subtract energy of preceding samples from C0 */
        if( rshifts > 0 ) {
            for( s = 0; s < nb_subfr; s++ ) {
                x_ptr = x + s * subfr_length;
                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts );
            }
        } else {
            for( s = 0; s < nb_subfr; s++ ) {
                x_ptr = x + s * subfr_length;
                C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D, arch ), -rshifts );
            }
        }
        /* Approximate residual energy */
        *res_nrg = silk_LSHIFT( silk_SMMUL( invGain_Q30, C0 ), 2 );
        *res_nrg_Q = -rshifts;
    } else {
        /* Return residual energy */
        nrg = CAf[ 0 ]; /* Q( -rshifts ) */
        tmp1 = (opus_int32)1 << 16; /* Q16 */
        for( k = 0; k < D; k++ ) {
            Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 ); /* Q16 */
            nrg = silk_SMLAWW( nrg, CAf[ k + 1 ], Atmp1 ); /* Q( -rshifts ) */
            tmp1 = silk_SMLAWW( tmp1, Atmp1, Atmp1 ); /* Q16 */
            A_Q16[ k ] = -Atmp1;
        }
        *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 ); /* Q( -rshifts ) */
        *res_nrg_Q = -rshifts;
    }
}