/* Copyright (c) 2014-2020, Cisco Systems, INC
   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

#include "SigProc_FIX.h"
#include "define.h"
#include "tuning_parameters.h"
#include "pitch.h"
#include "celt/x86/x86cpu.h"

#define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */

#define QA 25
#define N_BITS_HEAD_ROOM 3
#define MIN_RSHIFTS -16
#define MAX_RSHIFTS (32 - QA)

/* Compute reflection coefficients from input signal */
void silk_burg_modified_sse4_1(
    opus_int32          *res_nrg,           /* O    Residual energy                                             */
    opus_int            *res_nrg_Q,         /* O    Residual energy Q value                                     */
    opus_int32          A_Q16[],            /* O    Prediction coefficients (length order)                      */
    const opus_int16    x[],                /* I    Input signal, length: nb_subfr * ( D + subfr_length )      */
    const opus_int32    minInvGain_Q30,     /* I    Inverse of max prediction gain                              */
    const opus_int      subfr_length,       /* I    Input signal subframe length (incl. D preceding samples)    */
    const opus_int      nb_subfr,           /* I    Number of subframes stacked in x                            */
    const opus_int      D,                  /* I    Order                                                       */
    int                 arch                /* I    Run-time architecture                                       */
)
{
    opus_int         k, n, s, lz, rshifts, reached_max_gain;
    opus_int32       C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
    const opus_int16 *x_ptr;
    opus_int32       C_first_row[ SILK_MAX_ORDER_LPC ];
    opus_int32       C_last_row[ SILK_MAX_ORDER_LPC ];
    opus_int32       Af_QA[ SILK_MAX_ORDER_LPC ];
    opus_int32       CAf[ SILK_MAX_ORDER_LPC + 1 ];
    opus_int32       CAb[ SILK_MAX_ORDER_LPC + 1 ];
    opus_int32       xcorr[ SILK_MAX_ORDER_LPC ];
    opus_int64       C0_64;

    __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
    __m128i CONST1 = _mm_set1_epi32(1);

    celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );

    /* Compute autocorrelations, added over subframes */
    C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );
    lz = silk_CLZ64(C0_64);
    rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
    if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
    if (rshifts < MIN_RSHIFTS) rshifts = MIN_RSHIFTS;

    if (rshifts > 0) {
        C0 = (opus_int32)silk_RSHIFT64(C0_64, rshifts );
    } else {
        C0 = silk_LSHIFT32((opus_int32)C0_64, -rshifts );
    }

    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1;    /* Q(-rshifts) */
    silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
    if( rshifts > 0 ) {
        for( s = 0; s < nb_subfr; s++ ) {
            x_ptr = x + s * subfr_length;
            for( n = 1; n < D + 1; n++ ) {
                C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
                    silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
            }
        }
    } else {
        for( s = 0; s < nb_subfr; s++ ) {
            int i;
            opus_int32 d;
            x_ptr = x + s * subfr_length;
            celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch );
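            /* celt_pitch_xcorr() above used length subfr_length - D; add the correlation terms it did not cover */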
            for( n = 1; n < D + 1; n++ ) {
                for ( i = n + subfr_length - D, d = 0; i < subfr_length; i++ )
                    d = MAC16_16( d, x_ptr[ i ], x_ptr[ i - n ] );
                xcorr[ n - 1 ] += d;
            }
            for( n = 1; n < D + 1; n++ ) {
                C_first_row[ n - 1 ] += silk_LSHIFT32( xcorr[ n - 1 ], -rshifts );
            }
        }
    }
    silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );

    /* Initialize */
    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1;    /* Q(-rshifts) */

    invGain_Q30 = (opus_int32)1 << 30;
    reached_max_gain = 0;
    for( n = 0; n < D; n++ ) {
        /* Update first row of correlation matrix (without first element) */
        /* Update last row of correlation matrix (without last element, stored in reversed order) */
        /* Update C * Af */
        /* Update C * flipud(Af) (stored in reversed order) */
        if( rshifts > -2 ) {
            for( s = 0; s < nb_subfr; s++ ) {
                x_ptr = x + s * subfr_length;
                x1 = -silk_LSHIFT32( (opus_int32)x_ptr[ n ], 16 - rshifts );                        /* Q(16-rshifts) */
                x2 = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts );     /* Q(16-rshifts) */
                tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ], QA - 16 );                            /* Q(QA-16) */
                tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16 );         /* Q(QA-16) */
                for( k = 0; k < n; k++ ) {
                    C_first_row[ k ] = silk_SMLAWB( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] );     /* Q( -rshifts ) */
                    C_last_row[ k ]  = silk_SMLAWB( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
                    Atmp_QA = Af_QA[ k ];
                    tmp1 = silk_SMLAWB( tmp1, Atmp_QA, x_ptr[ n - k - 1 ] );                        /* Q(QA-16) */
                    tmp2 = silk_SMLAWB( tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ] );             /* Q(QA-16) */
                }
                tmp1 = silk_LSHIFT32( -tmp1, 32 - QA - rshifts );                                   /* Q(16-rshifts) */
                tmp2 = silk_LSHIFT32( -tmp2, 32 - QA - rshifts );                                   /* Q(16-rshifts) */
                for( k = 0; k <= n; k++ ) {
                    CAf[ k ] = silk_SMLAWB( CAf[ k ], tmp1, x_ptr[ n - k ] );                       /* Q( -rshift ) */
                    CAb[ k ] = silk_SMLAWB( CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ] );    /* Q( -rshift ) */
                }
            }
        } else {
            for( s = 0; s < nb_subfr; s++ ) {
                x_ptr = x + s * subfr_length;
                x1 = -silk_LSHIFT32( (opus_int32)x_ptr[ n ], -rshifts );                            /* Q( -rshifts ) */
                x2 = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts );         /* Q( -rshifts ) */
                tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ], 17 );                                 /* Q17 */
                tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 17 );              /* Q17 */

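                /* Vectorized update: process four lags per iteration, accumulating C_first_row[]/C_last_row[]
                   and the tmp1/tmp2 running sums with 32-bit multiplies */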
                X1_3210 = _mm_set1_epi32( x1 );
                X2_3210 = _mm_set1_epi32( x2 );
                TMP1_3210 = _mm_setzero_si128();
                TMP2_3210 = _mm_setzero_si128();
                for( k = 0; k < n - 3; k += 4 ) {
                    PTR_3210   = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 1 - 3 ] );
                    SUBFR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k ] );
                    FIRST_3210 = _mm_loadu_si128( (__m128i *)&C_first_row[ k ] );
                    PTR_3210   = _mm_shuffle_epi32( PTR_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) );
                    LAST_3210  = _mm_loadu_si128( (__m128i *)&C_last_row[ k ] );
                    ATMP_3210  = _mm_loadu_si128( (__m128i *)&Af_QA[ k ] );

                    T1_3210 = _mm_mullo_epi32( PTR_3210, X1_3210 );
                    T2_3210 = _mm_mullo_epi32( SUBFR_3210, X2_3210 );

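                    /* Round Af_QA[] from Q(QA) down to Q17, same as silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ) in the tail loop below */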
                    ATMP_3210 = _mm_srai_epi32( ATMP_3210, 7 );
                    ATMP_3210 = _mm_add_epi32( ATMP_3210, CONST1 );
                    ATMP_3210 = _mm_srai_epi32( ATMP_3210, 1 );

                    FIRST_3210 = _mm_add_epi32( FIRST_3210, T1_3210 );
                    LAST_3210  = _mm_add_epi32( LAST_3210, T2_3210 );

                    PTR_3210   = _mm_mullo_epi32( ATMP_3210, PTR_3210 );
                    SUBFR_3210 = _mm_mullo_epi32( ATMP_3210, SUBFR_3210 );

                    _mm_storeu_si128( (__m128i *)&C_first_row[ k ], FIRST_3210 );
                    _mm_storeu_si128( (__m128i *)&C_last_row[ k ], LAST_3210 );

                    TMP1_3210 = _mm_add_epi32( TMP1_3210, PTR_3210 );
                    TMP2_3210 = _mm_add_epi32( TMP2_3210, SUBFR_3210 );
                }

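                /* Horizontal sum of the four 32-bit lanes of TMP1_3210/TMP2_3210 into the low lane */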
                TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210 ) );
                TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210 ) );
                TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E ) );
                TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E ) );

                tmp1 += _mm_cvtsi128_si32( TMP1_3210 );
                tmp2 += _mm_cvtsi128_si32( TMP2_3210 );

                for( ; k < n; k++ ) {
                    C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] );        /* Q( -rshifts ) */
                    C_last_row[ k ]  = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
                    Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 );                               /* Q17 */
                    /* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
                       but they cancel each other and the real result seems to always fit in a 32-bit
                       signed integer. This was determined experimentally, not theoretically (unfortunately). */
                    tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ], Atmp1 );                       /* Q17 */
                    tmp2 = silk_MLA_ovflw( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 );            /* Q17 */
                }

                tmp1 = -tmp1;    /* Q17 */
                tmp2 = -tmp2;    /* Q17 */

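                /* Vectorized update of CAf[]/CAb[]: each lane adds ( tmp1 or tmp2 * shifted x sample ) >> 16,
                   the same middle 32 bits of the 64-bit product that silk_SMLAWW() takes in the tail loop */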
                {
                    __m128i xmm_tmp1, xmm_tmp2;
                    __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1;
                    __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1;

                    xmm_tmp1 = _mm_set1_epi32( tmp1 );
                    xmm_tmp2 = _mm_set1_epi32( tmp2 );

                    for( k = 0; k <= n - 3; k += 4 ) {
                        xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 3 ] );
                        xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k - 1 ] );

                        xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 1, 2, 3 ) );

                        xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32( xmm_x_ptr_n_k_x2x0, -rshifts - 1 );
                        xmm_x_ptr_sub_x2x0 = _mm_slli_epi32( xmm_x_ptr_sub_x2x0, -rshifts - 1 );

                        /* equal shift right 4 bytes, xmm_x_ptr_n_k_x3x1 = _mm_srli_si128(xmm_x_ptr_n_k_x2x0, 4) */
                        xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
                        xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_sub_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

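                        /* 32x32 -> 64-bit signed multiplies of the even (x2x0) and odd (x3x1) lanes;
                           the shifts and blend below keep bits 16..47 of each product (the >> 16 of silk_SMLAWW) */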
                        xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32( xmm_x_ptr_n_k_x2x0, xmm_tmp1 );
                        xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32( xmm_x_ptr_n_k_x3x1, xmm_tmp1 );
                        xmm_x_ptr_sub_x2x0 = _mm_mul_epi32( xmm_x_ptr_sub_x2x0, xmm_tmp2 );
                        xmm_x_ptr_sub_x3x1 = _mm_mul_epi32( xmm_x_ptr_sub_x3x1, xmm_tmp2 );

                        xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64( xmm_x_ptr_n_k_x2x0, 16 );
                        xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64( xmm_x_ptr_n_k_x3x1, 16 );
                        xmm_x_ptr_sub_x2x0 = _mm_srli_epi64( xmm_x_ptr_sub_x2x0, 16 );
                        xmm_x_ptr_sub_x3x1 = _mm_slli_epi64( xmm_x_ptr_sub_x3x1, 16 );

                        xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16( xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC );
                        xmm_x_ptr_sub_x2x0 = _mm_blend_epi16( xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC );

                        X1_3210  = _mm_loadu_si128( (__m128i *)&CAf[ k ] );
                        PTR_3210 = _mm_loadu_si128( (__m128i *)&CAb[ k ] );

                        X1_3210  = _mm_add_epi32( X1_3210, xmm_x_ptr_n_k_x2x0 );
                        PTR_3210 = _mm_add_epi32( PTR_3210, xmm_x_ptr_sub_x2x0 );

                        _mm_storeu_si128( (__m128i *)&CAf[ k ], X1_3210 );
                        _mm_storeu_si128( (__m128i *)&CAb[ k ], PTR_3210 );
                    }

                    for( ; k <= n; k++ ) {
                        CAf[ k ] = silk_SMLAWW( CAf[ k ], tmp1,
                            silk_LSHIFT32( (opus_int32)x_ptr[ n - k ], -rshifts - 1 ) );            /* Q( -rshift ) */
                        CAb[ k ] = silk_SMLAWW( CAb[ k ], tmp2,
                            silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1 ) ); /* Q( -rshift ) */
                    }
                }
            }
        }

        /* Calculate numerator and denominator for the next order reflection (parcor) coefficient */
        tmp1 = C_first_row[ n ];                                                            /* Q( -rshifts ) */
        tmp2 = C_last_row[ n ];                                                             /* Q( -rshifts ) */
        num  = 0;                                                                           /* Q( -rshifts ) */
        nrg  = silk_ADD32( CAb[ 0 ], CAf[ 0 ] );                                            /* Q( 1-rshifts ) */
        for( k = 0; k < n; k++ ) {
            Atmp_QA = Af_QA[ k ];
            lz = silk_CLZ32( silk_abs( Atmp_QA ) ) - 1;
            lz = silk_min( 32 - QA, lz );
            Atmp1 = silk_LSHIFT32( Atmp_QA, lz );                                           /* Q( QA + lz ) */

            tmp1 = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( C_last_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz );  /* Q( -rshifts ) */
            tmp2 = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( C_first_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */
            num  = silk_ADD_LSHIFT32( num, silk_SMMUL( CAb[ n - k ], Atmp1 ), 32 - QA - lz );              /* Q( -rshifts ) */
            nrg  = silk_ADD_LSHIFT32( nrg, silk_SMMUL( silk_ADD32( CAb[ k + 1 ], CAf[ k + 1 ] ),
                Atmp1 ), 32 - QA - lz );                                                    /* Q( 1-rshifts ) */
        }
        CAf[ n + 1 ] = tmp1;                                                                /* Q( -rshifts ) */
        CAb[ n + 1 ] = tmp2;                                                                /* Q( -rshifts ) */
        num = silk_ADD32( num, tmp2 );                                                      /* Q( -rshifts ) */
        num = silk_LSHIFT32( -num, 1 );                                                     /* Q( 1-rshifts ) */

        /* Calculate the next order reflection (parcor) coefficient */
        if( silk_abs( num ) < nrg ) {
            rc_Q31 = silk_DIV32_varQ( num, nrg, 31 );
        } else {
            rc_Q31 = ( num > 0 ) ? silk_int32_MAX : silk_int32_MIN;
        }

        /* Update inverse prediction gain */
        tmp1 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 );
        tmp1 = silk_LSHIFT( silk_SMMUL( invGain_Q30, tmp1 ), 2 );
        if( tmp1 <= minInvGain_Q30 ) {
            /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */
            tmp2 = ( (opus_int32)1 << 30 ) - silk_DIV32_varQ( minInvGain_Q30, invGain_Q30, 30 ); /* Q30 */
            rc_Q31 = silk_SQRT_APPROX( tmp2 );                                              /* Q15 */
            if( rc_Q31 > 0 ) {
                /* Newton-Raphson iteration */
                rc_Q31 = silk_RSHIFT32( rc_Q31 + silk_DIV32( tmp2, rc_Q31 ), 1 );           /* Q15 */
                rc_Q31 = silk_LSHIFT32( rc_Q31, 16 );                                       /* Q31 */
                if( num < 0 ) {
                    /* Ensure the adjusted reflection coefficient has the original sign */
                    rc_Q31 = -rc_Q31;
                }
            }
            invGain_Q30 = minInvGain_Q30;
            reached_max_gain = 1;
        } else {
            invGain_Q30 = tmp1;
        }

        /* Update the AR coefficients */
        for( k = 0; k < (n + 1) >> 1; k++ ) {
            tmp1 = Af_QA[ k ];                                                              /* QA */
            tmp2 = Af_QA[ n - k - 1 ];                                                      /* QA */
            Af_QA[ k ]         = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 );  /* QA */
            Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 );  /* QA */
        }
        Af_QA[ n ] = silk_RSHIFT32( rc_Q31, 31 - QA );                                      /* QA */

        if( reached_max_gain ) {
            /* Reached max prediction gain; set remaining coefficients to zero and exit loop */
            for( k = n + 1; k < D; k++ ) {
                Af_QA[ k ] = 0;
            }
            break;
        }

        /* Update C * Af and C * Ab */
        for( k = 0; k <= n + 1; k++ ) {
            tmp1 = CAf[ k ];                                                                /* Q( -rshifts ) */
            tmp2 = CAb[ n - k + 1 ];                                                        /* Q( -rshifts ) */
            CAf[ k ]         = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 );    /* Q( -rshifts ) */
            CAb[ n - k + 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 );    /* Q( -rshifts ) */
        }
    }

    if( reached_max_gain ) {
        for( k = 0; k < D; k++ ) {
            /* Scale coefficients */
            A_Q16[ k ] = -silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 );
        }
        /* Subtract energy of preceding samples from C0 */
        if( rshifts > 0 ) {
            for( s = 0; s < nb_subfr; s++ ) {
                x_ptr = x + s * subfr_length;
                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );
            }
        } else {
            for( s = 0; s < nb_subfr; s++ ) {
                x_ptr = x + s * subfr_length;
                C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D, arch ), -rshifts );
            }
        }
        /* Approximate residual energy */
        *res_nrg = silk_LSHIFT( silk_SMMUL( invGain_Q30, C0 ), 2 );
        *res_nrg_Q = -rshifts;
    } else {
        /* Return residual energy */
        nrg  = CAf[ 0 ];                                                                    /* Q( -rshifts ) */
        tmp1 = (opus_int32)1 << 16;                                                         /* Q16 */
        for( k = 0; k < D; k++ ) {
            Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 );                               /* Q16 */
            nrg  = silk_SMLAWW( nrg, CAf[ k + 1 ], Atmp1 );                                 /* Q( -rshifts ) */
            tmp1 = silk_SMLAWW( tmp1, Atmp1, Atmp1 );                                       /* Q16 */
            A_Q16[ k ] = -Atmp1;
        }
        *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 ); /* Q( -rshifts ) */
        *res_nrg_Q = -rshifts;
    }

#ifdef OPUS_CHECK_ASM
    {
        opus_int32 res_nrg_c = 0;
        opus_int res_nrg_Q_c = 0;
        opus_int32 A_Q16_c[ MAX_LPC_ORDER ] = {0};

        silk_burg_modified_c(
            &res_nrg_c,
            &res_nrg_Q_c,
            A_Q16_c,
            x,
            minInvGain_Q30,
            subfr_length,
            nb_subfr,
            D,
            0
        );

        silk_assert( *res_nrg == res_nrg_c );
        silk_assert( *res_nrg_Q == res_nrg_Q_c );
        silk_assert( !memcmp( A_Q16, A_Q16_c, D * sizeof( *A_Q16 ) ) );
    }
#endif
}