/* Copyright (c) 2014-2020, Cisco Systems, INC
   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

#include "SigProc_FIX.h"
#include "define.h"
#include "tuning_parameters.h"
#include "pitch.h"
#include "celt/x86/x86cpu.h"

#define MAX_FRAME_SIZE              384             /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */

#define QA                          25
#define N_BITS_HEAD_ROOM            3
#define MIN_RSHIFTS                 -16
#define MAX_RSHIFTS                 (32 - QA)
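/* QA is the fixed-point precision of the internal prediction coefficients Af_QA[],
   while N_BITS_HEAD_ROOM, MIN_RSHIFTS and MAX_RSHIFTS bound the dynamic shift applied
   to the correlations below so that they keep some head room in 32 bits. */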

/* Compute reflection coefficients from input signal */
void silk_burg_modified_sse4_1(
    opus_int32                  *res_nrg,           /* O    Residual energy                                             */
    opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
    opus_int32                  A_Q16[],            /* O    Prediction coefficients (length order)                      */
    const opus_int16            x[],                /* I    Input signal, length: nb_subfr * ( D + subfr_length )       */
    const opus_int32            minInvGain_Q30,     /* I    Inverse of max prediction gain                              */
    const opus_int              subfr_length,       /* I    Input signal subframe length (incl. D preceding samples)    */
    const opus_int              nb_subfr,           /* I    Number of subframes stacked in x                            */
    const opus_int              D,                  /* I    Order                                                       */
    int                         arch                /* I    Run-time architecture                                       */
)
{
    opus_int         k, n, s, lz, rshifts, reached_max_gain;
    opus_int32       C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
    const opus_int16 *x_ptr;
    opus_int32       C_first_row[ SILK_MAX_ORDER_LPC ];
    opus_int32       C_last_row[  SILK_MAX_ORDER_LPC ];
    opus_int32       Af_QA[       SILK_MAX_ORDER_LPC ];
    opus_int32       CAf[ SILK_MAX_ORDER_LPC + 1 ];
    opus_int32       CAb[ SILK_MAX_ORDER_LPC + 1 ];
    opus_int32       xcorr[ SILK_MAX_ORDER_LPC ];
    opus_int64       C0_64;

    __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
    __m128i CONST1 = _mm_set1_epi32(1);

    celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );

    /* Compute autocorrelations, added over subframes */
    C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );
    lz = silk_CLZ64(C0_64);
    rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
    if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
    if (rshifts < MIN_RSHIFTS) rshifts = MIN_RSHIFTS;
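    /* With lz leading zeros in C0_64, the shift computed above leaves roughly
       N_BITS_HEAD_ROOM bits of head room in the 32-bit correlation values, so the
       correlation sums accumulated below should stay clear of overflow. */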

    if (rshifts > 0) {
        C0 = (opus_int32)silk_RSHIFT64(C0_64, rshifts );
    } else {
        C0 = silk_LSHIFT32((opus_int32)C0_64, -rshifts );
    }

    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1;                                /* Q(-rshifts) */
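    /* The silk_SMMUL() term adds a small fraction ( FIND_LPC_COND_FAC ) of C0 as
       white-noise conditioning, keeping the recursion numerically well behaved;
       a matching term is subtracted from the residual energy further below. */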
    silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
    if( rshifts > 0 ) {
        for( s = 0; s < nb_subfr; s++ ) {
            x_ptr = x + s * subfr_length;
            for( n = 1; n < D + 1; n++ ) {
                C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
                    silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
            }
        }
    } else {
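        /* For zero or negative shifts the lag products are accumulated at full precision:
           celt_pitch_xcorr() covers the bulk of each subframe and the short scalar loop
           below adds the last few products of each lag, so each lag ends up with the same
           correlation sum as the silk_inner_prod16() calls in the branch above. */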
        for( s = 0; s < nb_subfr; s++ ) {
            int i;
            opus_int32 d;
            x_ptr = x + s * subfr_length;
            celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch );
            for( n = 1; n < D + 1; n++ ) {
               for ( i = n + subfr_length - D, d = 0; i < subfr_length; i++ )
                  d = MAC16_16( d, x_ptr[ i ], x_ptr[ i - n ] );
               xcorr[ n - 1 ] += d;
            }
            for( n = 1; n < D + 1; n++ ) {
                C_first_row[ n - 1 ] += silk_LSHIFT32( xcorr[ n - 1 ], -rshifts );
            }
        }
    }
    silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );

    /* Initialize */
    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1;                                /* Q(-rshifts) */

    invGain_Q30 = (opus_int32)1 << 30;
    reached_max_gain = 0;
    for( n = 0; n < D; n++ ) {
        /* Update first row of correlation matrix (without first element) */
        /* Update last row of correlation matrix (without last element, stored in reversed order) */
        /* Update C * Af */
        /* Update C * flipud(Af) (stored in reversed order) */
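        /* Each pass of this loop is one order update of the modified Burg recursion:
           the correlation rows and the C*Af / C*Ab products are first brought up to
           date for order n, after which the next reflection coefficient is derived
           from them further below. */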
        if( rshifts > -2 ) {
            for( s = 0; s < nb_subfr; s++ ) {
                x_ptr = x + s * subfr_length;
                x1  = -silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    16 - rshifts );        /* Q(16-rshifts) */
                x2  = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts );        /* Q(16-rshifts) */
                tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    QA - 16 );             /* Q(QA-16) */
                tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16 );             /* Q(QA-16) */
                for( k = 0; k < n; k++ ) {
                    C_first_row[ k ] = silk_SMLAWB( C_first_row[ k ], x1, x_ptr[ n - k - 1 ]            ); /* Q( -rshifts ) */
                    C_last_row[ k ]  = silk_SMLAWB( C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
                    Atmp_QA = Af_QA[ k ];
                    tmp1 = silk_SMLAWB( tmp1, Atmp_QA, x_ptr[ n - k - 1 ]            );                 /* Q(QA-16) */
                    tmp2 = silk_SMLAWB( tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ] );                 /* Q(QA-16) */
                }
                tmp1 = silk_LSHIFT32( -tmp1, 32 - QA - rshifts );                                       /* Q(16-rshifts) */
                tmp2 = silk_LSHIFT32( -tmp2, 32 - QA - rshifts );                                       /* Q(16-rshifts) */
                for( k = 0; k <= n; k++ ) {
                    CAf[ k ] = silk_SMLAWB( CAf[ k ], tmp1, x_ptr[ n - k ]                    );        /* Q( -rshift ) */
                    CAb[ k ] = silk_SMLAWB( CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ] );        /* Q( -rshift ) */
                }
            }
        } else {
            for( s = 0; s < nb_subfr; s++ ) {
                x_ptr = x + s * subfr_length;
                x1  = -silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    -rshifts );            /* Q( -rshifts ) */
                x2  = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts );            /* Q( -rshifts ) */
                tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    17 );                  /* Q17 */
                tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 17 );                  /* Q17 */

                X1_3210 = _mm_set1_epi32( x1 );
                X2_3210 = _mm_set1_epi32( x2 );
                TMP1_3210 = _mm_setzero_si128();
                TMP2_3210 = _mm_setzero_si128();
                for( k = 0; k < n - 3; k += 4 ) {
                    PTR_3210   = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 1 - 3 ] );
                    SUBFR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k ] );
                    FIRST_3210 = _mm_loadu_si128( (__m128i *)&C_first_row[ k ] );
                    PTR_3210   = _mm_shuffle_epi32( PTR_3210,  _MM_SHUFFLE( 0, 1, 2, 3 ) );
                    LAST_3210  = _mm_loadu_si128( (__m128i *)&C_last_row[ k ] );
                    ATMP_3210  = _mm_loadu_si128( (__m128i *)&Af_QA[ k ] );

                    T1_3210 = _mm_mullo_epi32( PTR_3210, X1_3210 );
                    T2_3210 = _mm_mullo_epi32( SUBFR_3210, X2_3210 );

                    ATMP_3210 = _mm_srai_epi32( ATMP_3210, 7 );
                    ATMP_3210 = _mm_add_epi32( ATMP_3210, CONST1 );
                    ATMP_3210 = _mm_srai_epi32( ATMP_3210, 1 );

                    FIRST_3210 = _mm_add_epi32( FIRST_3210, T1_3210 );
                    LAST_3210 = _mm_add_epi32( LAST_3210, T2_3210 );

                    PTR_3210   = _mm_mullo_epi32( ATMP_3210, PTR_3210 );
                    SUBFR_3210   = _mm_mullo_epi32( ATMP_3210, SUBFR_3210 );

                    _mm_storeu_si128( (__m128i *)&C_first_row[ k ], FIRST_3210 );
                    _mm_storeu_si128( (__m128i *)&C_last_row[ k ], LAST_3210 );

                    TMP1_3210 = _mm_add_epi32( TMP1_3210, PTR_3210 );
                    TMP2_3210 = _mm_add_epi32( TMP2_3210, SUBFR_3210 );
                }

                TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210 ) );
                TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210 ) );
                TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E ) );
                TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E ) );

                tmp1 += _mm_cvtsi128_si32( TMP1_3210 );
                tmp2 += _mm_cvtsi128_si32( TMP2_3210 );
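                /* The unpack/shuffle adds above fold the four 32-bit partial sums in each
                   accumulator into lane 0 (a horizontal add); the extracted lane is then
                   added to the scalar running sums tmp1 and tmp2. */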

                for( ; k < n; k++ ) {
                    C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ]            ); /* Q( -rshifts ) */
                    C_last_row[ k ]  = silk_MLA( C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
                    Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 );                                   /* Q17 */
                    /* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
                       but they cancel each other and the real result seems to always fit in a 32-bit
                       signed integer. This was determined experimentally, not theoretically (unfortunately). */
                    tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ],            Atmp1 );                      /* Q17 */
                    tmp2 = silk_MLA_ovflw( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 );                      /* Q17 */
                }

                tmp1 = -tmp1;                /* Q17 */
                tmp2 = -tmp2;                /* Q17 */

                {
                    __m128i xmm_tmp1, xmm_tmp2;
                    __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1;
                    __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1;

                    xmm_tmp1 = _mm_set1_epi32( tmp1 );
                    xmm_tmp2 = _mm_set1_epi32( tmp2 );

                    for( k = 0; k <= n - 3; k += 4 ) {
                        xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 3 ] );
                        xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k - 1 ] );

                        xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 1, 2, 3 ) );

                        xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32( xmm_x_ptr_n_k_x2x0, -rshifts - 1 );
                        xmm_x_ptr_sub_x2x0 = _mm_slli_epi32( xmm_x_ptr_sub_x2x0, -rshifts - 1 );
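                        /* This branch only runs when rshifts <= -2, so -rshifts - 1 is a
                           shift count of at least 1 here. */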

                        /* equivalent to a 4-byte right shift: xmm_x_ptr_n_k_x3x1 = _mm_srli_si128(xmm_x_ptr_n_k_x2x0, 4) */
                        xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
                        xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_sub_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

                        xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32( xmm_x_ptr_n_k_x2x0, xmm_tmp1 );
                        xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32( xmm_x_ptr_n_k_x3x1, xmm_tmp1 );
                        xmm_x_ptr_sub_x2x0 = _mm_mul_epi32( xmm_x_ptr_sub_x2x0, xmm_tmp2 );
                        xmm_x_ptr_sub_x3x1 = _mm_mul_epi32( xmm_x_ptr_sub_x3x1, xmm_tmp2 );

                        xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64( xmm_x_ptr_n_k_x2x0, 16 );
                        xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64( xmm_x_ptr_n_k_x3x1, 16 );
                        xmm_x_ptr_sub_x2x0 = _mm_srli_epi64( xmm_x_ptr_sub_x2x0, 16 );
                        xmm_x_ptr_sub_x3x1 = _mm_slli_epi64( xmm_x_ptr_sub_x3x1, 16 );

                        xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16( xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC );
                        xmm_x_ptr_sub_x2x0 = _mm_blend_epi16( xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC );
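                        /* Taken together, the multiply/shift/blend sequence above keeps bits
                           16..47 of each signed 32x32-bit product, i.e. ( x * tmp ) >> 16 per
                           32-bit lane, mirroring the silk_SMLAWW() used in the scalar tail below. */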

                        X1_3210  = _mm_loadu_si128( (__m128i *)&CAf[ k ] );
                        PTR_3210 = _mm_loadu_si128( (__m128i *)&CAb[ k ] );

                        X1_3210  = _mm_add_epi32( X1_3210, xmm_x_ptr_n_k_x2x0 );
                        PTR_3210 = _mm_add_epi32( PTR_3210, xmm_x_ptr_sub_x2x0 );

                        _mm_storeu_si128( (__m128i *)&CAf[ k ], X1_3210 );
                        _mm_storeu_si128( (__m128i *)&CAb[ k ], PTR_3210 );
                    }

                    for( ; k <= n; k++ ) {
                        CAf[ k ] = silk_SMLAWW( CAf[ k ], tmp1,
                            silk_LSHIFT32( (opus_int32)x_ptr[ n - k ], -rshifts - 1 ) );                    /* Q( -rshift ) */
                        CAb[ k ] = silk_SMLAWW( CAb[ k ], tmp2,
                            silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1 ) ); /* Q( -rshift ) */
                    }
                }
            }
        }

        /* Calculate numerator and denominator for the next order reflection (parcor) coefficient */
        tmp1 = C_first_row[ n ];                                                                        /* Q( -rshifts ) */
        tmp2 = C_last_row[ n ];                                                                         /* Q( -rshifts ) */
        num  = 0;                                                                                       /* Q( -rshifts ) */
        nrg  = silk_ADD32( CAb[ 0 ], CAf[ 0 ] );                                                        /* Q( 1-rshifts ) */
        for( k = 0; k < n; k++ ) {
            Atmp_QA = Af_QA[ k ];
            lz = silk_CLZ32( silk_abs( Atmp_QA ) ) - 1;
            lz = silk_min( 32 - QA, lz );
            Atmp1 = silk_LSHIFT32( Atmp_QA, lz );                                                       /* Q( QA + lz ) */

            tmp1 = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( C_last_row[  n - k - 1 ], Atmp1 ), 32 - QA - lz );  /* Q( -rshifts ) */
            tmp2 = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( C_first_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz );  /* Q( -rshifts ) */
            num  = silk_ADD_LSHIFT32( num,  silk_SMMUL( CAb[ n - k ],             Atmp1 ), 32 - QA - lz );  /* Q( -rshifts ) */
            nrg  = silk_ADD_LSHIFT32( nrg,  silk_SMMUL( silk_ADD32( CAb[ k + 1 ], CAf[ k + 1 ] ),
                                                                                Atmp1 ), 32 - QA - lz );    /* Q( 1-rshifts ) */
        }
        CAf[ n + 1 ] = tmp1;                                                                            /* Q( -rshifts ) */
        CAb[ n + 1 ] = tmp2;                                                                            /* Q( -rshifts ) */
        num = silk_ADD32( num, tmp2 );                                                                  /* Q( -rshifts ) */
        num = silk_LSHIFT32( -num, 1 );                                                                 /* Q( 1-rshifts ) */

        /* Calculate the next order reflection (parcor) coefficient */
        if( silk_abs( num ) < nrg ) {
            rc_Q31 = silk_DIV32_varQ( num, nrg, 31 );
        } else {
            rc_Q31 = ( num > 0 ) ? silk_int32_MAX : silk_int32_MIN;
        }

        /* Update inverse prediction gain */
        tmp1 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 );
        tmp1 = silk_LSHIFT( silk_SMMUL( invGain_Q30, tmp1 ), 2 );
        if( tmp1 <= minInvGain_Q30 ) {
            /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */
            tmp2 = ( (opus_int32)1 << 30 ) - silk_DIV32_varQ( minInvGain_Q30, invGain_Q30, 30 );            /* Q30 */
            rc_Q31 = silk_SQRT_APPROX( tmp2 );                                                  /* Q15 */
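            /* Despite its name, rc_Q31 temporarily holds the Q15 square root of tmp2 at
               this point; it is refined and converted to Q31 just below. */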
            if( rc_Q31 > 0 ) {
                 /* Newton-Raphson iteration */
                rc_Q31 = silk_RSHIFT32( rc_Q31 + silk_DIV32( tmp2, rc_Q31 ), 1 );                   /* Q15 */
                rc_Q31 = silk_LSHIFT32( rc_Q31, 16 );                                               /* Q31 */
                if( num < 0 ) {
                    /* Ensure the adjusted reflection coefficient has the original sign */
                    rc_Q31 = -rc_Q31;
                }
            }
            invGain_Q30 = minInvGain_Q30;
            reached_max_gain = 1;
        } else {
            invGain_Q30 = tmp1;
        }

        /* Update the AR coefficients */
        for( k = 0; k < (n + 1) >> 1; k++ ) {
            tmp1 = Af_QA[ k ];                                                                  /* QA */
            tmp2 = Af_QA[ n - k - 1 ];                                                          /* QA */
            Af_QA[ k ]         = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 );      /* QA */
            Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 );      /* QA */
        }
        Af_QA[ n ] = silk_RSHIFT32( rc_Q31, 31 - QA );                                          /* QA */
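        /* The loop above is the usual order update A'[ k ] = A[ k ] + rc * A[ n - k - 1 ],
           done in place by pairing index k with its mirror n - k - 1, and the new order-n
           coefficient is the reflection coefficient itself converted to QA. */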

        if( reached_max_gain ) {
            /* Reached max prediction gain; set remaining coefficients to zero and exit loop */
            for( k = n + 1; k < D; k++ ) {
                Af_QA[ k ] = 0;
            }
            break;
        }

        /* Update C * Af and C * Ab */
        for( k = 0; k <= n + 1; k++ ) {
            tmp1 = CAf[ k ];                                                                    /* Q( -rshifts ) */
            tmp2 = CAb[ n - k + 1 ];                                                            /* Q( -rshifts ) */
            CAf[ k ]         = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 );        /* Q( -rshifts ) */
            CAb[ n - k + 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 );        /* Q( -rshifts ) */
        }
    }

    if( reached_max_gain ) {
        for( k = 0; k < D; k++ ) {
            /* Scale coefficients */
            A_Q16[ k ] = -silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 );
        }
        /* Subtract energy of preceding samples from C0 */
        if( rshifts > 0 ) {
            for( s = 0; s < nb_subfr; s++ ) {
                x_ptr = x + s * subfr_length;
                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );
            }
        } else {
            for( s = 0; s < nb_subfr; s++ ) {
                x_ptr = x + s * subfr_length;
                C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D, arch ), -rshifts );
            }
        }
        /* Approximate residual energy */
        *res_nrg = silk_LSHIFT( silk_SMMUL( invGain_Q30, C0 ), 2 );
        *res_nrg_Q = -rshifts;
    } else {
        /* Return residual energy */
        nrg  = CAf[ 0 ];                                                                            /* Q( -rshifts ) */
        tmp1 = (opus_int32)1 << 16;                                                                             /* Q16 */
        for( k = 0; k < D; k++ ) {
            Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 );                                       /* Q16 */
            nrg  = silk_SMLAWW( nrg, CAf[ k + 1 ], Atmp1 );                                         /* Q( -rshifts ) */
            tmp1 = silk_SMLAWW( tmp1, Atmp1, Atmp1 );                                               /* Q16 */
            A_Q16[ k ] = -Atmp1;
        }
        *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */
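        /* The term subtracted above approximately removes the white-noise conditioning
           that was added to C0 at the start, scaled by the filter energy 1 + sum( A^2 )
           accumulated in tmp1. */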
        *res_nrg_Q = -rshifts;
    }

#ifdef OPUS_CHECK_ASM
    {
        opus_int32 res_nrg_c = 0;
        opus_int res_nrg_Q_c = 0;
        opus_int32 A_Q16_c[ MAX_LPC_ORDER ] = {0};

        silk_burg_modified_c(
            &res_nrg_c,
            &res_nrg_Q_c,
            A_Q16_c,
            x,
            minInvGain_Q30,
            subfr_length,
            nb_subfr,
            D,
            0
        );

        silk_assert( *res_nrg == res_nrg_c );
        silk_assert( *res_nrg_Q == res_nrg_Q_c );
        silk_assert( !memcmp( A_Q16, A_Q16_c, D * sizeof( *A_Q16 ) ) );
    }
#endif
}