1 /* Copyright (c) 2014, Cisco Systems, INC
2 Written by XiangMingZhu WeiZhou MinPeng YanWang
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include <xmmintrin.h>
33 #include <emmintrin.h>
34 #include <smmintrin.h>
35 #include "main.h"
36 #include "celt/x86/x86cpu.h"
37
38 #include "stack_alloc.h"
39
40 typedef struct {
41 opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ];
42 opus_int32 RandState[ DECISION_DELAY ];
43 opus_int32 Q_Q10[ DECISION_DELAY ];
44 opus_int32 Xq_Q14[ DECISION_DELAY ];
45 opus_int32 Pred_Q15[ DECISION_DELAY ];
46 opus_int32 Shape_Q14[ DECISION_DELAY ];
47 opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ];
48 opus_int32 LF_AR_Q14;
49 opus_int32 Seed;
50 opus_int32 SeedInit;
51 opus_int32 RD_Q10;
52 } NSQ_del_dec_struct;
53
54 typedef struct {
55 opus_int32 Q_Q10;
56 opus_int32 RD_Q10;
57 opus_int32 xq_Q14;
58 opus_int32 LF_AR_Q14;
59 opus_int32 sLTP_shp_Q14;
60 opus_int32 LPC_exc_Q14;
61 } NSQ_sample_struct;
62
63 typedef NSQ_sample_struct NSQ_sample_pair[ 2 ];
64
65 static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
66 const silk_encoder_state *psEncC, /* I Encoder State */
67 silk_nsq_state *NSQ, /* I/O NSQ state */
68 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
69 const opus_int32 x_Q3[], /* I Input in Q3 */
70 opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
71 const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
72 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
73 opus_int subfr, /* I Subframe number */
74 opus_int nStatesDelayedDecision, /* I Number of del dec states */
75 const opus_int LTP_scale_Q14, /* I LTP state scaling */
76 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
77 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
78 const opus_int signal_type, /* I Signal type */
79 const opus_int decisionDelay /* I Decision delay */
80 );
81
82 /******************************************/
83 /* Noise shape quantizer for one subframe */
84 /******************************************/
85 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
86 silk_nsq_state *NSQ, /* I/O NSQ state */
87 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
88 opus_int signalType, /* I Signal type */
89 const opus_int32 x_Q10[], /* I */
90 opus_int8 pulses[], /* O */
91 opus_int16 xq[], /* O */
92 opus_int32 sLTP_Q15[], /* I/O LTP filter state */
93 opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */
94 const opus_int16 a_Q12[], /* I Short term prediction coefs */
95 const opus_int16 b_Q14[], /* I Long term prediction coefs */
96 const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */
97 opus_int lag, /* I Pitch lag */
98 opus_int32 HarmShapeFIRPacked_Q14, /* I */
99 opus_int Tilt_Q14, /* I Spectral tilt */
100 opus_int32 LF_shp_Q14, /* I */
101 opus_int32 Gain_Q16, /* I */
102 opus_int Lambda_Q10, /* I */
103 opus_int offset_Q10, /* I */
104 opus_int length, /* I Input length */
105 opus_int subfr, /* I Subframe number */
106 opus_int shapingLPCOrder, /* I Shaping LPC filter order */
107 opus_int predictLPCOrder, /* I Prediction filter order */
108 opus_int warping_Q16, /* I */
109 opus_int nStatesDelayedDecision, /* I Number of states in decision tree */
110 opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */
111 opus_int decisionDelay /* I */
112 );
113
silk_NSQ_del_dec_sse4_1(const silk_encoder_state * psEncC,silk_nsq_state * NSQ,SideInfoIndices * psIndices,const opus_int32 x_Q3[],opus_int8 pulses[],const opus_int16 PredCoef_Q12[2* MAX_LPC_ORDER],const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],const opus_int16 AR2_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER],const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],const opus_int Tilt_Q14[MAX_NB_SUBFR],const opus_int32 LF_shp_Q14[MAX_NB_SUBFR],const opus_int32 Gains_Q16[MAX_NB_SUBFR],const opus_int pitchL[MAX_NB_SUBFR],const opus_int Lambda_Q10,const opus_int LTP_scale_Q14)114 void silk_NSQ_del_dec_sse4_1(
115 const silk_encoder_state *psEncC, /* I Encoder State */
116 silk_nsq_state *NSQ, /* I/O NSQ state */
117 SideInfoIndices *psIndices, /* I/O Quantization Indices */
118 const opus_int32 x_Q3[], /* I Prefiltered input signal */
119 opus_int8 pulses[], /* O Quantized pulse signal */
120 const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
121 const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
122 const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
123 const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
124 const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
125 const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
126 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
127 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
128 const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
129 const opus_int LTP_scale_Q14 /* I LTP state scaling */
130 )
131 {
132 opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
133 opus_int last_smple_idx, smpl_buf_idx, decisionDelay;
134 const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
135 opus_int16 *pxq;
136 VARDECL( opus_int32, sLTP_Q15 );
137 VARDECL( opus_int16, sLTP );
138 opus_int32 HarmShapeFIRPacked_Q14;
139 opus_int offset_Q10;
140 opus_int32 RDmin_Q10, Gain_Q10;
141 VARDECL( opus_int32, x_sc_Q10 );
142 VARDECL( opus_int32, delayedGain_Q10 );
143 VARDECL( NSQ_del_dec_struct, psDelDec );
144 NSQ_del_dec_struct *psDD;
145 SAVE_STACK;
146
147 /* Set unvoiced lag to the previous one, overwrite later for voiced */
148 lag = NSQ->lagPrev;
149
150 silk_assert( NSQ->prev_gain_Q16 != 0 );
151
152 /* Initialize delayed decision states */
153 ALLOC( psDelDec, psEncC->nStatesDelayedDecision, NSQ_del_dec_struct );
154 silk_memset( psDelDec, 0, psEncC->nStatesDelayedDecision * sizeof( NSQ_del_dec_struct ) );
155 for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) {
156 psDD = &psDelDec[ k ];
157 psDD->Seed = ( k + psIndices->Seed ) & 3;
158 psDD->SeedInit = psDD->Seed;
159 psDD->RD_Q10 = 0;
160 psDD->LF_AR_Q14 = NSQ->sLF_AR_shp_Q14;
161 psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ];
162 silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
163 silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) );
164 }
165
166 offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
167 smpl_buf_idx = 0; /* index of oldest samples */
168
169 decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length );
170
171 /* For voiced frames limit the decision delay to lower than the pitch lag */
172 if( psIndices->signalType == TYPE_VOICED ) {
173 for( k = 0; k < psEncC->nb_subfr; k++ ) {
174 decisionDelay = silk_min_int( decisionDelay, pitchL[ k ] - LTP_ORDER / 2 - 1 );
175 }
176 } else {
177 if( lag > 0 ) {
178 decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 );
179 }
180 }
181
182 if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
183 LSF_interpolation_flag = 0;
184 } else {
185 LSF_interpolation_flag = 1;
186 }
187
188 ALLOC( sLTP_Q15,
189 psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
190 ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
191 ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
192 ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
193 /* Set up pointers to start of sub frame */
194 pxq = &NSQ->xq[ psEncC->ltp_mem_length ];
195 NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
196 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
197 subfr = 0;
198 for( k = 0; k < psEncC->nb_subfr; k++ ) {
199 A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
200 B_Q14 = <PCoef_Q14[ k * LTP_ORDER ];
201 AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ];
202
203 /* Noise shape parameters */
204 silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
205 HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
206 HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
207
208 NSQ->rewhite_flag = 0;
209 if( psIndices->signalType == TYPE_VOICED ) {
210 /* Voiced */
211 lag = pitchL[ k ];
212
213 /* Re-whitening */
214 if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
215 if( k == 2 ) {
216 /* RESET DELAYED DECISIONS */
217 /* Find winner */
218 RDmin_Q10 = psDelDec[ 0 ].RD_Q10;
219 Winner_ind = 0;
220 for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) {
221 if( psDelDec[ i ].RD_Q10 < RDmin_Q10 ) {
222 RDmin_Q10 = psDelDec[ i ].RD_Q10;
223 Winner_ind = i;
224 }
225 }
226 for( i = 0; i < psEncC->nStatesDelayedDecision; i++ ) {
227 if( i != Winner_ind ) {
228 psDelDec[ i ].RD_Q10 += ( silk_int32_MAX >> 4 );
229 silk_assert( psDelDec[ i ].RD_Q10 >= 0 );
230 }
231 }
232
233 /* Copy final part of signals from winner state to output and long-term filter states */
234 psDD = &psDelDec[ Winner_ind ];
235 last_smple_idx = smpl_buf_idx + decisionDelay;
236 for( i = 0; i < decisionDelay; i++ ) {
237 last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
238 if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
239 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
240 pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
241 silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gains_Q16[ 1 ] ), 14 ) );
242 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];
243 }
244
245 subfr = 0;
246 }
247
248 /* Rewhiten with new A coefs */
249 start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
250 celt_assert( start_idx > 0 );
251
252 silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
253 A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
254
255 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
256 NSQ->rewhite_flag = 1;
257 }
258 }
259
260 silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k,
261 psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );
262
263 silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
264 delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],
265 Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
266 psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
267
268 x_Q3 += psEncC->subfr_length;
269 pulses += psEncC->subfr_length;
270 pxq += psEncC->subfr_length;
271 }
272
273 /* Find winner */
274 RDmin_Q10 = psDelDec[ 0 ].RD_Q10;
275 Winner_ind = 0;
276 for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) {
277 if( psDelDec[ k ].RD_Q10 < RDmin_Q10 ) {
278 RDmin_Q10 = psDelDec[ k ].RD_Q10;
279 Winner_ind = k;
280 }
281 }
282
283 /* Copy final part of signals from winner state to output and long-term filter states */
284 psDD = &psDelDec[ Winner_ind ];
285 psIndices->Seed = psDD->SeedInit;
286 last_smple_idx = smpl_buf_idx + decisionDelay;
287 Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 );
288 for( i = 0; i < decisionDelay; i++ ) {
289 last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
290 if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
291 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
292 pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
293 silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) );
294 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];
295 }
296 silk_memcpy( NSQ->sLPC_Q14, &psDD->sLPC_Q14[ psEncC->subfr_length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
297 silk_memcpy( NSQ->sAR2_Q14, psDD->sAR2_Q14, sizeof( psDD->sAR2_Q14 ) );
298
299 /* Update states */
300 NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14;
301 NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];
302
303 /* Save quantized speech signal */
304 silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
305 silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
306 RESTORE_STACK;
307 }
308
309 /******************************************/
310 /* Noise shape quantizer for one subframe */
311 /******************************************/
silk_noise_shape_quantizer_del_dec_sse4_1(silk_nsq_state * NSQ,NSQ_del_dec_struct psDelDec[],opus_int signalType,const opus_int32 x_Q10[],opus_int8 pulses[],opus_int16 xq[],opus_int32 sLTP_Q15[],opus_int32 delayedGain_Q10[],const opus_int16 a_Q12[],const opus_int16 b_Q14[],const opus_int16 AR_shp_Q13[],opus_int lag,opus_int32 HarmShapeFIRPacked_Q14,opus_int Tilt_Q14,opus_int32 LF_shp_Q14,opus_int32 Gain_Q16,opus_int Lambda_Q10,opus_int offset_Q10,opus_int length,opus_int subfr,opus_int shapingLPCOrder,opus_int predictLPCOrder,opus_int warping_Q16,opus_int nStatesDelayedDecision,opus_int * smpl_buf_idx,opus_int decisionDelay)312 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
313 silk_nsq_state *NSQ, /* I/O NSQ state */
314 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
315 opus_int signalType, /* I Signal type */
316 const opus_int32 x_Q10[], /* I */
317 opus_int8 pulses[], /* O */
318 opus_int16 xq[], /* O */
319 opus_int32 sLTP_Q15[], /* I/O LTP filter state */
320 opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */
321 const opus_int16 a_Q12[], /* I Short term prediction coefs */
322 const opus_int16 b_Q14[], /* I Long term prediction coefs */
323 const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */
324 opus_int lag, /* I Pitch lag */
325 opus_int32 HarmShapeFIRPacked_Q14, /* I */
326 opus_int Tilt_Q14, /* I Spectral tilt */
327 opus_int32 LF_shp_Q14, /* I */
328 opus_int32 Gain_Q16, /* I */
329 opus_int Lambda_Q10, /* I */
330 opus_int offset_Q10, /* I */
331 opus_int length, /* I Input length */
332 opus_int subfr, /* I Subframe number */
333 opus_int shapingLPCOrder, /* I Shaping LPC filter order */
334 opus_int predictLPCOrder, /* I Prediction filter order */
335 opus_int warping_Q16, /* I */
336 opus_int nStatesDelayedDecision, /* I Number of states in decision tree */
337 opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */
338 opus_int decisionDelay /* I */
339 )
340 {
341 opus_int i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
342 opus_int32 Winner_rand_state;
343 opus_int32 LTP_pred_Q14, LPC_pred_Q14, n_AR_Q14, n_LTP_Q14;
344 opus_int32 n_LF_Q14, r_Q10, rr_Q10, rd1_Q10, rd2_Q10, RDmin_Q10, RDmax_Q10;
345 opus_int32 q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
346 opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
347 opus_int32 *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
348 VARDECL( NSQ_sample_pair, psSampleState );
349 NSQ_del_dec_struct *psDD;
350 NSQ_sample_struct *psSS;
351
352 __m128i a_Q12_0123, a_Q12_4567, a_Q12_89AB, a_Q12_CDEF;
353 __m128i b_Q12_0123, b_sr_Q12_0123;
354 SAVE_STACK;
355
356 celt_assert( nStatesDelayedDecision > 0 );
357 ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );
358
359 shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
360 pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
361 Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
362
363 a_Q12_0123 = OP_CVTEPI16_EPI32_M64( a_Q12 );
364 a_Q12_4567 = OP_CVTEPI16_EPI32_M64( a_Q12 + 4 );
365
366 if( opus_likely( predictLPCOrder == 16 ) ) {
367 a_Q12_89AB = OP_CVTEPI16_EPI32_M64( a_Q12 + 8 );
368 a_Q12_CDEF = OP_CVTEPI16_EPI32_M64( a_Q12 + 12 );
369 }
370
371 if( signalType == TYPE_VOICED ){
372 b_Q12_0123 = OP_CVTEPI16_EPI32_M64( b_Q14 );
373 b_sr_Q12_0123 = _mm_shuffle_epi32( b_Q12_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
374 }
375 for( i = 0; i < length; i++ ) {
376 /* Perform common calculations used in all states */
377
378 /* Long-term prediction */
379 if( signalType == TYPE_VOICED ) {
380 /* Unrolled loop */
381 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
382 LTP_pred_Q14 = 2;
383 {
384 __m128i tmpa, tmpb, pred_lag_ptr_tmp;
385 pred_lag_ptr_tmp = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );
386 pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, 0x1B );
387 tmpa = _mm_mul_epi32( pred_lag_ptr_tmp, b_Q12_0123 );
388 tmpa = _mm_srli_si128( tmpa, 2 );
389
390 pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) );/* equal shift right 4 bytes */
391 pred_lag_ptr_tmp = _mm_mul_epi32( pred_lag_ptr_tmp, b_sr_Q12_0123 );
392 pred_lag_ptr_tmp = _mm_srli_si128( pred_lag_ptr_tmp, 2 );
393 pred_lag_ptr_tmp = _mm_add_epi32( pred_lag_ptr_tmp, tmpa );
394
395 tmpb = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 0, 3, 2 ) );/* equal shift right 8 bytes */
396 pred_lag_ptr_tmp = _mm_add_epi32( pred_lag_ptr_tmp, tmpb );
397 LTP_pred_Q14 += _mm_cvtsi128_si32( pred_lag_ptr_tmp );
398
399 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
400 LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 ); /* Q13 -> Q14 */
401 pred_lag_ptr++;
402 }
403 } else {
404 LTP_pred_Q14 = 0;
405 }
406
407 /* Long-term shaping */
408 if( lag > 0 ) {
409 /* Symmetric, packed FIR coefficients */
410 n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
411 n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
412 n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */
413 shp_lag_ptr++;
414 } else {
415 n_LTP_Q14 = 0;
416 }
417 {
418 __m128i tmpa, tmpb, psLPC_Q14_tmp, a_Q12_tmp;
419
420 for( k = 0; k < nStatesDelayedDecision; k++ ) {
421 /* Delayed decision state */
422 psDD = &psDelDec[ k ];
423
424 /* Sample state */
425 psSS = psSampleState[ k ];
426
427 /* Generate dither */
428 psDD->Seed = silk_RAND( psDD->Seed );
429
430 /* Pointer used in short term prediction and shaping */
431 psLPC_Q14 = &psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 + i ];
432 /* Short-term prediction */
433 silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 );
434 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
435 LPC_pred_Q14 = silk_RSHIFT( predictLPCOrder, 1 );
436
437 tmpb = _mm_setzero_si128();
438
439 /* step 1 */
440 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */
441 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); /* 0, -1, -2, -3 */
442 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_0123 ); /* 0, -1, -2, -3 * 0123 -> 0*0, 2*-2 */
443
444 tmpa = _mm_srli_epi64( tmpa, 16 );
445 tmpb = _mm_add_epi32( tmpb, tmpa );
446
447 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
448 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_0123, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
449 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); /* 1*-1, 3*-3 */
450 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
451 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
452
453 /* step 2 */
454 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -7 ] ) );
455 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
456 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_4567 );
457 tmpa = _mm_srli_epi64( tmpa, 16 );
458 tmpb = _mm_add_epi32( tmpb, tmpa );
459
460 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
461 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_4567, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
462 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
463 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
464 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
465
466 if ( opus_likely( predictLPCOrder == 16 ) )
467 {
468 /* step 3 */
469 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -11 ] ) );
470 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
471 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_89AB );
472 tmpa = _mm_srli_epi64( tmpa, 16 );
473 tmpb = _mm_add_epi32( tmpb, tmpa );
474
475 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
476 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_89AB, _MM_SHUFFLE(0, 3, 2, 1 ) );/* equal shift right 4 bytes */
477 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
478 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
479 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
480
481 /* setp 4 */
482 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -15 ] ) );
483 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
484 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );
485 tmpa = _mm_srli_epi64( tmpa, 16 );
486 tmpb = _mm_add_epi32( tmpb, tmpa );
487
488 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
489 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_CDEF, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
490 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
491 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
492 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
493
494 /* add at last */
495 /* equal shift right 8 bytes*/
496 tmpa = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) );
497 tmpb = _mm_add_epi32( tmpb, tmpa );
498 LPC_pred_Q14 += _mm_cvtsi128_si32( tmpb );
499 }
500 else
501 {
502 /* add at last */
503 tmpa = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) ); /* equal shift right 8 bytes*/
504 tmpb = _mm_add_epi32( tmpb, tmpa );
505 LPC_pred_Q14 += _mm_cvtsi128_si32( tmpb );
506
507 LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -8 ], a_Q12[ 8 ] );
508 LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -9 ], a_Q12[ 9 ] );
509 }
510
511 LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */
512
513 /* Noise shape feedback */
514 silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
515 /* Output of lowpass section */
516 tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping_Q16 );
517 /* Output of allpass section */
518 tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - tmp2, warping_Q16 );
519 psDD->sAR2_Q14[ 0 ] = tmp2;
520 n_AR_Q14 = silk_RSHIFT( shapingLPCOrder, 1 );
521 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ 0 ] );
522 /* Loop over allpass sections */
523 for( j = 2; j < shapingLPCOrder; j += 2 ) {
524 /* Output of allpass section */
525 tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], psDD->sAR2_Q14[ j + 0 ] - tmp1, warping_Q16 );
526 psDD->sAR2_Q14[ j - 1 ] = tmp1;
527 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ j - 1 ] );
528 /* Output of allpass section */
529 tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], psDD->sAR2_Q14[ j + 1 ] - tmp2, warping_Q16 );
530 psDD->sAR2_Q14[ j + 0 ] = tmp2;
531 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ j ] );
532 }
533 psDD->sAR2_Q14[ shapingLPCOrder - 1 ] = tmp1;
534 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ shapingLPCOrder - 1 ] );
535
536 n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 1 ); /* Q11 -> Q12 */
537 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, psDD->LF_AR_Q14, Tilt_Q14 ); /* Q12 */
538 n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 2 ); /* Q12 -> Q14 */
539
540 n_LF_Q14 = silk_SMULWB( psDD->Shape_Q14[ *smpl_buf_idx ], LF_shp_Q14 ); /* Q12 */
541 n_LF_Q14 = silk_SMLAWT( n_LF_Q14, psDD->LF_AR_Q14, LF_shp_Q14 ); /* Q12 */
542 n_LF_Q14 = silk_LSHIFT( n_LF_Q14, 2 ); /* Q12 -> Q14 */
543
544 /* Input minus prediction plus noise feedback */
545 /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */
546 tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 ); /* Q14 */
547 tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 ); /* Q13 */
548 tmp1 = silk_SUB32( tmp2, tmp1 ); /* Q13 */
549 tmp1 = silk_RSHIFT_ROUND( tmp1, 4 ); /* Q10 */
550
551 r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 ); /* residual error Q10 */
552
553 /* Flip sign depending on dither */
554 if ( psDD->Seed < 0 ) {
555 r_Q10 = -r_Q10;
556 }
557 r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );
558
559 /* Find two quantization level candidates and measure their rate-distortion */
560 q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
561 q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
562 if( q1_Q0 > 0 ) {
563 q1_Q10 = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
564 q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 );
565 q2_Q10 = silk_ADD32( q1_Q10, 1024 );
566 rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
567 rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
568 } else if( q1_Q0 == 0 ) {
569 q1_Q10 = offset_Q10;
570 q2_Q10 = silk_ADD32( q1_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
571 rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
572 rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
573 } else if( q1_Q0 == -1 ) {
574 q2_Q10 = offset_Q10;
575 q1_Q10 = silk_SUB32( q2_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
576 rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
577 rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
578 } else { /* q1_Q0 < -1 */
579 q1_Q10 = silk_ADD32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
580 q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 );
581 q2_Q10 = silk_ADD32( q1_Q10, 1024 );
582 rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
583 rd2_Q10 = silk_SMULBB( -q2_Q10, Lambda_Q10 );
584 }
585 rr_Q10 = silk_SUB32( r_Q10, q1_Q10 );
586 rd1_Q10 = silk_RSHIFT( silk_SMLABB( rd1_Q10, rr_Q10, rr_Q10 ), 10 );
587 rr_Q10 = silk_SUB32( r_Q10, q2_Q10 );
588 rd2_Q10 = silk_RSHIFT( silk_SMLABB( rd2_Q10, rr_Q10, rr_Q10 ), 10 );
589
590 if( rd1_Q10 < rd2_Q10 ) {
591 psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
592 psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
593 psSS[ 0 ].Q_Q10 = q1_Q10;
594 psSS[ 1 ].Q_Q10 = q2_Q10;
595 } else {
596 psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
597 psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
598 psSS[ 0 ].Q_Q10 = q2_Q10;
599 psSS[ 1 ].Q_Q10 = q1_Q10;
600 }
601
602 /* Update states for best quantization */
603
604 /* Quantized excitation */
605 exc_Q14 = silk_LSHIFT32( psSS[ 0 ].Q_Q10, 4 );
606 if ( psDD->Seed < 0 ) {
607 exc_Q14 = -exc_Q14;
608 }
609
610 /* Add predictions */
611 LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
612 xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
613
614 /* Update states */
615 sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 );
616 psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
617 psSS[ 0 ].LF_AR_Q14 = sLF_AR_shp_Q14;
618 psSS[ 0 ].LPC_exc_Q14 = LPC_exc_Q14;
619 psSS[ 0 ].xq_Q14 = xq_Q14;
620
621 /* Update states for second best quantization */
622
623 /* Quantized excitation */
624 exc_Q14 = silk_LSHIFT32( psSS[ 1 ].Q_Q10, 4 );
625 if ( psDD->Seed < 0 ) {
626 exc_Q14 = -exc_Q14;
627 }
628
629
630 /* Add predictions */
631 LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
632 xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 );
633
634 /* Update states */
635 sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 );
636 psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 );
637 psSS[ 1 ].LF_AR_Q14 = sLF_AR_shp_Q14;
638 psSS[ 1 ].LPC_exc_Q14 = LPC_exc_Q14;
639 psSS[ 1 ].xq_Q14 = xq_Q14;
640 }
641 }
642 *smpl_buf_idx = ( *smpl_buf_idx - 1 ) % DECISION_DELAY;
643 if( *smpl_buf_idx < 0 ) *smpl_buf_idx += DECISION_DELAY;
644 last_smple_idx = ( *smpl_buf_idx + decisionDelay ) % DECISION_DELAY;
645
646 /* Find winner */
647 RDmin_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10;
648 Winner_ind = 0;
649 for( k = 1; k < nStatesDelayedDecision; k++ ) {
650 if( psSampleState[ k ][ 0 ].RD_Q10 < RDmin_Q10 ) {
651 RDmin_Q10 = psSampleState[ k ][ 0 ].RD_Q10;
652 Winner_ind = k;
653 }
654 }
655
656 /* Increase RD values of expired states */
657 Winner_rand_state = psDelDec[ Winner_ind ].RandState[ last_smple_idx ];
658 for( k = 0; k < nStatesDelayedDecision; k++ ) {
659 if( psDelDec[ k ].RandState[ last_smple_idx ] != Winner_rand_state ) {
660 psSampleState[ k ][ 0 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 0 ].RD_Q10, silk_int32_MAX >> 4 );
661 psSampleState[ k ][ 1 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 1 ].RD_Q10, silk_int32_MAX >> 4 );
662 silk_assert( psSampleState[ k ][ 0 ].RD_Q10 >= 0 );
663 }
664 }
665
666 /* Find worst in first set and best in second set */
667 RDmax_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10;
668 RDmin_Q10 = psSampleState[ 0 ][ 1 ].RD_Q10;
669 RDmax_ind = 0;
670 RDmin_ind = 0;
671 for( k = 1; k < nStatesDelayedDecision; k++ ) {
672 /* find worst in first set */
673 if( psSampleState[ k ][ 0 ].RD_Q10 > RDmax_Q10 ) {
674 RDmax_Q10 = psSampleState[ k ][ 0 ].RD_Q10;
675 RDmax_ind = k;
676 }
677 /* find best in second set */
678 if( psSampleState[ k ][ 1 ].RD_Q10 < RDmin_Q10 ) {
679 RDmin_Q10 = psSampleState[ k ][ 1 ].RD_Q10;
680 RDmin_ind = k;
681 }
682 }
683
684 /* Replace a state if best from second set outperforms worst in first set */
685 if( RDmin_Q10 < RDmax_Q10 ) {
686 silk_memcpy( ( (opus_int32 *)&psDelDec[ RDmax_ind ] ) + i,
687 ( (opus_int32 *)&psDelDec[ RDmin_ind ] ) + i, sizeof( NSQ_del_dec_struct ) - i * sizeof( opus_int32) );
688 silk_memcpy( &psSampleState[ RDmax_ind ][ 0 ], &psSampleState[ RDmin_ind ][ 1 ], sizeof( NSQ_sample_struct ) );
689 }
690
691 /* Write samples from winner to output and long-term filter states */
692 psDD = &psDelDec[ Winner_ind ];
693 if( subfr > 0 || i >= decisionDelay ) {
694 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
695 xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
696 silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], delayedGain_Q10[ last_smple_idx ] ), 8 ) );
697 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDD->Shape_Q14[ last_smple_idx ];
698 sLTP_Q15[ NSQ->sLTP_buf_idx - decisionDelay ] = psDD->Pred_Q15[ last_smple_idx ];
699 }
700 NSQ->sLTP_shp_buf_idx++;
701 NSQ->sLTP_buf_idx++;
702
703 /* Update states */
704 for( k = 0; k < nStatesDelayedDecision; k++ ) {
705 psDD = &psDelDec[ k ];
706 psSS = &psSampleState[ k ][ 0 ];
707 psDD->LF_AR_Q14 = psSS->LF_AR_Q14;
708 psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;
709 psDD->Xq_Q14[ *smpl_buf_idx ] = psSS->xq_Q14;
710 psDD->Q_Q10[ *smpl_buf_idx ] = psSS->Q_Q10;
711 psDD->Pred_Q15[ *smpl_buf_idx ] = silk_LSHIFT32( psSS->LPC_exc_Q14, 1 );
712 psDD->Shape_Q14[ *smpl_buf_idx ] = psSS->sLTP_shp_Q14;
713 psDD->Seed = silk_ADD32_ovflw( psDD->Seed, silk_RSHIFT_ROUND( psSS->Q_Q10, 10 ) );
714 psDD->RandState[ *smpl_buf_idx ] = psDD->Seed;
715 psDD->RD_Q10 = psSS->RD_Q10;
716 }
717 delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10;
718 }
719 /* Update LPC states */
720 for( k = 0; k < nStatesDelayedDecision; k++ ) {
721 psDD = &psDelDec[ k ];
722 silk_memcpy( psDD->sLPC_Q14, &psDD->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
723 }
724 RESTORE_STACK;
725 }
726
silk_nsq_del_dec_scale_states_sse4_1(const silk_encoder_state * psEncC,silk_nsq_state * NSQ,NSQ_del_dec_struct psDelDec[],const opus_int32 x_Q3[],opus_int32 x_sc_Q10[],const opus_int16 sLTP[],opus_int32 sLTP_Q15[],opus_int subfr,opus_int nStatesDelayedDecision,const opus_int LTP_scale_Q14,const opus_int32 Gains_Q16[MAX_NB_SUBFR],const opus_int pitchL[MAX_NB_SUBFR],const opus_int signal_type,const opus_int decisionDelay)727 static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
728 const silk_encoder_state *psEncC, /* I Encoder State */
729 silk_nsq_state *NSQ, /* I/O NSQ state */
730 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */
731 const opus_int32 x_Q3[], /* I Input in Q3 */
732 opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */
733 const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */
734 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
735 opus_int subfr, /* I Subframe number */
736 opus_int nStatesDelayedDecision, /* I Number of del dec states */
737 const opus_int LTP_scale_Q14, /* I LTP state scaling */
738 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
739 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
740 const opus_int signal_type, /* I Signal type */
741 const opus_int decisionDelay /* I Decision delay */
742 )
743 {
744 opus_int i, k, lag;
745 opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
746 NSQ_del_dec_struct *psDD;
747 __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
748
749 lag = pitchL[ subfr ];
750 inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
751
752 silk_assert( inv_gain_Q31 != 0 );
753
754 /* Calculate gain adjustment factor */
755 if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
756 gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
757 } else {
758 gain_adj_Q16 = (opus_int32)1 << 16;
759 }
760
761 /* Scale input */
762 inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
763
764 /* prepare inv_gain_Q23 in packed 4 32-bits */
765 xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
766
767 for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
768 xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
769 /* equal shift right 4 bytes*/
770 xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
771
772 xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
773 xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
774
775 xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
776 xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
777
778 xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
779
780 _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 );
781 }
782
783 for( ; i < psEncC->subfr_length; i++ ) {
784 x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
785 }
786
787 /* Save inverse gain */
788 NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
789
790 /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
791 if( NSQ->rewhite_flag ) {
792 if( subfr == 0 ) {
793 /* Do LTP downscaling */
794 inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
795 }
796 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
797 silk_assert( i < MAX_FRAME_LENGTH );
798 sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] );
799 }
800 }
801
802 /* Adjust for changing gain */
803 if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
804 /* Scale long-term shaping state */
805 {
806 __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
807
808 /* prepare gain_adj_Q16 in packed 4 32-bits */
809 xmm_gain_adj_Q16 = _mm_set1_epi32( gain_adj_Q16 );
810
811 for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
812 {
813 xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
814 /* equal shift right 4 bytes*/
815 xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
816
817 xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 );
818 xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 );
819
820 xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 );
821 xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 );
822
823 xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );
824
825 _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
826 }
827
828 for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
829 NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
830 }
831
832 /* Scale long-term prediction state */
833 if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
834 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
835 sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
836 }
837 }
838
839 for( k = 0; k < nStatesDelayedDecision; k++ ) {
840 psDD = &psDelDec[ k ];
841
842 /* Scale scalar states */
843 psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 );
844
845 /* Scale short-term prediction and shaping states */
846 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
847 psDD->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sLPC_Q14[ i ] );
848 }
849 for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
850 psDD->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sAR2_Q14[ i ] );
851 }
852 for( i = 0; i < DECISION_DELAY; i++ ) {
853 psDD->Pred_Q15[ i ] = silk_SMULWW( gain_adj_Q16, psDD->Pred_Q15[ i ] );
854 psDD->Shape_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->Shape_Q14[ i ] );
855 }
856 }
857 }
858 }
859 }
860