1 /* Copyright (c) 2014, Cisco Systems, INC
2 Written by XiangMingZhu WeiZhou MinPeng YanWang
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #include <xmmintrin.h>
33 #include <emmintrin.h>
34 #include <smmintrin.h>
35 #include "main.h"
36 #include "celt/x86/x86cpu.h"
37 #include "stack_alloc.h"
38
39 static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
40 const silk_encoder_state *psEncC, /* I Encoder State */
41 silk_nsq_state *NSQ, /* I/O NSQ state */
42 const opus_int32 x_Q3[], /* I input in Q3 */
43 opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
44 const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
45 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
46 opus_int subfr, /* I subframe number */
47 const opus_int LTP_scale_Q14, /* I */
48 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
49 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
50 const opus_int signal_type /* I Signal type */
51 );
52
53 static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
54 silk_nsq_state *NSQ, /* I/O NSQ state */
55 opus_int signalType, /* I Signal type */
56 const opus_int32 x_sc_Q10[], /* I */
57 opus_int8 pulses[], /* O */
58 opus_int16 xq[], /* O */
59 opus_int32 sLTP_Q15[], /* I/O LTP state */
60 const opus_int16 a_Q12[], /* I Short term prediction coefs */
61 const opus_int16 b_Q14[], /* I Long term prediction coefs */
62 const opus_int16 AR_shp_Q13[], /* I Noise shaping AR coefs */
63 opus_int lag, /* I Pitch lag */
64 opus_int32 HarmShapeFIRPacked_Q14, /* I */
65 opus_int Tilt_Q14, /* I Spectral tilt */
66 opus_int32 LF_shp_Q14, /* I */
67 opus_int32 Gain_Q16, /* I */
68 opus_int offset_Q10, /* I */
69 opus_int length, /* I Input length */
70 opus_int32 table[][4] /* I */
71 );
72
silk_NSQ_sse4_1(const silk_encoder_state * psEncC,silk_nsq_state * NSQ,SideInfoIndices * psIndices,const opus_int32 x_Q3[],opus_int8 pulses[],const opus_int16 PredCoef_Q12[2* MAX_LPC_ORDER],const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],const opus_int16 AR2_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER],const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],const opus_int Tilt_Q14[MAX_NB_SUBFR],const opus_int32 LF_shp_Q14[MAX_NB_SUBFR],const opus_int32 Gains_Q16[MAX_NB_SUBFR],const opus_int pitchL[MAX_NB_SUBFR],const opus_int Lambda_Q10,const opus_int LTP_scale_Q14)73 void silk_NSQ_sse4_1(
74 const silk_encoder_state *psEncC, /* I Encoder State */
75 silk_nsq_state *NSQ, /* I/O NSQ state */
76 SideInfoIndices *psIndices, /* I/O Quantization Indices */
77 const opus_int32 x_Q3[], /* I Prefiltered input signal */
78 opus_int8 pulses[], /* O Quantized pulse signal */
79 const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I Short term prediction coefs */
80 const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */
81 const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */
82 const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */
83 const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */
84 const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */
85 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */
86 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */
87 const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */
88 const opus_int LTP_scale_Q14 /* I LTP state scaling */
89 )
90 {
91 opus_int k, lag, start_idx, LSF_interpolation_flag;
92 const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
93 opus_int16 *pxq;
94 VARDECL( opus_int32, sLTP_Q15 );
95 VARDECL( opus_int16, sLTP );
96 opus_int32 HarmShapeFIRPacked_Q14;
97 opus_int offset_Q10;
98 VARDECL( opus_int32, x_sc_Q10 );
99
100 opus_int32 table[ 64 ][ 4 ];
101 opus_int32 tmp1;
102 opus_int32 q1_Q10, q2_Q10, rd1_Q20, rd2_Q20;
103
104 SAVE_STACK;
105
106 NSQ->rand_seed = psIndices->Seed;
107
108 /* Set unvoiced lag to the previous one, overwrite later for voiced */
109 lag = NSQ->lagPrev;
110
111 silk_assert( NSQ->prev_gain_Q16 != 0 );
112
113 offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
114
115 /* 0 */
116 q1_Q10 = offset_Q10;
117 q2_Q10 = offset_Q10 + ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
118 rd1_Q20 = q1_Q10 * Lambda_Q10;
119 rd2_Q20 = q2_Q10 * Lambda_Q10;
120
121 table[ 32 ][ 0 ] = q1_Q10;
122 table[ 32 ][ 1 ] = q2_Q10;
123 table[ 32 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
124 table[ 32 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
125
126 /* -1 */
127 q1_Q10 = offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
128 q2_Q10 = offset_Q10;
129 rd1_Q20 = - q1_Q10 * Lambda_Q10;
130 rd2_Q20 = q2_Q10 * Lambda_Q10;
131
132 table[ 31 ][ 0 ] = q1_Q10;
133 table[ 31 ][ 1 ] = q2_Q10;
134 table[ 31 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
135 table[ 31 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
136
137 /* > 0 */
138 for (k = 1; k <= 31; k++)
139 {
140 tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );
141
142 q1_Q10 = tmp1 - QUANT_LEVEL_ADJUST_Q10;
143 q2_Q10 = tmp1 - QUANT_LEVEL_ADJUST_Q10 + 1024;
144 rd1_Q20 = q1_Q10 * Lambda_Q10;
145 rd2_Q20 = q2_Q10 * Lambda_Q10;
146
147 table[ 32 + k ][ 0 ] = q1_Q10;
148 table[ 32 + k ][ 1 ] = q2_Q10;
149 table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
150 table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
151 }
152
153 /* < -1 */
154 for (k = -32; k <= -2; k++)
155 {
156 tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );
157
158 q1_Q10 = tmp1 + QUANT_LEVEL_ADJUST_Q10;
159 q2_Q10 = tmp1 + QUANT_LEVEL_ADJUST_Q10 + 1024;
160 rd1_Q20 = - q1_Q10 * Lambda_Q10;
161 rd2_Q20 = - q2_Q10 * Lambda_Q10;
162
163 table[ 32 + k ][ 0 ] = q1_Q10;
164 table[ 32 + k ][ 1 ] = q2_Q10;
165 table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
166 table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
167 }
168
169 if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
170 LSF_interpolation_flag = 0;
171 } else {
172 LSF_interpolation_flag = 1;
173 }
174
175 ALLOC( sLTP_Q15,
176 psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
177 ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
178 ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
179 /* Set up pointers to start of sub frame */
180 NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
181 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
182 pxq = &NSQ->xq[ psEncC->ltp_mem_length ];
183 for( k = 0; k < psEncC->nb_subfr; k++ ) {
184 A_Q12 = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ];
185 B_Q14 = <PCoef_Q14[ k * LTP_ORDER ];
186 AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ];
187
188 /* Noise shape parameters */
189 silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
190 HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
191 HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
192
193 NSQ->rewhite_flag = 0;
194 if( psIndices->signalType == TYPE_VOICED ) {
195 /* Voiced */
196 lag = pitchL[ k ];
197
198 /* Re-whitening */
199 if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
200 /* Rewhiten with new A coefs */
201 start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
202 silk_assert( start_idx > 0 );
203
204 silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
205 A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
206
207 NSQ->rewhite_flag = 1;
208 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
209 }
210 }
211
212 silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );
213
214 if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) )
215 {
216 silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
217 AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ],
218 offset_Q10, psEncC->subfr_length, &(table[32]) );
219 }
220 else
221 {
222 silk_noise_shape_quantizer( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
223 AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
224 offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch );
225 }
226
227 x_Q3 += psEncC->subfr_length;
228 pulses += psEncC->subfr_length;
229 pxq += psEncC->subfr_length;
230 }
231
232 /* Update lagPrev for next frame */
233 NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];
234
235 /* Save quantized speech and noise shaping signals */
236 silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
237 silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
238 RESTORE_STACK;
239 }
240
241 /***********************************/
242 /* silk_noise_shape_quantizer_10_16 */
243 /***********************************/
silk_noise_shape_quantizer_10_16_sse4_1(silk_nsq_state * NSQ,opus_int signalType,const opus_int32 x_sc_Q10[],opus_int8 pulses[],opus_int16 xq[],opus_int32 sLTP_Q15[],const opus_int16 a_Q12[],const opus_int16 b_Q14[],const opus_int16 AR_shp_Q13[],opus_int lag,opus_int32 HarmShapeFIRPacked_Q14,opus_int Tilt_Q14,opus_int32 LF_shp_Q14,opus_int32 Gain_Q16,opus_int offset_Q10,opus_int length,opus_int32 table[][4])244 static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
245 silk_nsq_state *NSQ, /* I/O NSQ state */
246 opus_int signalType, /* I Signal type */
247 const opus_int32 x_sc_Q10[], /* I */
248 opus_int8 pulses[], /* O */
249 opus_int16 xq[], /* O */
250 opus_int32 sLTP_Q15[], /* I/O LTP state */
251 const opus_int16 a_Q12[], /* I Short term prediction coefs */
252 const opus_int16 b_Q14[], /* I Long term prediction coefs */
253 const opus_int16 AR_shp_Q13[], /* I Noise shaping AR coefs */
254 opus_int lag, /* I Pitch lag */
255 opus_int32 HarmShapeFIRPacked_Q14, /* I */
256 opus_int Tilt_Q14, /* I Spectral tilt */
257 opus_int32 LF_shp_Q14, /* I */
258 opus_int32 Gain_Q16, /* I */
259 opus_int offset_Q10, /* I */
260 opus_int length, /* I Input length */
261 opus_int32 table[][4] /* I */
262 )
263 {
264 opus_int i;
265 opus_int32 LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
266 opus_int32 n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;
267 opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
268 opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
269 opus_int32 *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;
270
271 __m128i xmm_tempa, xmm_tempb;
272
273 __m128i xmm_one;
274
275 __m128i psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF;
276 __m128i psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF;
277 __m128i a_Q12_01234567, a_Q12_89ABCDEF;
278
279 __m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;
280 __m128i AR_shp_Q13_76543210;
281
282 shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
283 pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
284 Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );
285
286 /* Set up short term AR state */
287 psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 ];
288
289 sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;
290 xq_Q14 = psLPC_Q14[ 0 ];
291 LTP_pred_Q13 = 0;
292
293 /* load a_Q12 */
294 xmm_one = _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 );
295
296 /* load a_Q12[0] - a_Q12[7] */
297 a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(&a_Q12[ 0 ] ) );
298 /* load a_Q12[ 8 ] - a_Q12[ 15 ] */
299 a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(&a_Q12[ 8 ] ) );
300
301 a_Q12_01234567 = _mm_shuffle_epi8( a_Q12_01234567, xmm_one );
302 a_Q12_89ABCDEF = _mm_shuffle_epi8( a_Q12_89ABCDEF, xmm_one );
303
304 /* load AR_shp_Q13 */
305 AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(&AR_shp_Q13[0] ) );
306
307 /* load psLPC_Q14 */
308 xmm_one = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 );
309
310 xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-16]) );
311 xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-12]) );
312
313 xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
314 xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
315
316 psLPC_Q14_hi_89ABCDEF = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
317 psLPC_Q14_lo_89ABCDEF = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
318
319 xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -8 ]) );
320 xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -4 ]) );
321
322 xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
323 xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
324
325 psLPC_Q14_hi_01234567 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
326 psLPC_Q14_lo_01234567 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
327
328 /* load sAR2_Q14 */
329 xmm_tempa = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 0 ]) ) );
330 xmm_tempb = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 4 ]) ) );
331
332 xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
333 xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
334
335 sAR2_Q14_hi_76543210 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
336 sAR2_Q14_lo_76543210 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
337
338 /* prepare 1 in 8 * 16bit */
339 xmm_one = _mm_set1_epi16(1);
340
341 for( i = 0; i < length; i++ )
342 {
343 /* Short-term prediction */
344 __m128i xmm_hi_07, xmm_hi_8F, xmm_lo_07, xmm_lo_8F;
345
346 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
347 LPC_pred_Q10 = 8; /* silk_RSHIFT( predictLPCOrder, 1 ); */
348
349 /* shift psLPC_Q14 */
350 psLPC_Q14_hi_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF, 2 );
351 psLPC_Q14_lo_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF, 2 );
352
353 psLPC_Q14_hi_01234567 = _mm_srli_si128( psLPC_Q14_hi_01234567, 2 );
354 psLPC_Q14_lo_01234567 = _mm_srli_si128( psLPC_Q14_lo_01234567, 2 );
355
356 psLPC_Q14_hi_01234567 = _mm_insert_epi16( psLPC_Q14_hi_01234567, (xq_Q14 >> 16), 7 );
357 psLPC_Q14_lo_01234567 = _mm_insert_epi16( psLPC_Q14_lo_01234567, (xq_Q14), 7 );
358
359 /* high part, use pmaddwd, results in 4 32-bit */
360 xmm_hi_07 = _mm_madd_epi16( psLPC_Q14_hi_01234567, a_Q12_01234567 );
361 xmm_hi_8F = _mm_madd_epi16( psLPC_Q14_hi_89ABCDEF, a_Q12_89ABCDEF );
362
363 /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed, _mm_srai_epi16(psLPC_Q14_lo_01234567, 15) */
364 xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_01234567 );
365 xmm_tempb = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_89ABCDEF );
366
367 xmm_tempa = _mm_and_si128( xmm_tempa, a_Q12_01234567 );
368 xmm_tempb = _mm_and_si128( xmm_tempb, a_Q12_89ABCDEF );
369
370 xmm_lo_07 = _mm_mulhi_epi16( psLPC_Q14_lo_01234567, a_Q12_01234567 );
371 xmm_lo_8F = _mm_mulhi_epi16( psLPC_Q14_lo_89ABCDEF, a_Q12_89ABCDEF );
372
373 xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );
374 xmm_lo_8F = _mm_add_epi16( xmm_lo_8F, xmm_tempb );
375
376 xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );
377 xmm_lo_8F = _mm_madd_epi16( xmm_lo_8F, xmm_one );
378
379 /* accumulate */
380 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_hi_8F );
381 xmm_lo_07 = _mm_add_epi32( xmm_lo_07, xmm_lo_8F );
382
383 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );
384
385 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
386 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );
387
388 LPC_pred_Q10 += _mm_cvtsi128_si32( xmm_hi_07 );
389
390 /* Long-term prediction */
391 if ( opus_likely( signalType == TYPE_VOICED ) ) {
392 /* Unrolled loop */
393 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
394 LTP_pred_Q13 = 2;
395 {
396 __m128i b_Q14_3210, b_Q14_0123, pred_lag_ptr_0123;
397
398 b_Q14_3210 = OP_CVTEPI16_EPI32_M64( b_Q14 );
399 b_Q14_0123 = _mm_shuffle_epi32( b_Q14_3210, 0x1B );
400
401 /* loaded: [0] [-1] [-2] [-3] */
402 pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );
403 /* shuffle to [-3] [-2] [-1] [0] and to new xmm */
404 xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, 0x1B );
405 /*64-bit multiply, a[2] * b[-2], a[0] * b[0] */
406 xmm_tempa = _mm_mul_epi32( xmm_tempa, b_Q14_3210 );
407 /* right shift 2 bytes (16 bits), zero extended */
408 xmm_tempa = _mm_srli_si128( xmm_tempa, 2 );
409
410 /* a[1] * b[-1], a[3] * b[-3] */
411 pred_lag_ptr_0123 = _mm_mul_epi32( pred_lag_ptr_0123, b_Q14_0123 );
412 pred_lag_ptr_0123 = _mm_srli_si128( pred_lag_ptr_0123, 2 );
413
414 pred_lag_ptr_0123 = _mm_add_epi32( pred_lag_ptr_0123, xmm_tempa );
415 /* equal shift right 8 bytes*/
416 xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, _MM_SHUFFLE( 0, 0, 3, 2 ) );
417 xmm_tempa = _mm_add_epi32( xmm_tempa, pred_lag_ptr_0123 );
418
419 LTP_pred_Q13 += _mm_cvtsi128_si32( xmm_tempa );
420
421 LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
422 pred_lag_ptr++;
423 }
424 }
425
426 /* Noise shape feedback */
427 NSQ->sAR2_Q14[ 9 ] = NSQ->sAR2_Q14[ 8 ];
428 NSQ->sAR2_Q14[ 8 ] = _mm_cvtsi128_si32( _mm_srli_si128(_mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ), 12 ) );
429
430 sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );
431 sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );
432
433 sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 );
434 sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14), 0 );
435
436 /* high part, use pmaddwd, results in 4 32-bit */
437 xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );
438
439 /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed,_mm_srai_epi16(sAR2_Q14_lo_76543210, 15) */
440 xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), sAR2_Q14_lo_76543210 );
441 xmm_tempa = _mm_and_si128( xmm_tempa, AR_shp_Q13_76543210 );
442
443 xmm_lo_07 = _mm_mulhi_epi16( sAR2_Q14_lo_76543210, AR_shp_Q13_76543210 );
444 xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );
445
446 xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );
447
448 /* accumulate */
449 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );
450
451 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
452 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );
453
454 n_AR_Q12 = 5 + _mm_cvtsi128_si32( xmm_hi_07 );
455
456 n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 8 ], AR_shp_Q13[ 8 ] );
457 n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 9 ], AR_shp_Q13[ 9 ] );
458
459 n_AR_Q12 = silk_LSHIFT32( n_AR_Q12, 1 ); /* Q11 -> Q12 */
460 n_AR_Q12 = silk_SMLAWB( n_AR_Q12, sLF_AR_shp_Q14, Tilt_Q14 );
461
462 n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
463 n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );
464
465 silk_assert( lag > 0 || signalType != TYPE_VOICED );
466
467 /* Combine prediction and noise shaping signals */
468 tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 ); /* Q12 */
469 tmp1 = silk_SUB32( tmp1, n_LF_Q12 ); /* Q12 */
470 if( lag > 0 ) {
471 /* Symmetric, packed FIR coefficients */
472 n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
473 n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
474 n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
475 shp_lag_ptr++;
476
477 tmp2 = silk_SUB32( LTP_pred_Q13, n_LTP_Q13 ); /* Q13 */
478 tmp1 = silk_ADD_LSHIFT32( tmp2, tmp1, 1 ); /* Q13 */
479 tmp1 = silk_RSHIFT_ROUND( tmp1, 3 ); /* Q10 */
480 } else {
481 tmp1 = silk_RSHIFT_ROUND( tmp1, 2 ); /* Q10 */
482 }
483
484 r_Q10 = silk_SUB32( x_sc_Q10[ i ], tmp1 ); /* residual error Q10 */
485
486 /* Generate dither */
487 NSQ->rand_seed = silk_RAND( NSQ->rand_seed );
488
489 /* Flip sign depending on dither */
490 tmp2 = -r_Q10;
491 if ( NSQ->rand_seed < 0 ) r_Q10 = tmp2;
492
493 r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );
494
495 /* Find two quantization level candidates and measure their rate-distortion */
496 q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
497 q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
498
499 q1_Q10 = table[q1_Q0][0];
500 q2_Q10 = table[q1_Q0][1];
501
502 if (r_Q10 * table[q1_Q0][2] - table[q1_Q0][3] < 0)
503 {
504 q1_Q10 = q2_Q10;
505 }
506
507 pulses[ i ] = (opus_int8)silk_RSHIFT_ROUND( q1_Q10, 10 );
508
509 /* Excitation */
510 exc_Q14 = silk_LSHIFT( q1_Q10, 4 );
511
512 tmp2 = -exc_Q14;
513 if ( NSQ->rand_seed < 0 ) exc_Q14 = tmp2;
514
515 /* Add predictions */
516 LPC_exc_Q14 = silk_ADD_LSHIFT32( exc_Q14, LTP_pred_Q13, 1 );
517 xq_Q14 = silk_ADD_LSHIFT32( LPC_exc_Q14, LPC_pred_Q10, 4 );
518
519 /* Update states */
520 psLPC_Q14++;
521 *psLPC_Q14 = xq_Q14;
522 sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 );
523
524 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
525 sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
526 NSQ->sLTP_shp_buf_idx++;
527 NSQ->sLTP_buf_idx++;
528
529 /* Make dither dependent on quantized signal */
530 NSQ->rand_seed = silk_ADD32_ovflw( NSQ->rand_seed, pulses[ i ] );
531 }
532
533 NSQ->sLF_AR_shp_Q14 = sLF_AR_shp_Q14;
534
535 /* Scale XQ back to normal level before saving */
536 psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH ];
537
538 /* write back sAR2_Q14 */
539 xmm_tempa = _mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
540 xmm_tempb = _mm_unpacklo_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
541 _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa );
542 _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb );
543
544 /* xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); */
545 {
546 __m128i xmm_Gain_Q10;
547 __m128i xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, xmm_xq_Q14_7654, xmm_xq_Q14_x7x5;
548
549 /* prepare (1 << 7) in packed 4 32-bits */
550 xmm_tempa = _mm_set1_epi32( (1 << 7) );
551
552 /* prepare Gain_Q10 in packed 4 32-bits */
553 xmm_Gain_Q10 = _mm_set1_epi32( Gain_Q10 );
554
555 /* process xq */
556 for (i = 0; i < length - 7; i += 8)
557 {
558 xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 0 ] ) ) );
559 xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 4 ] ) ) );
560
561 /* equal shift right 4 bytes*/
562 xmm_xq_Q14_x3x1 = _mm_shuffle_epi32( xmm_xq_Q14_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
563 /* equal shift right 4 bytes*/
564 xmm_xq_Q14_x7x5 = _mm_shuffle_epi32( xmm_xq_Q14_7654, _MM_SHUFFLE( 0, 3, 2, 1 ) );
565
566 xmm_xq_Q14_3210 = _mm_mul_epi32( xmm_xq_Q14_3210, xmm_Gain_Q10 );
567 xmm_xq_Q14_x3x1 = _mm_mul_epi32( xmm_xq_Q14_x3x1, xmm_Gain_Q10 );
568 xmm_xq_Q14_7654 = _mm_mul_epi32( xmm_xq_Q14_7654, xmm_Gain_Q10 );
569 xmm_xq_Q14_x7x5 = _mm_mul_epi32( xmm_xq_Q14_x7x5, xmm_Gain_Q10 );
570
571 xmm_xq_Q14_3210 = _mm_srli_epi64( xmm_xq_Q14_3210, 16 );
572 xmm_xq_Q14_x3x1 = _mm_slli_epi64( xmm_xq_Q14_x3x1, 16 );
573 xmm_xq_Q14_7654 = _mm_srli_epi64( xmm_xq_Q14_7654, 16 );
574 xmm_xq_Q14_x7x5 = _mm_slli_epi64( xmm_xq_Q14_x7x5, 16 );
575
576 xmm_xq_Q14_3210 = _mm_blend_epi16( xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, 0xCC );
577 xmm_xq_Q14_7654 = _mm_blend_epi16( xmm_xq_Q14_7654, xmm_xq_Q14_x7x5, 0xCC );
578
579 /* silk_RSHIFT_ROUND(xq, 8) */
580 xmm_xq_Q14_3210 = _mm_add_epi32( xmm_xq_Q14_3210, xmm_tempa );
581 xmm_xq_Q14_7654 = _mm_add_epi32( xmm_xq_Q14_7654, xmm_tempa );
582
583 xmm_xq_Q14_3210 = _mm_srai_epi32( xmm_xq_Q14_3210, 8 );
584 xmm_xq_Q14_7654 = _mm_srai_epi32( xmm_xq_Q14_7654, 8 );
585
586 /* silk_SAT16 */
587 xmm_xq_Q14_3210 = _mm_packs_epi32( xmm_xq_Q14_3210, xmm_xq_Q14_7654 );
588
589 /* save to xq */
590 _mm_storeu_si128( (__m128i *)(&xq[ i ] ), xmm_xq_Q14_3210 );
591 }
592 }
593 for ( ; i < length; i++)
594 {
595 xq[i] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) );
596 }
597
598 /* Update LPC synth buffer */
599 silk_memcpy( NSQ->sLPC_Q14, &NSQ->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
600 }
601
silk_nsq_scale_states_sse4_1(const silk_encoder_state * psEncC,silk_nsq_state * NSQ,const opus_int32 x_Q3[],opus_int32 x_sc_Q10[],const opus_int16 sLTP[],opus_int32 sLTP_Q15[],opus_int subfr,const opus_int LTP_scale_Q14,const opus_int32 Gains_Q16[MAX_NB_SUBFR],const opus_int pitchL[MAX_NB_SUBFR],const opus_int signal_type)602 static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
603 const silk_encoder_state *psEncC, /* I Encoder State */
604 silk_nsq_state *NSQ, /* I/O NSQ state */
605 const opus_int32 x_Q3[], /* I input in Q3 */
606 opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */
607 const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */
608 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */
609 opus_int subfr, /* I subframe number */
610 const opus_int LTP_scale_Q14, /* I */
611 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */
612 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */
613 const opus_int signal_type /* I Signal type */
614 )
615 {
616 opus_int i, lag;
617 opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
618 __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;
619
620 lag = pitchL[ subfr ];
621 inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
622 silk_assert( inv_gain_Q31 != 0 );
623
624 /* Calculate gain adjustment factor */
625 if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
626 gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
627 } else {
628 gain_adj_Q16 = (opus_int32)1 << 16;
629 }
630
631 /* Scale input */
632 inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );
633
634 /* prepare inv_gain_Q23 in packed 4 32-bits */
635 xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);
636
637 for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
638 xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );
639
640 /* equal shift right 4 bytes*/
641 xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
642
643 xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
644 xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );
645
646 xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
647 xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );
648
649 xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );
650
651 _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 );
652 }
653
654 for( ; i < psEncC->subfr_length; i++ ) {
655 x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
656 }
657
658 /* Save inverse gain */
659 NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
660
661 /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
662 if( NSQ->rewhite_flag ) {
663 if( subfr == 0 ) {
664 /* Do LTP downscaling */
665 inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
666 }
667 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
668 silk_assert( i < MAX_FRAME_LENGTH );
669 sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] );
670 }
671 }
672
673 /* Adjust for changing gain */
674 if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
675 /* Scale long-term shaping state */
676 __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
677
678 /* prepare gain_adj_Q16 in packed 4 32-bits */
679 xmm_gain_adj_Q16 = _mm_set1_epi32(gain_adj_Q16);
680
681 for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
682 {
683 xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
684 /* equal shift right 4 bytes*/
685 xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
686
687 xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 );
688 xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 );
689
690 xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 );
691 xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 );
692
693 xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );
694
695 _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
696 }
697
698 for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
699 NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
700 }
701
702 /* Scale long-term prediction state */
703 if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
704 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
705 sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
706 }
707 }
708
709 NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 );
710
711 /* Scale short-term prediction and shaping states */
712 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
713 NSQ->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLPC_Q14[ i ] );
714 }
715 for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
716 NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] );
717 }
718 }
719 }
720