/***********************************************************************
Copyright (c) 2017 Google Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name of Internet Society, IETF or IETF Trust, nor the
names of specific contributors, may be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <arm_neon.h>
#ifdef OPUS_CHECK_ASM
# include <string.h>
#endif
#include "main.h"
#include "stack_alloc.h"

/* The NEON intrinsics optimization can currently parallelize at most 4 delayed decision states. */
/* If there are more states, the C function is called instead; supporting them here would require expanding this optimization. */
#define NEON_MAX_DEL_DEC_STATES 4

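/* Unlike the C code's per-state NSQ_del_dec_struct, these structs are transposed: the state index */
/* is the innermost dimension, so the same field of all 4 delayed decision states is contiguous in */
/* memory and can be loaded into one int32x4_t, one state per lane.                                */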
typedef struct {
    opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 RandState[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Q_Q10[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Xq_Q14[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Pred_Q15[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Shape_Q14[ DECISION_DELAY ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ][ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 LF_AR_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Diff_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Seed[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 SeedInit[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 RD_Q10[ NEON_MAX_DEL_DEC_STATES ];
} NSQ_del_decs_struct;

typedef struct {
    opus_int32 Q_Q10[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 RD_Q10[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 xq_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 LF_AR_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 Diff_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 sLTP_shp_Q14[ NEON_MAX_DEL_DEC_STATES ];
    opus_int32 LPC_exc_Q14[ NEON_MAX_DEL_DEC_STATES ];
} NSQ_samples_struct;

static OPUS_INLINE void silk_nsq_del_dec_scale_states_neon(
    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
    silk_nsq_state *NSQ,                            /* I/O  NSQ state                           */
    NSQ_del_decs_struct psDelDec[],                 /* I/O  Delayed decision states             */
    const opus_int16 x16[],                         /* I    Input                               */
    opus_int32 x_sc_Q10[],                          /* O    Input scaled with 1/Gain in Q10     */
    const opus_int16 sLTP[],                        /* I    Re-whitened LTP state in Q0         */
    opus_int32 sLTP_Q15[],                          /* O    LTP state matching scaled input     */
    opus_int subfr,                                 /* I    Subframe number                     */
    const opus_int LTP_scale_Q14,                   /* I    LTP state scaling                   */
    const opus_int32 Gains_Q16[ MAX_NB_SUBFR ],     /* I                                        */
    const opus_int pitchL[ MAX_NB_SUBFR ],          /* I    Pitch lag                           */
    const opus_int signal_type,                     /* I    Signal type                         */
    const opus_int decisionDelay                    /* I    Decision delay                      */
);

/******************************************/
/* Noise shape quantizer for one subframe */
/******************************************/
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
    silk_nsq_state *NSQ,                            /* I/O  NSQ state                           */
    NSQ_del_decs_struct psDelDec[],                 /* I/O  Delayed decision states             */
    opus_int signalType,                            /* I    Signal type                         */
    const opus_int32 x_Q10[],                       /* I                                        */
    opus_int8 pulses[],                             /* O                                        */
    opus_int16 xq[],                                /* O                                        */
    opus_int32 sLTP_Q15[],                          /* I/O  LTP filter state                    */
    opus_int32 delayedGain_Q10[],                   /* I/O  Gain delay buffer                   */
    const opus_int16 a_Q12[],                       /* I    Short term prediction coefs         */
    const opus_int16 b_Q14[],                       /* I    Long term prediction coefs          */
    const opus_int16 AR_shp_Q13[],                  /* I    Noise shaping coefs                 */
    opus_int lag,                                   /* I    Pitch lag                           */
    opus_int32 HarmShapeFIRPacked_Q14,              /* I                                        */
    opus_int Tilt_Q14,                              /* I    Spectral tilt                       */
    opus_int32 LF_shp_Q14,                          /* I                                        */
    opus_int32 Gain_Q16,                            /* I                                        */
    opus_int Lambda_Q10,                            /* I                                        */
    opus_int offset_Q10,                            /* I                                        */
    opus_int length,                                /* I    Input length                        */
    opus_int subfr,                                 /* I    Subframe number                     */
    opus_int shapingLPCOrder,                       /* I    Shaping LPC filter order            */
    opus_int predictLPCOrder,                       /* I    Prediction filter order             */
    opus_int warping_Q16,                           /* I                                        */
    opus_int nStatesDelayedDecision,                /* I    Number of states in decision tree   */
    opus_int *smpl_buf_idx,                         /* I/O  Index to newest samples in buffers  */
    opus_int decisionDelay                          /* I                                        */
);

static OPUS_INLINE void copy_winner_state_kernel(
    const NSQ_del_decs_struct *psDelDec,
    const opus_int offset,
    const opus_int last_smple_idx,
    const opus_int Winner_ind,
    const int32x2_t gain_lo_s32x2,
    const int32x2_t gain_hi_s32x2,
    const int32x4_t shift_s32x4,
    int32x4_t t0_s32x4,
    int32x4_t t1_s32x4,
    opus_int8 *const pulses,
    opus_int16 *pxq,
    silk_nsq_state *NSQ
)
{
    int16x8_t t_s16x8;
    int32x4_t o0_s32x4, o1_s32x4;

    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
    t_s16x8 = vcombine_s16( vrshrn_n_s32( t0_s32x4, 10 ), vrshrn_n_s32( t1_s32x4, 10 ) );
    vst1_s8( &pulses[ offset ], vmovn_s16( t_s16x8 ) );

    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
    o0_s32x4 = vqdmulhq_lane_s32( t0_s32x4, gain_lo_s32x2, 0 );
    o1_s32x4 = vqdmulhq_lane_s32( t1_s32x4, gain_lo_s32x2, 0 );
    o0_s32x4 = vmlaq_lane_s32( o0_s32x4, t0_s32x4, gain_hi_s32x2, 0 );
    o1_s32x4 = vmlaq_lane_s32( o1_s32x4, t1_s32x4, gain_hi_s32x2, 0 );
    o0_s32x4 = vrshlq_s32( o0_s32x4, shift_s32x4 );
    o1_s32x4 = vrshlq_s32( o1_s32x4, shift_s32x4 );
    vst1_s16( &pxq[ offset + 0 ], vqmovn_s32( o0_s32x4 ) );
    vst1_s16( &pxq[ offset + 4 ], vqmovn_s32( o1_s32x4 ) );

    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
    t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
    t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
    vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 0 ], t0_s32x4 );
    vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 4 ], t1_s32x4 );
}

static OPUS_INLINE void copy_winner_state(
    const NSQ_del_decs_struct *psDelDec,
    const opus_int decisionDelay,
    const opus_int smpl_buf_idx,
    const opus_int Winner_ind,
    const opus_int32 gain,
    const opus_int32 shift,
    opus_int8 *const pulses,
    opus_int16 *pxq,
    silk_nsq_state *NSQ
)
{
    opus_int i, last_smple_idx;
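    /* silk_SMULWW( x, gain ) = ( x * (opus_int64)gain ) >> 16 is vectorized by splitting the gain:  */
    /* vqdmulhq_lane_s32( x, ( gain & 0xFFFF ) << 15, 0 ) yields ( x * ( gain & 0xFFFF ) ) >> 16 via */
    /* the doubling high-half multiply, and vmlaq_lane_s32 then adds x * ( gain >> 16 ).             */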
    const int32x2_t gain_lo_s32x2 = vdup_n_s32( silk_LSHIFT32( gain & 0x0000FFFF, 15 ) );
    const int32x2_t gain_hi_s32x2 = vdup_n_s32( gain >> 16 );
    const int32x4_t shift_s32x4 = vdupq_n_s32( -shift );
    int32x4_t t0_s32x4, t1_s32x4;

    t0_s32x4 = t1_s32x4 = vdupq_n_s32( 0 ); /* initialization */
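    /* Reduce last_smple_idx modulo DECISION_DELAY without a division: the sum below never reaches */
    /* 3 * DECISION_DELAY, so two conditional subtractions are enough.                             */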
    last_smple_idx = smpl_buf_idx + decisionDelay - 1 + DECISION_DELAY;
    if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
    if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;

    for( i = 0; ( i < ( decisionDelay - 7 ) ) && ( last_smple_idx >= 7 ); i += 8, last_smple_idx -= 8 ) {
        copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ );
    }
    for( ; ( i < decisionDelay ) && ( last_smple_idx >= 0 ); i++, last_smple_idx-- ) {
        pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) );
        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
    }

    last_smple_idx += DECISION_DELAY;
    for( ; i < ( decisionDelay - 7 ); i++, last_smple_idx-- ) {
        copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ );
    }
    for( ; i < decisionDelay; i++, last_smple_idx-- ) {
        pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) );
        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
    }
}

void silk_NSQ_del_dec_neon(
    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
    silk_nsq_state *NSQ,                            /* I/O  NSQ state                           */
    SideInfoIndices *psIndices,                     /* I/O  Quantization Indices                */
    const opus_int16 x16[],                         /* I    Input                               */
    opus_int8 pulses[],                             /* O    Quantized pulse signal              */
    const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ],             /* I  Short term prediction coefs */
    const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],       /* I  Long term prediction coefs  */
    const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ],  /* I  Noise shaping coefs         */
    const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ],               /* I  Long term shaping coefs     */
    const opus_int Tilt_Q14[ MAX_NB_SUBFR ],        /* I    Spectral tilt                       */
    const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ],    /* I    Low frequency shaping coefs         */
    const opus_int32 Gains_Q16[ MAX_NB_SUBFR ],     /* I    Quantization step sizes             */
    const opus_int pitchL[ MAX_NB_SUBFR ],          /* I    Pitch lags                          */
    const opus_int Lambda_Q10,                      /* I    Rate/distortion tradeoff            */
    const opus_int LTP_scale_Q14                    /* I    LTP state scaling                   */
)
{
#ifdef OPUS_CHECK_ASM
    silk_nsq_state NSQ_c;
    SideInfoIndices psIndices_c;
    opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
    const opus_int8 *const pulses_a = pulses;

    ( void )pulses_a;
    silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
    silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
    silk_memcpy( pulses_c, pulses, sizeof( pulses_c ) );
    silk_NSQ_del_dec_c( psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
        pitchL, Lambda_Q10, LTP_scale_Q14 );
#endif

    /* The optimization parallelizes the different delayed decision states. */
    if( ( psEncC->nStatesDelayedDecision > NEON_MAX_DEL_DEC_STATES ) || ( psEncC->nStatesDelayedDecision <= 2 ) ) {
        /* The NEON intrinsics optimization can currently parallelize at most 4 delayed decision states; */
        /* with more states the C function is called, and supporting them would require expanding this    */
        /* optimization. With fewer than 3 states this optimization carries a penalty, so the C function  */
        /* is called as well. For exactly 2 states it would be better to specialize another structure,    */
        /* NSQ_del_dec2_struct, and optimize with shorter NEON registers. (Low priority)                  */
        silk_NSQ_del_dec_c( psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14,
            Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14 );
    } else {
        opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
        opus_int smpl_buf_idx, decisionDelay;
        const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
        opus_int16 *pxq;
        VARDECL( opus_int32, sLTP_Q15 );
        VARDECL( opus_int16, sLTP );
        opus_int32 HarmShapeFIRPacked_Q14;
        opus_int offset_Q10;
        opus_int32 RDmin_Q10, Gain_Q10;
        VARDECL( opus_int32, x_sc_Q10 );
        VARDECL( opus_int32, delayedGain_Q10 );
        VARDECL( NSQ_del_decs_struct, psDelDec );
        int32x4_t t_s32x4;
        SAVE_STACK;

        /* Set unvoiced lag to the previous one, overwrite later for voiced */
        lag = NSQ->lagPrev;

        silk_assert( NSQ->prev_gain_Q16 != 0 );

        /* Initialize delayed decision states */
        ALLOC( psDelDec, 1, NSQ_del_decs_struct );
        /* Only RandState and RD_Q10 need to be initialized to 0. */
        silk_memset( psDelDec->RandState, 0, sizeof( psDelDec->RandState ) );
        vst1q_s32( psDelDec->RD_Q10, vdupq_n_s32( 0 ) );

        for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) {
            psDelDec->SeedInit[ k ] = psDelDec->Seed[ k ] = ( k + psIndices->Seed ) & 3;
        }
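        /* Broadcast the common scalar NSQ state into all 4 lanes with vld1q_dup_s32, so every */
        /* delayed decision state starts from the same filter history.                         */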
        vst1q_s32( psDelDec->LF_AR_Q14, vld1q_dup_s32( &NSQ->sLF_AR_shp_Q14 ) );
        vst1q_s32( psDelDec->Diff_Q14, vld1q_dup_s32( &NSQ->sDiff_shp_Q14 ) );
        vst1q_s32( psDelDec->Shape_Q14[ 0 ], vld1q_dup_s32( &NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ] ) );
        for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
            vst1q_s32( psDelDec->sLPC_Q14[ i ], vld1q_dup_s32( &NSQ->sLPC_Q14[ i ] ) );
        }
        for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i++ ) {
            vst1q_s32( psDelDec->sAR2_Q14[ i ], vld1q_dup_s32( &NSQ->sAR2_Q14[ i ] ) );
        }

        offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
        smpl_buf_idx = 0; /* index of oldest samples */

        decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length );

        /* For voiced frames limit the decision delay to lower than the pitch lag */
        if( psIndices->signalType == TYPE_VOICED ) {
            opus_int pitch_min = pitchL[ 0 ];
            for( k = 1; k < psEncC->nb_subfr; k++ ) {
                pitch_min = silk_min_int( pitch_min, pitchL[ k ] );
            }
            decisionDelay = silk_min_int( decisionDelay, pitch_min - LTP_ORDER / 2 - 1 );
        } else {
            if( lag > 0 ) {
                decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 );
            }
        }

        if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
            LSF_interpolation_flag = 0;
        } else {
            LSF_interpolation_flag = 1;
        }

        ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
        ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
        ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
        ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
        /* Set up pointers to start of sub frame */
        pxq = &NSQ->xq[ psEncC->ltp_mem_length ];
        NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
        NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
        subfr = 0;
        for( k = 0; k < psEncC->nb_subfr; k++ ) {
            A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
            B_Q14 = &LTPCoef_Q14[ k * LTP_ORDER ];
            AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];

            /* Noise shape parameters */
            silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
            HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
            HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
            NSQ->rewhite_flag = 0;
            if( psIndices->signalType == TYPE_VOICED ) {
                /* Voiced */
                lag = pitchL[ k ];

                /* Re-whitening */
                if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
                    if( k == 2 ) {
                        /* RESET DELAYED DECISIONS */
                        /* Find winner */
                        int32x4_t RD_Q10_s32x4;
                        RDmin_Q10 = psDelDec->RD_Q10[ 0 ];
                        Winner_ind = 0;
                        for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) {
                            if( psDelDec->RD_Q10[ i ] < RDmin_Q10 ) {
                                RDmin_Q10 = psDelDec->RD_Q10[ i ];
                                Winner_ind = i;
                            }
                        }
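                        /* Knock the winner's RD down by a large constant, then add the same constant   */
                        /* back to all four lanes: the winner is unchanged while every other state is   */
                        /* heavily penalized, effectively resetting the delayed decisions to the winner. */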
                        psDelDec->RD_Q10[ Winner_ind ] -= ( silk_int32_MAX >> 4 );
                        RD_Q10_s32x4 = vld1q_s32( psDelDec->RD_Q10 );
                        RD_Q10_s32x4 = vaddq_s32( RD_Q10_s32x4, vdupq_n_s32( silk_int32_MAX >> 4 ) );
                        vst1q_s32( psDelDec->RD_Q10, RD_Q10_s32x4 );

                        /* Copy final part of signals from winner state to output and long-term filter states */
                        copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gains_Q16[ 1 ], 14, pulses, pxq, NSQ );

                        subfr = 0;
                    }

                    /* Rewhiten with new A coefs */
                    start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
                    silk_assert( start_idx > 0 );

                    silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
                        A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );

                    NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
                    NSQ->rewhite_flag = 1;
                }
            }

            silk_nsq_del_dec_scale_states_neon( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
                LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );

            silk_noise_shape_quantizer_del_dec_neon( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
                delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],
                Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
                psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );

            x16 += psEncC->subfr_length;
            pulses += psEncC->subfr_length;
            pxq += psEncC->subfr_length;
        }

        /* Find winner */
        RDmin_Q10 = psDelDec->RD_Q10[ 0 ];
        Winner_ind = 0;
        for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) {
            if( psDelDec->RD_Q10[ k ] < RDmin_Q10 ) {
                RDmin_Q10 = psDelDec->RD_Q10[ k ];
                Winner_ind = k;
            }
        }

        /* Copy final part of signals from winner state to output and long-term filter states */
        psIndices->Seed = psDelDec->SeedInit[ Winner_ind ];
        Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 );
        copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gain_Q10, 8, pulses, pxq, NSQ );

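        /* Transpose back: gather the winner's lane from each interleaved state row, 4 rows at a */
        /* time, and store them contiguously into the scalar NSQ state.                          */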
        t_s32x4 = vdupq_n_s32( 0 ); /* initialization */
        for( i = 0; i < ( NSQ_LPC_BUF_LENGTH - 3 ); i += 4 ) {
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 );
            vst1q_s32( &NSQ->sLPC_Q14[ i ], t_s32x4 );
        }

        for( ; i < NSQ_LPC_BUF_LENGTH; i++ ) {
            NSQ->sLPC_Q14[ i ] = psDelDec->sLPC_Q14[ i ][ Winner_ind ];
        }

        for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) - 3 ); i += 4 ) {
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 );
            t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 );
            vst1q_s32( &NSQ->sAR2_Q14[ i ], t_s32x4 );
        }

        for( ; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i++ ) {
            NSQ->sAR2_Q14[ i ] = psDelDec->sAR2_Q14[ i ][ Winner_ind ];
        }

        /* Update states */
        NSQ->sLF_AR_shp_Q14 = psDelDec->LF_AR_Q14[ Winner_ind ];
        NSQ->sDiff_shp_Q14 = psDelDec->Diff_Q14[ Winner_ind ];
        NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];

        /* Save quantized speech signal */
        silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
        silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
        RESTORE_STACK;
    }

#ifdef OPUS_CHECK_ASM
    silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
    silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
    silk_assert( !memcmp( pulses_c, pulses_a, sizeof( pulses_c ) ) );
#endif
}

/******************************************/
/* Noise shape quantizer for one subframe */
/******************************************/
/* Note: silk_short_prediction_create_arch_coef_neon(), defined in NSQ_neon.h, is actually a plain C */
/* function despite its name. We therefore append "_local" to the NEON function here to avoid confusion. */
static OPUS_INLINE void silk_short_prediction_create_arch_coef_neon_local(opus_int32 *out, const opus_int16 *in, opus_int order)
{
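    /* The Q12 coefficients are stored in reverse order (oldest-sample coefficient first, matching the */
    /* sLPC buffer layout) and pre-shifted left by 15, so that the doubling high-half multiply          */
    /* vqdmulhq_lane_s32( x, coef << 15, lane ) = ( 2 * x * ( coef << 15 ) ) >> 32 = ( x * coef ) >> 16 */
    /* is exactly the product that silk_SMLAWB() accumulates. For order 10 the leading (unused)         */
    /* entries are zero-padded.                                                                         */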
    int16x8_t t_s16x8;
    int32x4_t t0_s32x4, t1_s32x4, t2_s32x4, t3_s32x4;
    silk_assert( order == 10 || order == 16 );

    t_s16x8 = vld1q_s16( in + 0 );                          /* 7 6 5 4  3 2 1 0    */
    t_s16x8 = vrev64q_s16( t_s16x8 );                       /* 4 5 6 7  0 1 2 3    */
    t2_s32x4 = vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ); /* 4 5 6 7             */
    t3_s32x4 = vshll_n_s16( vget_low_s16( t_s16x8 ), 15 );  /* 0 1 2 3             */

    if( order == 16 ) {
        t_s16x8 = vld1q_s16( in + 8 );                          /* F E D C  B A 9 8 */
        t_s16x8 = vrev64q_s16( t_s16x8 );                       /* C D E F  8 9 A B */
        t0_s32x4 = vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ); /* C D E F          */
        t1_s32x4 = vshll_n_s16( vget_low_s16( t_s16x8 ), 15 );  /* 8 9 A B          */
    } else {
        int16x4_t t_s16x4;

        t0_s32x4 = vdupq_n_s32( 0 );                            /* zero zero zero zero */
        t_s16x4 = vld1_s16( in + 6 );                           /* 9 8 7 6             */
        t_s16x4 = vrev64_s16( t_s16x4 );                        /* 6 7 8 9             */
        t1_s32x4 = vshll_n_s16( t_s16x4, 15 );
        t1_s32x4 = vcombine_s32( vget_low_s32( t0_s32x4 ), vget_low_s32( t1_s32x4 ) ); /* 8 9 zero zero */
    }
    vst1q_s32( out +  0, t0_s32x4 );
    vst1q_s32( out +  4, t1_s32x4 );
    vst1q_s32( out +  8, t2_s32x4 );
    vst1q_s32( out + 12, t3_s32x4 );
}

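/* silk_SMLAWB( acc, x, coef ) = acc + ( ( x * (opus_int64)coef ) >> 16 ) is emulated with a coefficient */
/* pre-shifted left by 15: vqdmulhq_lane_s32( x, coef << 15, lane ) = ( 2 * x * ( coef << 15 ) ) >> 32   */
/* = ( x * coef ) >> 16. The two variants below pick the coefficient from lane 0 or lane 1.              */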
static OPUS_INLINE int32x4_t silk_SMLAWB_lane0_neon(
    const int32x4_t out_s32x4,
    const int32x4_t in_s32x4,
    const int32x2_t coef_s32x2
)
{
    return vaddq_s32( out_s32x4, vqdmulhq_lane_s32( in_s32x4, coef_s32x2, 0 ) );
}

static OPUS_INLINE int32x4_t silk_SMLAWB_lane1_neon(
    const int32x4_t out_s32x4,
    const int32x4_t in_s32x4,
    const int32x2_t coef_s32x2
)
{
    return vaddq_s32( out_s32x4, vqdmulhq_lane_s32( in_s32x4, coef_s32x2, 1 ) );
}

/* Note: This function has a different return type than silk_noise_shape_quantizer_short_prediction_neon(). */
/* We therefore append "_local" to the function name here to avoid confusion.                                */
static OPUS_INLINE int32x4_t silk_noise_shape_quantizer_short_prediction_neon_local(const opus_int32 *buf32, const opus_int32 *a_Q12_arch, opus_int order)
{
    const int32x4_t a_Q12_arch0_s32x4 = vld1q_s32( a_Q12_arch + 0 );
    const int32x4_t a_Q12_arch1_s32x4 = vld1q_s32( a_Q12_arch + 4 );
    const int32x4_t a_Q12_arch2_s32x4 = vld1q_s32( a_Q12_arch + 8 );
    const int32x4_t a_Q12_arch3_s32x4 = vld1q_s32( a_Q12_arch + 12 );
    int32x4_t LPC_pred_Q14_s32x4;

    silk_assert( order == 10 || order == 16 );
    /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
    LPC_pred_Q14_s32x4 = vdupq_n_s32( silk_RSHIFT( order, 1 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  0 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  1 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  2 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  3 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  4 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  5 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  6 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  7 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  8 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  9 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 10 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 11 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 12 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch3_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 13 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32( a_Q12_arch3_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 14 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 15 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ) );

    return LPC_pred_Q14_s32x4;
}

static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
    silk_nsq_state *NSQ,                            /* I/O  NSQ state                           */
    NSQ_del_decs_struct psDelDec[],                 /* I/O  Delayed decision states             */
    opus_int signalType,                            /* I    Signal type                         */
    const opus_int32 x_Q10[],                       /* I                                        */
    opus_int8 pulses[],                             /* O                                        */
    opus_int16 xq[],                                /* O                                        */
    opus_int32 sLTP_Q15[],                          /* I/O  LTP filter state                    */
    opus_int32 delayedGain_Q10[],                   /* I/O  Gain delay buffer                   */
    const opus_int16 a_Q12[],                       /* I    Short term prediction coefs         */
    const opus_int16 b_Q14[],                       /* I    Long term prediction coefs          */
    const opus_int16 AR_shp_Q13[],                  /* I    Noise shaping coefs                 */
    opus_int lag,                                   /* I    Pitch lag                           */
    opus_int32 HarmShapeFIRPacked_Q14,              /* I                                        */
    opus_int Tilt_Q14,                              /* I    Spectral tilt                       */
    opus_int32 LF_shp_Q14,                          /* I                                        */
    opus_int32 Gain_Q16,                            /* I                                        */
    opus_int Lambda_Q10,                            /* I                                        */
    opus_int offset_Q10,                            /* I                                        */
    opus_int length,                                /* I    Input length                        */
    opus_int subfr,                                 /* I    Subframe number                     */
    opus_int shapingLPCOrder,                       /* I    Shaping LPC filter order            */
    opus_int predictLPCOrder,                       /* I    Prediction filter order             */
    opus_int warping_Q16,                           /* I                                        */
    opus_int nStatesDelayedDecision,                /* I    Number of states in decision tree   */
    opus_int *smpl_buf_idx,                         /* I/O  Index to newest samples in buffers  */
    opus_int decisionDelay                          /* I                                        */
)
{
    opus_int i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
    opus_int32 Winner_rand_state;
    opus_int32 LTP_pred_Q14, n_LTP_Q14;
    opus_int32 RDmin_Q10, RDmax_Q10;
    opus_int32 Gain_Q10;
    opus_int32 *pred_lag_ptr, *shp_lag_ptr;
    opus_int32 a_Q12_arch[MAX_LPC_ORDER];
    const int32x2_t warping_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( warping_Q16, 16 ) >> 1 );
    const opus_int32 LF_shp_Q29 = silk_LSHIFT32( LF_shp_Q14, 16 ) >> 1;
    opus_int32 AR_shp_Q28[ MAX_SHAPE_LPC_ORDER ];
    const uint32x4_t rand_multiplier_u32x4 = vdupq_n_u32( RAND_MULTIPLIER );
    const uint32x4_t rand_increment_u32x4 = vdupq_n_u32( RAND_INCREMENT );

    VARDECL( NSQ_samples_struct, psSampleState );
    SAVE_STACK;

    silk_assert( nStatesDelayedDecision > 0 );
    silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
    ALLOC( psSampleState, 2, NSQ_samples_struct );

    shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
    Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 );

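    /* Pre-shift the Q13 shaping coefficients to Q28, so that vqdmulhq_lane_s32( x_Q14, coef_Q28, lane ) */
    /* = ( 2 * x * coef ) >> 32 accumulates the shaping products directly in Q11.                        */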
    for( i = 0; i < ( MAX_SHAPE_LPC_ORDER - 7 ); i += 8 ) {
        const int16x8_t t_s16x8 = vld1q_s16( AR_shp_Q13 + i );
        vst1q_s32( AR_shp_Q28 + i + 0, vshll_n_s16( vget_low_s16( t_s16x8 ), 15 ) );
        vst1q_s32( AR_shp_Q28 + i + 4, vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ) );
    }

    for( ; i < MAX_SHAPE_LPC_ORDER; i++ ) {
        AR_shp_Q28[i] = silk_LSHIFT32( AR_shp_Q13[i], 15 );
    }

    silk_short_prediction_create_arch_coef_neon_local( a_Q12_arch, a_Q12, predictLPCOrder );

    for( i = 0; i < length; i++ ) {
        int32x4_t Seed_s32x4, LPC_pred_Q14_s32x4;
        int32x4_t sign_s32x4, tmp1_s32x4, tmp2_s32x4;
        int32x4_t n_AR_Q14_s32x4, n_LF_Q14_s32x4;
        int32x2_t AR_shp_Q28_s32x2;
        int16x4_t r_Q10_s16x4, rr_Q10_s16x4;

        /* Perform common calculations used in all states */

        /* Long-term prediction */
        if( signalType == TYPE_VOICED ) {
            /* Unrolled loop */
            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
            LTP_pred_Q14 = 2;
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ 0 ], b_Q14[ 0 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -1 ], b_Q14[ 1 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -2 ], b_Q14[ 2 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -3 ], b_Q14[ 3 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
            LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 ); /* Q13 -> Q14 */
            pred_lag_ptr++;
        } else {
            LTP_pred_Q14 = 0;
        }

        /* Long-term shaping */
        if( lag > 0 ) {
            /* Symmetric, packed FIR coefficients */
            n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
            n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */
            shp_lag_ptr++;
        } else {
            n_LTP_Q14 = 0;
        }

        /* Generate dither */
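        /* All 4 states advance their LCGs at once: seed = RAND_MULTIPLIER * seed + RAND_INCREMENT */
        /* (the scalar silk_RAND()) is computed with a single vmlaq_u32.                           */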
        Seed_s32x4 = vld1q_s32( psDelDec->Seed );
        Seed_s32x4 = vreinterpretq_s32_u32( vmlaq_u32( rand_increment_u32x4, vreinterpretq_u32_s32( Seed_s32x4 ), rand_multiplier_u32x4 ) );
        vst1q_s32( psDelDec->Seed, Seed_s32x4 );

        /* Short-term prediction */
        LPC_pred_Q14_s32x4 = silk_noise_shape_quantizer_short_prediction_neon_local( psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 16 + i ], a_Q12_arch, predictLPCOrder );
        LPC_pred_Q14_s32x4 = vshlq_n_s32( LPC_pred_Q14_s32x4, 4 ); /* Q10 -> Q14 */

        /* Noise shape feedback */
        /* Output of lowpass section */
        tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->Diff_Q14 ), vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), warping_Q16_s32x2 );
        /* Output of allpass section */
        tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ 1 ] ), tmp2_s32x4 );
        tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), tmp1_s32x4, warping_Q16_s32x2 );
        vst1q_s32( psDelDec->sAR2_Q14[ 0 ], tmp2_s32x4 );
        AR_shp_Q28_s32x2 = vld1_s32( AR_shp_Q28 );
        n_AR_Q14_s32x4 = vaddq_s32( vdupq_n_s32( silk_RSHIFT( shapingLPCOrder, 1 ) ), vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) );

        /* Loop over allpass sections */
        for( j = 2; j < shapingLPCOrder; j += 2 ) {
            /* Output of allpass section */
            tmp2_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4 );
            tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j - 1 ] ), tmp2_s32x4, warping_Q16_s32x2 );
            vst1q_s32( psDelDec->sAR2_Q14[ j - 1 ], tmp1_s32x4 );
            n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) );
            /* Output of allpass section */
            tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 1 ] ), tmp2_s32x4 );
            tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4, warping_Q16_s32x2 );
            vst1q_s32( psDelDec->sAR2_Q14[ j + 0 ], tmp2_s32x4 );
            AR_shp_Q28_s32x2 = vld1_s32( &AR_shp_Q28[ j ] );
            n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) );
        }
        vst1q_s32( psDelDec->sAR2_Q14[ shapingLPCOrder - 1 ], tmp1_s32x4 );
        n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) );
        n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 1 ); /* Q11 -> Q12 */
        n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( Tilt_Q14, 16 ) >> 1 ) ); /* Q12 */
        n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 2 ); /* Q12 -> Q14 */
        n_LF_Q14_s32x4 = vqdmulhq_n_s32( vld1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ] ), LF_shp_Q29 ); /* Q12 */
        n_LF_Q14_s32x4 = vaddq_s32( n_LF_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( LF_shp_Q14 >> 16, 15 ) ) ); /* Q12 */
        n_LF_Q14_s32x4 = vshlq_n_s32( n_LF_Q14_s32x4, 2 ); /* Q12 -> Q14 */

        /* Input minus prediction plus noise feedback */
        /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */
        tmp1_s32x4 = vaddq_s32( n_AR_Q14_s32x4, n_LF_Q14_s32x4 );               /* Q14 */
        tmp2_s32x4 = vaddq_s32( vdupq_n_s32( n_LTP_Q14 ), LPC_pred_Q14_s32x4 ); /* Q13 */
        tmp1_s32x4 = vsubq_s32( tmp2_s32x4, tmp1_s32x4 );                       /* Q13 */
        tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 4 );                             /* Q10 */
        tmp1_s32x4 = vsubq_s32( vdupq_n_s32( x_Q10[ i ] ), tmp1_s32x4 );        /* residual error Q10 */

        /* Flip sign depending on dither */
        sign_s32x4 = vreinterpretq_s32_u32( vcltq_s32( Seed_s32x4, vdupq_n_s32( 0 ) ) );
        tmp1_s32x4 = veorq_s32( tmp1_s32x4, sign_s32x4 );
        tmp1_s32x4 = vsubq_s32( tmp1_s32x4, sign_s32x4 );
        tmp1_s32x4 = vmaxq_s32( tmp1_s32x4, vdupq_n_s32( -( 31 << 10 ) ) );
        tmp1_s32x4 = vminq_s32( tmp1_s32x4, vdupq_n_s32( 30 << 10 ) );
        r_Q10_s16x4 = vmovn_s32( tmp1_s32x4 );

        /* Find two quantization level candidates and measure their rate-distortion */
        {
            int16x4_t q1_Q10_s16x4 = vsub_s16( r_Q10_s16x4, vdup_n_s16( offset_Q10 ) );
            int16x4_t q1_Q0_s16x4 = vshr_n_s16( q1_Q10_s16x4, 10 );
            int16x4_t q2_Q10_s16x4;
            int32x4_t rd1_Q10_s32x4, rd2_Q10_s32x4;
            uint32x4_t t_u32x4;
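            /* The scalar code's if/else ladder that picks the two candidate levels is vectorized */
            /* branchlessly: vceq/vclt build per-lane masks and vbsl selects the alternatives.    */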

            if( Lambda_Q10 > 2048 ) {
                /* For aggressive RDO, the bias becomes more than one pulse. */
                const int rdo_offset = Lambda_Q10 / 2 - 512;
                const uint16x4_t greaterThanRdo = vcgt_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) );
                const uint16x4_t lessThanMinusRdo = vclt_s16( q1_Q10_s16x4, vdup_n_s16( -rdo_offset ) );
                /* If Lambda_Q10 > 32767, then q1_Q0, q1_Q10 and q2_Q10 must change to 32-bit. */
                silk_assert( Lambda_Q10 <= 32767 );

                q1_Q0_s16x4 = vreinterpret_s16_u16( vclt_s16( q1_Q10_s16x4, vdup_n_s16( 0 ) ) );
                q1_Q0_s16x4 = vbsl_s16( greaterThanRdo, vsub_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
                q1_Q0_s16x4 = vbsl_s16( lessThanMinusRdo, vadd_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
                q1_Q0_s16x4 = vshr_n_s16( q1_Q0_s16x4, 10 );
            }
            {
                const uint16x4_t equal0_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( 0 ) );
                const uint16x4_t equalMinus1_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
                const uint16x4_t lessThanMinus1_u16x4 = vclt_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
                int16x4_t tmp1_s16x4, tmp2_s16x4;

                q1_Q10_s16x4 = vshl_n_s16( q1_Q0_s16x4, 10 );
                tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) );
                q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ) );
                q1_Q10_s16x4 = vbsl_s16( lessThanMinus1_u16x4, q1_Q10_s16x4, tmp1_s16x4 );
                q1_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 ), q1_Q10_s16x4 );
                q1_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ) ), q1_Q10_s16x4 );
                q2_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( 1024 ) );
                q2_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 + 1024 - QUANT_LEVEL_ADJUST_Q10 ), q2_Q10_s16x4 );
                q2_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 ), q2_Q10_s16x4 );
                tmp1_s16x4 = q1_Q10_s16x4;
                tmp2_s16x4 = q2_Q10_s16x4;
                tmp1_s16x4 = vbsl_s16( vorr_u16( equalMinus1_u16x4, lessThanMinus1_u16x4 ), vneg_s16( tmp1_s16x4 ), tmp1_s16x4 );
                tmp2_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vneg_s16( tmp2_s16x4 ), tmp2_s16x4 );
                rd1_Q10_s32x4 = vmull_s16( tmp1_s16x4, vdup_n_s16( Lambda_Q10 ) );
                rd2_Q10_s32x4 = vmull_s16( tmp2_s16x4, vdup_n_s16( Lambda_Q10 ) );
            }

            rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q1_Q10_s16x4 );
            rd1_Q10_s32x4 = vmlal_s16( rd1_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 );
            rd1_Q10_s32x4 = vshrq_n_s32( rd1_Q10_s32x4, 10 );

            rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q2_Q10_s16x4 );
            rd2_Q10_s32x4 = vmlal_s16( rd2_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 );
            rd2_Q10_s32x4 = vshrq_n_s32( rd2_Q10_s32x4, 10 );

            tmp2_s32x4 = vld1q_s32( psDelDec->RD_Q10 );
            tmp1_s32x4 = vaddq_s32( tmp2_s32x4, vminq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) );
            tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vmaxq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) );
            vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 );
            vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 );
            t_u32x4 = vcltq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 );
            tmp1_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q1_Q10_s16x4 ), vmovl_s16( q2_Q10_s16x4 ) );
            tmp2_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q2_Q10_s16x4 ), vmovl_s16( q1_Q10_s16x4 ) );
            vst1q_s32( psSampleState[ 0 ].Q_Q10, tmp1_s32x4 );
            vst1q_s32( psSampleState[ 1 ].Q_Q10, tmp2_s32x4 );
        }

        {
            /* Update states for best quantization */
            int32x4_t exc_Q14_s32x4, LPC_exc_Q14_s32x4, xq_Q14_s32x4, sLF_AR_shp_Q14_s32x4;

            /* Quantized excitation */
            exc_Q14_s32x4 = vshlq_n_s32( tmp1_s32x4, 4 );
            exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 );
            exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 );

            /* Add predictions */
            LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, vdupq_n_s32( LTP_pred_Q14 ) );
            xq_Q14_s32x4 = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 );

            /* Update states */
            tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) );
            vst1q_s32( psSampleState[ 0 ].Diff_Q14, tmp1_s32x4 );
            sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 );
            vst1q_s32( psSampleState[ 0 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) );
            vst1q_s32( psSampleState[ 0 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 );
            vst1q_s32( psSampleState[ 0 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 );
            vst1q_s32( psSampleState[ 0 ].xq_Q14, xq_Q14_s32x4 );

            /* Quantized excitation */
            exc_Q14_s32x4 = vshlq_n_s32( tmp2_s32x4, 4 );
            exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 );
            exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 );

            /* Add predictions */
            LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, vdupq_n_s32( LTP_pred_Q14 ) );
            xq_Q14_s32x4 = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 );

            /* Update states */
            tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) );
            vst1q_s32( psSampleState[ 1 ].Diff_Q14, tmp1_s32x4 );
            sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 );
            vst1q_s32( psSampleState[ 1 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) );
            vst1q_s32( psSampleState[ 1 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 );
            vst1q_s32( psSampleState[ 1 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 );
            vst1q_s32( psSampleState[ 1 ].xq_Q14, xq_Q14_s32x4 );
        }

        *smpl_buf_idx = *smpl_buf_idx ? ( *smpl_buf_idx - 1 ) : ( DECISION_DELAY - 1 );
        last_smple_idx = *smpl_buf_idx + decisionDelay + DECISION_DELAY;
        if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
        if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;

        /* Find winner */
        RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ];
        Winner_ind = 0;
        for( k = 1; k < nStatesDelayedDecision; k++ ) {
            if( psSampleState[ 0 ].RD_Q10[ k ] < RDmin_Q10 ) {
                RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ k ];
                Winner_ind = k;
            }
        }

        /* Increase RD values of expired states */
        {
            uint32x4_t t_u32x4;
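            /* vceqq marks lanes whose RandState matches the winner's with all ones; vmvnq inverts the */
            /* mask, and the unsigned shift by 5 turns every non-matching lane into the penalty        */
            /* constant 0x07FFFFFF, which is added to both candidate RD vectors.                       */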
            Winner_rand_state = psDelDec->RandState[ last_smple_idx ][ Winner_ind ];
            t_u32x4 = vceqq_s32( vld1q_s32( psDelDec->RandState[ last_smple_idx ] ), vdupq_n_s32( Winner_rand_state ) );
            t_u32x4 = vmvnq_u32( t_u32x4 );
            t_u32x4 = vshrq_n_u32( t_u32x4, 5 );
            tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].RD_Q10 );
            tmp2_s32x4 = vld1q_s32( psSampleState[ 1 ].RD_Q10 );
            tmp1_s32x4 = vaddq_s32( tmp1_s32x4, vreinterpretq_s32_u32( t_u32x4 ) );
            tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vreinterpretq_s32_u32( t_u32x4 ) );
            vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 );
            vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 );

            /* Find worst in first set and best in second set */
            RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ];
            RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ 0 ];
            RDmax_ind = 0;
            RDmin_ind = 0;
            for( k = 1; k < nStatesDelayedDecision; k++ ) {
                /* find worst in first set */
                if( psSampleState[ 0 ].RD_Q10[ k ] > RDmax_Q10 ) {
                    RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ k ];
                    RDmax_ind = k;
                }
                /* find best in second set */
                if( psSampleState[ 1 ].RD_Q10[ k ] < RDmin_Q10 ) {
                    RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ k ];
                    RDmin_ind = k;
                }
            }
        }

        /* Replace a state if best from second set outperforms worst in first set */
        if( RDmin_Q10 < RDmax_Q10 ) {
            opus_int32 (*ptr)[NEON_MAX_DEL_DEC_STATES] = psDelDec->RandState;
            const int numOthers = (int)( ( sizeof( NSQ_del_decs_struct ) - sizeof( ( (NSQ_del_decs_struct *)0 )->sLPC_Q14 ) )
                / ( NEON_MAX_DEL_DEC_STATES * sizeof( opus_int32 ) ) );
            /* Only ( predictLPCOrder - 1 ) entries of the sLPC_Q14 buffer would need updating; the first several */
            /* (unused) sLPC_Q14[] entries may differ from the C version when predictLPCOrder < NSQ_LPC_BUF_LENGTH. */
            /* For simplicity, a constant ( NSQ_LPC_BUF_LENGTH - 1 ) entries are updated here.                      */
            for( j = i + 1; j < i + NSQ_LPC_BUF_LENGTH; j++ ) {
                psDelDec->sLPC_Q14[ j ][ RDmax_ind ] = psDelDec->sLPC_Q14[ j ][ RDmin_ind ];
            }
            for( j = 0; j < numOthers; j++ ) {
                ptr[ j ][ RDmax_ind ] = ptr[ j ][ RDmin_ind ];
            }

            psSampleState[ 0 ].Q_Q10[ RDmax_ind ] = psSampleState[ 1 ].Q_Q10[ RDmin_ind ];
            psSampleState[ 0 ].RD_Q10[ RDmax_ind ] = psSampleState[ 1 ].RD_Q10[ RDmin_ind ];
            psSampleState[ 0 ].xq_Q14[ RDmax_ind ] = psSampleState[ 1 ].xq_Q14[ RDmin_ind ];
            psSampleState[ 0 ].LF_AR_Q14[ RDmax_ind ] = psSampleState[ 1 ].LF_AR_Q14[ RDmin_ind ];
            psSampleState[ 0 ].Diff_Q14[ RDmax_ind ] = psSampleState[ 1 ].Diff_Q14[ RDmin_ind ];
            psSampleState[ 0 ].sLTP_shp_Q14[ RDmax_ind ] = psSampleState[ 1 ].sLTP_shp_Q14[ RDmin_ind ];
            psSampleState[ 0 ].LPC_exc_Q14[ RDmax_ind ] = psSampleState[ 1 ].LPC_exc_Q14[ RDmin_ind ];
        }

        /* Write samples from winner to output and long-term filter states */
        if( subfr > 0 || i >= decisionDelay ) {
            pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
            xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
                silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], delayedGain_Q10[ last_smple_idx ] ), 8 ) );
            NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
            sLTP_Q15[ NSQ->sLTP_buf_idx - decisionDelay ] = psDelDec->Pred_Q15[ last_smple_idx ][ Winner_ind ];
        }
        NSQ->sLTP_shp_buf_idx++;
        NSQ->sLTP_buf_idx++;

        /* Update states */
        vst1q_s32( psDelDec->LF_AR_Q14, vld1q_s32( psSampleState[ 0 ].LF_AR_Q14 ) );
        vst1q_s32( psDelDec->Diff_Q14, vld1q_s32( psSampleState[ 0 ].Diff_Q14 ) );
        vst1q_s32( psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) );
        vst1q_s32( psDelDec->Xq_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) );
        tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].Q_Q10 );
        vst1q_s32( psDelDec->Q_Q10[ *smpl_buf_idx ], tmp1_s32x4 );
        vst1q_s32( psDelDec->Pred_Q15[ *smpl_buf_idx ], vshlq_n_s32( vld1q_s32( psSampleState[ 0 ].LPC_exc_Q14 ), 1 ) );
        vst1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].sLTP_shp_Q14 ) );
        tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 10 );
        tmp1_s32x4 = vaddq_s32( vld1q_s32( psDelDec->Seed ), tmp1_s32x4 );
        vst1q_s32( psDelDec->Seed, tmp1_s32x4 );
        vst1q_s32( psDelDec->RandState[ *smpl_buf_idx ], tmp1_s32x4 );
        vst1q_s32( psDelDec->RD_Q10, vld1q_s32( psSampleState[ 0 ].RD_Q10 ) );
        delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10;
    }
    /* Update LPC states */
    silk_memcpy( psDelDec->sLPC_Q14[ 0 ], psDelDec->sLPC_Q14[ length ], NEON_MAX_DEL_DEC_STATES * NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );

    RESTORE_STACK;
}

static OPUS_INLINE void silk_SMULWB_8_neon(
    const opus_int16 *a,
    const int32x2_t b,
    opus_int32 *o
)
{
    const int16x8_t a_s16x8 = vld1q_s16( a );
    int32x4_t o0_s32x4, o1_s32x4;

    o0_s32x4 = vshll_n_s16( vget_low_s16( a_s16x8 ), 15 );
    o1_s32x4 = vshll_n_s16( vget_high_s16( a_s16x8 ), 15 );
    o0_s32x4 = vqdmulhq_lane_s32( o0_s32x4, b, 0 );
    o1_s32x4 = vqdmulhq_lane_s32( o1_s32x4, b, 0 );
    vst1q_s32( o, o0_s32x4 );
    vst1q_s32( o + 4, o1_s32x4 );
}

/* Only works when ( b >= -65536 ) && ( b < 65536 ). */
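/* Under that restriction ( b << 15 ) fits in 32 bits, and vqdmulhq_lane_s32( a, b << 15, 0 ) */
/* = ( 2 * a * ( b << 15 ) ) >> 32 = ( a * b ) >> 16 = silk_SMULWW( a, b ).                   */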
static OPUS_INLINE void silk_SMULWW_small_b_4_neon(
    opus_int32 *a,
    const int32x2_t b_s32x2
)
{
    int32x4_t o_s32x4;

    o_s32x4 = vld1q_s32( a );
    o_s32x4 = vqdmulhq_lane_s32( o_s32x4, b_s32x2, 0 );
    vst1q_s32( a, o_s32x4 );
}

/* Only works when ( b >= -65536 ) && ( b < 65536 ). */
static OPUS_INLINE void silk_SMULWW_small_b_8_neon(
    opus_int32 *a,
    const int32x2_t b_s32x2
)
{
    int32x4_t o0_s32x4, o1_s32x4;

    o0_s32x4 = vld1q_s32( a );
    o1_s32x4 = vld1q_s32( a + 4 );
    o0_s32x4 = vqdmulhq_lane_s32( o0_s32x4, b_s32x2, 0 );
    o1_s32x4 = vqdmulhq_lane_s32( o1_s32x4, b_s32x2, 0 );
    vst1q_s32( a, o0_s32x4 );
    vst1q_s32( a + 4, o1_s32x4 );
}

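/* Full-range silk_SMULWW: lane 0 of b_s32x2 holds ( b & 0xFFFF ) << 15 and lane 1 holds b >> 16, so */
/* vqdmulhq contributes ( a * ( b & 0xFFFF ) ) >> 16 and vmlaq adds a * ( b >> 16 ).                 */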
static OPUS_INLINE void silk_SMULWW_4_neon(
    opus_int32 *a,
    const int32x2_t b_s32x2
)
{
    int32x4_t a_s32x4, o_s32x4;

    a_s32x4 = vld1q_s32( a );
    o_s32x4 = vqdmulhq_lane_s32( a_s32x4, b_s32x2, 0 );
    o_s32x4 = vmlaq_lane_s32( o_s32x4, a_s32x4, b_s32x2, 1 );
    vst1q_s32( a, o_s32x4 );
}

static OPUS_INLINE void silk_SMULWW_8_neon(
    opus_int32 *a,
    const int32x2_t b_s32x2
)
{
    int32x4_t a0_s32x4, a1_s32x4, o0_s32x4, o1_s32x4;

    a0_s32x4 = vld1q_s32( a );
    a1_s32x4 = vld1q_s32( a + 4 );
    o0_s32x4 = vqdmulhq_lane_s32( a0_s32x4, b_s32x2, 0 );
    o1_s32x4 = vqdmulhq_lane_s32( a1_s32x4, b_s32x2, 0 );
    o0_s32x4 = vmlaq_lane_s32( o0_s32x4, a0_s32x4, b_s32x2, 1 );
    o1_s32x4 = vmlaq_lane_s32( o1_s32x4, a1_s32x4, b_s32x2, 1 );
    vst1q_s32( a, o0_s32x4 );
    vst1q_s32( a + 4, o1_s32x4 );
}

static OPUS_INLINE void silk_SMULWW_loop_neon(
    const opus_int16 *a,
    const opus_int32 b,
    opus_int32 *o,
    const opus_int loop_num
)
{
    opus_int i;
    int32x2_t b_s32x2;

    b_s32x2 = vdup_n_s32( b );
    for( i = 0; i < loop_num - 7; i += 8 ) {
        silk_SMULWB_8_neon( a + i, b_s32x2, o + i );
    }
    for( ; i < loop_num; i++ ) {
        o[ i ] = silk_SMULWW( a[ i ], b );
    }
}

static OPUS_INLINE void silk_nsq_del_dec_scale_states_neon(
    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
    silk_nsq_state *NSQ,                            /* I/O  NSQ state                           */
    NSQ_del_decs_struct psDelDec[],                 /* I/O  Delayed decision states             */
    const opus_int16 x16[],                         /* I    Input                               */
    opus_int32 x_sc_Q10[],                          /* O    Input scaled with 1/Gain in Q10     */
    const opus_int16 sLTP[],                        /* I    Re-whitened LTP state in Q0         */
    opus_int32 sLTP_Q15[],                          /* O    LTP state matching scaled input     */
    opus_int subfr,                                 /* I    Subframe number                     */
    const opus_int LTP_scale_Q14,                   /* I    LTP state scaling                   */
    const opus_int32 Gains_Q16[ MAX_NB_SUBFR ],     /* I                                        */
    const opus_int pitchL[ MAX_NB_SUBFR ],          /* I    Pitch lag                           */
    const opus_int signal_type,                     /* I    Signal type                         */
    const opus_int decisionDelay                    /* I    Decision delay                      */
)
{
    opus_int i, lag;
    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;

    lag = pitchL[ subfr ];
    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
    silk_assert( inv_gain_Q31 != 0 );

    /* Scale input */
    inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
    silk_SMULWW_loop_neon( x16, inv_gain_Q26, x_sc_Q10, psEncC->subfr_length );

    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
    if( NSQ->rewhite_flag ) {
        if( subfr == 0 ) {
            /* Do LTP downscaling */
            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
        }
        silk_SMULWW_loop_neon( sLTP + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, inv_gain_Q31, sLTP_Q15 + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, lag + LTP_ORDER / 2 );
    }

    /* Adjust for changing gain */
    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
        int32x2_t gain_adj_Q16_s32x2;
        gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );

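        /* When gain_adj_Q16 fits in 17 bits the cheaper single-lane _small_b_ helpers suffice; */
        /* otherwise the gain is split into low/high halves across lanes 0 and 1.               */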
        /* Scale long-term shaping state */
        if( ( gain_adj_Q16 >= -65536 ) && ( gain_adj_Q16 < 65536 ) ) {
            gain_adj_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( gain_adj_Q16, 15 ) );
            for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) {
                silk_SMULWW_small_b_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2 );
            }
            for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
                NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
            }

            /* Scale long-term prediction state */
            if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
                for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) {
                    silk_SMULWW_small_b_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2 );
                }
                for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
                    sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
                }
            }

            /* Scale scalar states */
            silk_SMULWW_small_b_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2 );
            silk_SMULWW_small_b_4_neon( psDelDec->Diff_Q14, gain_adj_Q16_s32x2 );

            /* Scale short-term prediction and shaping states */
            for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
                silk_SMULWW_small_b_4_neon( psDelDec->sLPC_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
                silk_SMULWW_small_b_4_neon( psDelDec->sAR2_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < DECISION_DELAY; i++ ) {
                silk_SMULWW_small_b_4_neon( psDelDec->Pred_Q15[ i ], gain_adj_Q16_s32x2 );
                silk_SMULWW_small_b_4_neon( psDelDec->Shape_Q14[ i ], gain_adj_Q16_s32x2 );
            }
        } else {
            gain_adj_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( gain_adj_Q16 & 0x0000FFFF, 15 ) );
            gain_adj_Q16_s32x2 = vset_lane_s32( gain_adj_Q16 >> 16, gain_adj_Q16_s32x2, 1 );
            for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) {
                silk_SMULWW_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2 );
            }
            for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
                NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
            }

            /* Scale long-term prediction state */
            if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
                for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) {
                    silk_SMULWW_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2 );
                }
                for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
                    sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
                }
            }

            /* Scale scalar states */
            silk_SMULWW_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2 );
            silk_SMULWW_4_neon( psDelDec->Diff_Q14, gain_adj_Q16_s32x2 );

            /* Scale short-term prediction and shaping states */
            for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
                silk_SMULWW_4_neon( psDelDec->sLPC_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
                silk_SMULWW_4_neon( psDelDec->sAR2_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < DECISION_DELAY; i++ ) {
                silk_SMULWW_4_neon( psDelDec->Pred_Q15[ i ], gain_adj_Q16_s32x2 );
                silk_SMULWW_4_neon( psDelDec->Shape_Q14[ i ], gain_adj_Q16_s32x2 );
            }
        }

        /* Save inverse gain */
        NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
    }
}