1 /*
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
// Contains the function WebRtcIsacfix_AllpassFilter2FixDec16Neon() of the
// iSAC codec, optimized for the ARM NEON platform. It is bit exact with the
// function WebRtcIsacfix_AllpassFilter2FixDec16C() in filterbanks.c. The
// prototype C code is at the end of this file.
15
16 #include <arm_neon.h>
17 #include <assert.h>
18
// Filters two channels in place through a cascade of two filter sections
// each, using one 4-lane NEON vector for all four (channel, section) pairs.
//
// Per section and per sample the code computes, with saturating
// doubling multiply-accumulate / multiply-subtract:
//   out   = (state + 2 * factor * in) >> 16
//   state = (in << 16) - 2 * factor * out
// The output of section 0 is the input of section 1.
//
// Lane layout of factorv / statev / datav:
//   lane 0: channel 1, section 0      lane 1: channel 1, section 1
//   lane 2: channel 2, section 0      lane 3: channel 2, section 1
// Section 1 runs one sample behind section 0, so a single vector operation
// advances section 0 on sample k + 1 and section 1 on sample k. The
// "preprocessing" step primes section 0 with sample 0, and the
// "post-processing" step drains section 1 for the last sample.
//
// Preconditions: |length| is even and at least 2 (samples [0] and [1] of both
// channels are always read and written); each factor and filter state array
// holds two elements. On return the data buffers hold the filtered samples
// and the filter state arrays hold the updated states.
void WebRtcIsacfix_AllpassFilter2FixDec16Neon(
    int16_t* data_ch1,  // Input and output in channel 1, in Q0
    int16_t* data_ch2,  // Input and output in channel 2, in Q0
    const int16_t* factor_ch1,  // Scaling factor for channel 1, in Q15
    const int16_t* factor_ch2,  // Scaling factor for channel 2, in Q15
    const int length,  // Length of the data buffers
    int32_t* filter_state_ch1,  // Filter state for channel 1, in Q16
    int32_t* filter_state_ch2) {  // Filter state for channel 2, in Q16
  // Two samples per channel are processed unconditionally outside the loop,
  // so anything shorter than 2 would access the buffers out of bounds.
  assert(length >= 2);
  assert(length % 2 == 0);
  int n = 0;
  int16x4_t factorv;
  int16x4_t datav;
  int32x4_t statev;

  // Load factor_ch1 and factor_ch2.
  factorv = vld1_dup_s16(factor_ch1);
  factorv = vld1_lane_s16(factor_ch1 + 1, factorv, 1);
  factorv = vld1_lane_s16(factor_ch2, factorv, 2);
  factorv = vld1_lane_s16(factor_ch2 + 1, factorv, 3);

  // Load filter_state_ch1[0] and filter_state_ch2[0].
  // Lanes 1 and 3 only hold duplicated filler here; they are replaced with
  // the real section-1 states before their results are used.
  statev = vld1q_dup_s32(filter_state_ch1);
  statev = vld1q_lane_s32(filter_state_ch2, statev, 2);

  // Loop unrolling preprocessing.
  int32x4_t a;
  int16x4_t tmp1, tmp2;

  // Load data_ch1[0] and data_ch2[0] (lanes 1 and 3 are filler, see above).
  datav = vld1_dup_s16(data_ch1);
  datav = vld1_lane_s16(data_ch2, datav, 2);

  // Section 0 on sample 0: lanes 0 and 2 of tmp1 are its outputs.
  a = vqdmlal_s16(statev, datav, factorv);
  tmp1 = vshrn_n_s32(a, 16);

  // Update filter_state_ch1[0] and filter_state_ch2[0].
  statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp1, factorv);

  // Load filter_state_ch1[1] and filter_state_ch2[1].
  statev = vld1q_lane_s32(filter_state_ch1 + 1, statev, 1);
  statev = vld1q_lane_s32(filter_state_ch2 + 1, statev, 3);

  // Load data_ch1[1] and data_ch2[1] next to the section-0 outputs, then swap
  // each 16-bit pair so that lanes 0/2 carry the next input sample and lanes
  // 1/3 carry the section-0 output that feeds section 1.
  tmp1 = vld1_lane_s16(data_ch1 + 1, tmp1, 1);
  tmp1 = vld1_lane_s16(data_ch2 + 1, tmp1, 3);
  datav = vrev32_s16(tmp1);

  // Loop unrolling processing.
  for (n = 0; n < length - 2; n += 2) {
    a = vqdmlal_s16(statev, datav, factorv);
    tmp1 = vshrn_n_s32(a, 16);
    // Store data_ch1[n] and data_ch2[n] (section-1 outputs in lanes 1 and 3).
    vst1_lane_s16(data_ch1 + n, tmp1, 1);
    vst1_lane_s16(data_ch2 + n, tmp1, 3);

    // Update filter_state_ch1[0], filter_state_ch1[1]
    // and filter_state_ch2[0], filter_state_ch2[1].
    statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp1, factorv);

    // Load data_ch1[n + 2] and data_ch2[n + 2].
    tmp1 = vld1_lane_s16(data_ch1 + n + 2, tmp1, 1);
    tmp1 = vld1_lane_s16(data_ch2 + n + 2, tmp1, 3);
    datav = vrev32_s16(tmp1);

    a = vqdmlal_s16(statev, datav, factorv);
    tmp2 = vshrn_n_s32(a, 16);
    // Store data_ch1[n + 1] and data_ch2[n + 1].
    vst1_lane_s16(data_ch1 + n + 1, tmp2, 1);
    vst1_lane_s16(data_ch2 + n + 1, tmp2, 3);

    // Update filter_state_ch1[0], filter_state_ch1[1]
    // and filter_state_ch2[0], filter_state_ch2[1].
    statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp2, factorv);

    // Load data_ch1[n + 3] and data_ch2[n + 3].
    tmp2 = vld1_lane_s16(data_ch1 + n + 3, tmp2, 1);
    tmp2 = vld1_lane_s16(data_ch2 + n + 3, tmp2, 3);
    datav = vrev32_s16(tmp2);
  }

  // Loop unrolling post-processing. Here n == length - 2.
  a = vqdmlal_s16(statev, datav, factorv);
  tmp1 = vshrn_n_s32(a, 16);
  // Store data_ch1[n] and data_ch2[n].
  vst1_lane_s16(data_ch1 + n, tmp1, 1);
  vst1_lane_s16(data_ch2 + n, tmp1, 3);

  // Update filter_state_ch1[0], filter_state_ch1[1]
  // and filter_state_ch2[0], filter_state_ch2[1].
  statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp1, factorv);
  // Store filter_state_ch1[0] and filter_state_ch2[0]. Section 0 has now seen
  // every input sample, so its state is final.
  vst1q_lane_s32(filter_state_ch1, statev, 0);
  vst1q_lane_s32(filter_state_ch2, statev, 2);

  // Feed the last section-0 outputs (lanes 0/2 of tmp1) into section 1
  // (lanes 1/3). Only lanes 1 and 3 are used from here on.
  datav = vrev32_s16(tmp1);
  a = vqdmlal_s16(statev, datav, factorv);
  tmp2 = vshrn_n_s32(a, 16);
  // Store data_ch1[n + 1] and data_ch2[n + 1].
  vst1_lane_s16(data_ch1 + n + 1, tmp2, 1);
  vst1_lane_s16(data_ch2 + n + 1, tmp2, 3);

  // Update filter_state_ch1[1] and filter_state_ch2[1].
  statev = vqdmlsl_s16(vshll_n_s16(datav, 16), tmp2, factorv);
  // Store filter_state_ch1[1] and filter_state_ch2[1].
  vst1q_lane_s32(filter_state_ch1 + 1, statev, 1);
  vst1q_lane_s32(filter_state_ch2 + 1, statev, 3);
}
126
// This function is the C prototype of the NEON-optimized function above.
128 //void AllpassFilter2FixDec16BothChannels(
129 // int16_t *data_ch1, // Input and output in channel 1, in Q0
130 // int16_t *data_ch2, // Input and output in channel 2, in Q0
131 // const int16_t *factor_ch1, // Scaling factor for channel 1, in Q15
132 // const int16_t *factor_ch2, // Scaling factor for channel 2, in Q15
133 // const int length, // Length of the data buffers
134 // int32_t *filter_state_ch1, // Filter state for channel 1, in Q16
135 // int32_t *filter_state_ch2) { // Filter state for channel 2, in Q16
136 // int n = 0;
137 // int32_t state0_ch1 = filter_state_ch1[0], state1_ch1 = filter_state_ch1[1];
138 // int32_t state0_ch2 = filter_state_ch2[0], state1_ch2 = filter_state_ch2[1];
139 // int16_t sample0_ch1 = 0, sample0_ch2 = 0;
140 // int16_t sample1_ch1 = 0, sample1_ch2 = 0;
141 // int32_t a0_ch1 = 0, a0_ch2 = 0;
142 // int32_t b0_ch1 = 0, b0_ch2 = 0;
143 //
144 // int32_t a1_ch1 = 0, a1_ch2 = 0;
145 // int32_t b1_ch1 = 0, b1_ch2 = 0;
146 // int32_t b2_ch1 = 0, b2_ch2 = 0;
147 //
148 // // Loop unrolling preprocessing.
149 //
150 // sample0_ch1 = data_ch1[n];
151 // sample0_ch2 = data_ch2[n];
152 //
153 // a0_ch1 = (factor_ch1[0] * sample0_ch1) << 1;
154 // a0_ch2 = (factor_ch2[0] * sample0_ch2) << 1;
155 //
156 // b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state0_ch1);
157 // b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state0_ch2); //Q16+Q16=Q16
158 //
159 // a0_ch1 = -factor_ch1[0] * (int16_t)(b0_ch1 >> 16);
160 // a0_ch2 = -factor_ch2[0] * (int16_t)(b0_ch2 >> 16);
161 //
162 // state0_ch1 = WebRtcSpl_AddSatW32(a0_ch1 <<1, (uint32_t)sample0_ch1 << 16);
163 // state0_ch2 = WebRtcSpl_AddSatW32(a0_ch2 <<1, (uint32_t)sample0_ch2 << 16);
164 //
165 // sample1_ch1 = data_ch1[n + 1];
166 // sample0_ch1 = (int16_t) (b0_ch1 >> 16); //Save as Q0
167 // sample1_ch2 = data_ch2[n + 1];
168 // sample0_ch2 = (int16_t) (b0_ch2 >> 16); //Save as Q0
169 //
170 //
171 // for (n = 0; n < length - 2; n += 2) {
172 // a1_ch1 = (factor_ch1[0] * sample1_ch1) << 1;
173 // a0_ch1 = (factor_ch1[1] * sample0_ch1) << 1;
174 // a1_ch2 = (factor_ch2[0] * sample1_ch2) << 1;
175 // a0_ch2 = (factor_ch2[1] * sample0_ch2) << 1;
176 //
177 // b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state0_ch1);
178 // b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state1_ch1); //Q16+Q16=Q16
179 // b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state0_ch2); //Q16+Q16=Q16
180 // b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state1_ch2); //Q16+Q16=Q16
181 //
182 // a1_ch1 = -factor_ch1[0] * (int16_t)(b1_ch1 >> 16);
183 // a0_ch1 = -factor_ch1[1] * (int16_t)(b0_ch1 >> 16);
184 // a1_ch2 = -factor_ch2[0] * (int16_t)(b1_ch2 >> 16);
185 // a0_ch2 = -factor_ch2[1] * (int16_t)(b0_ch2 >> 16);
186 //
187 // state0_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1 <<16);
188 // state1_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1 <<16);
189 // state0_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2 <<16);
190 // state1_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2 <<16);
191 //
192 // sample0_ch1 = data_ch1[n + 2];
193 // sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
194 // sample0_ch2 = data_ch2[n + 2];
195 // sample1_ch2 = (int16_t) (b1_ch2 >> 16); //Save as Q0
196 //
197 // a0_ch1 = (factor_ch1[0] * sample0_ch1) << 1;
198 // a1_ch1 = (factor_ch1[1] * sample1_ch1) << 1;
199 // a0_ch2 = (factor_ch2[0] * sample0_ch2) << 1;
200 // a1_ch2 = (factor_ch2[1] * sample1_ch2) << 1;
201 //
202 // b2_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state0_ch1);
203 // b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state1_ch1); //Q16+Q16=Q16
204 // b2_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state0_ch2); //Q16+Q16=Q16
205 // b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state1_ch2); //Q16+Q16=Q16
206 //
207 // a0_ch1 = -factor_ch1[0] * (int16_t)(b2_ch1 >> 16);
208 // a1_ch1 = -factor_ch1[1] * (int16_t)(b1_ch1 >> 16);
209 // a0_ch2 = -factor_ch2[0] * (int16_t)(b2_ch2 >> 16);
210 // a1_ch2 = -factor_ch2[1] * (int16_t)(b1_ch2 >> 16);
211 //
212 // state0_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1<<16);
213 // state1_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
214 // state0_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2<<16);
215 // state1_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
216 //
217 //
218 // sample1_ch1 = data_ch1[n + 3];
219 // sample0_ch1 = (int16_t) (b2_ch1 >> 16); //Save as Q0
220 // sample1_ch2 = data_ch2[n + 3];
221 // sample0_ch2 = (int16_t) (b2_ch2 >> 16); //Save as Q0
222 //
223 // data_ch1[n] = (int16_t) (b0_ch1 >> 16); //Save as Q0
224 // data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
225 // data_ch2[n] = (int16_t) (b0_ch2 >> 16);
226 // data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
227 // }
228 //
229 // // Loop unrolling post-processing.
230 //
231 // a1_ch1 = (factor_ch1[0] * sample1_ch1) << 1;
232 // a0_ch1 = (factor_ch1[1] * sample0_ch1) << 1;
233 // a1_ch2 = (factor_ch2[0] * sample1_ch2) << 1;
234 // a0_ch2 = (factor_ch2[1] * sample0_ch2) << 1;
235 //
236 // b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state0_ch1);
237 // b0_ch1 = WebRtcSpl_AddSatW32(a0_ch1, state1_ch1);
238 // b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state0_ch2);
239 // b0_ch2 = WebRtcSpl_AddSatW32(a0_ch2, state1_ch2);
240 //
241 // a1_ch1 = -factor_ch1[0] * (int16_t)(b1_ch1 >> 16);
242 // a0_ch1 = -factor_ch1[1] * (int16_t)(b0_ch1 >> 16);
243 // a1_ch2 = -factor_ch2[0] * (int16_t)(b1_ch2 >> 16);
244 // a0_ch2 = -factor_ch2[1] * (int16_t)(b0_ch2 >> 16);
245 //
246 // state0_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1 << 16);
247 // state1_ch1 = WebRtcSpl_AddSatW32(a0_ch1<<1, (uint32_t)sample0_ch1 << 16);
248 // state0_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2 << 16);
249 // state1_ch2 = WebRtcSpl_AddSatW32(a0_ch2<<1, (uint32_t)sample0_ch2 << 16);
250 //
251 // data_ch1[n] = (int16_t) (b0_ch1 >> 16); //Save as Q0
252 // data_ch2[n] = (int16_t) (b0_ch2 >> 16);
253 //
254 // sample1_ch1 = (int16_t) (b1_ch1 >> 16); //Save as Q0
255 // sample1_ch2 = (int16_t) (b1_ch2 >> 16); //Save as Q0
256 //
257 // a1_ch1 = (factor_ch1[1] * sample1_ch1) << 1;
258 // a1_ch2 = (factor_ch2[1] * sample1_ch2) << 1;
259 //
260 // b1_ch1 = WebRtcSpl_AddSatW32(a1_ch1, state1_ch1); //Q16+Q16=Q16
261 // b1_ch2 = WebRtcSpl_AddSatW32(a1_ch2, state1_ch2); //Q16+Q16=Q16
262 //
263 // a1_ch1 = -factor_ch1[1] * (int16_t)(b1_ch1 >> 16);
264 // a1_ch2 = -factor_ch2[1] * (int16_t)(b1_ch2 >> 16);
265 //
266 // state1_ch1 = WebRtcSpl_AddSatW32(a1_ch1<<1, (uint32_t)sample1_ch1<<16);
267 // state1_ch2 = WebRtcSpl_AddSatW32(a1_ch2<<1, (uint32_t)sample1_ch2<<16);
268 //
269 // data_ch1[n + 1] = (int16_t) (b1_ch1 >> 16); //Save as Q0
270 // data_ch2[n + 1] = (int16_t) (b1_ch2 >> 16);
271 //
272 // filter_state_ch1[0] = state0_ch1;
273 // filter_state_ch1[1] = state1_ch1;
274 // filter_state_ch2[0] = state0_ch2;
275 // filter_state_ch2[1] = state1_ch2;
276 //}
277