1 /*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
18 #define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
19
20 namespace android {
21
22 // depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
23
24 #if USE_NEON
25
26 // use intrinsics if inline arm32 assembly is not possible
27 #if !USE_INLINE_ASSEMBLY
28 #define USE_INTRINSIC
29 #endif
30
31 // following intrinsics available only on ARM 64 bit ACLE
32 #ifndef __aarch64__
33 #undef vld1q_f32_x2
34 #undef vld1q_s32_x2
35 #endif
36
37 #define TO_STRING2(x) #x
38 #define TO_STRING(x) TO_STRING2(x)
39 // uncomment to print GCC version, may be relevant for intrinsic optimizations
40 /* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
41 "." TO_STRING(__GNUC_MINOR__) \
42 "." TO_STRING(__GNUC_PATCHLEVEL__)) */
43
44 //
45 // NEON specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
46 //
47 // Two variants are presented here:
48 // ARM NEON inline assembly which appears up to 10-15% faster than intrinsics (gcc 4.9) for arm32.
49 // ARM NEON intrinsics which can also be used by arm64 and x86/64 with NEON header.
50 //
51
// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
// These are only used for inline assembly.
//
// Register contract (established by the asm bodies below):
//   q0 (d0/d1)  - four partial sums for mono (or left channel).
//   q4 (d8/d9)  - four partial sums for the right channel (stereo only).
//   %[vLR]      - pointer to the L/R volume pair, 64-bit aligned.
//   %[out]      - memory operand for the stereo int32 output frame; the result
//                 is saturating-accumulated (vqadd) into its current contents.
// Clobbers d2 (volumes) and d3 (output sample) - both inside q1, which the
// callers list in their clobber sets.
#define ASSEMBLY_ACCUMULATE_MONO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes */\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output */\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums */\
        "vpadd.s32      d0, d0, d0               \n"/* (1+4d) and replicate L/R */\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume */\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d) store result */

#define ASSEMBLY_ACCUMULATE_STEREO \
        "vld1.s32       {d2}, [%[vLR]:64]        \n"/* (1) load volumes*/\
        "vld1.s32       {d3}, %[out]             \n"/* (2) unaligned load the output*/\
        "vpadd.s32      d0, d0, d1               \n"/* (1) add all 4 partial sums from q0*/\
        "vpadd.s32      d8, d8, d9               \n"/* (1) add all 4 partial sums from q4*/\
        "vpadd.s32      d0, d0, d8               \n"/* (1+4d) combine into L/R*/\
        "vqrdmulh.s32   d0, d0, d2               \n"/* (2+3d) apply volume*/\
        "vqadd.s32      d3, d3, d0               \n"/* (1+4d) accumulate result (saturating)*/\
        "vst1.s32       {d3}, %[out]             \n"/* (2+2d)store result*/
72
73 template <int CHANNELS, int STRIDE, bool FIXED>
ProcessNeonIntrinsic(int32_t * out,int count,const int16_t * coefsP,const int16_t * coefsN,const int16_t * sP,const int16_t * sN,const int32_t * volumeLR,uint32_t lerpP,const int16_t * coefsP1,const int16_t * coefsN1)74 static inline void ProcessNeonIntrinsic(int32_t* out,
75 int count,
76 const int16_t* coefsP,
77 const int16_t* coefsN,
78 const int16_t* sP,
79 const int16_t* sN,
80 const int32_t* volumeLR,
81 uint32_t lerpP,
82 const int16_t* coefsP1,
83 const int16_t* coefsN1)
84 {
85 ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
86 static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");
87
88 sP -= CHANNELS*((STRIDE>>1)-1);
89 coefsP = (const int16_t*)__builtin_assume_aligned(coefsP, 16);
90 coefsN = (const int16_t*)__builtin_assume_aligned(coefsN, 16);
91
92 int16x4_t interp;
93 if (!FIXED) {
94 interp = vdup_n_s16(lerpP);
95 //interp = (int16x4_t)vset_lane_s32 ((int32x2_t)lerpP, interp, 0);
96 coefsP1 = (const int16_t*)__builtin_assume_aligned(coefsP1, 16);
97 coefsN1 = (const int16_t*)__builtin_assume_aligned(coefsN1, 16);
98 }
99 int32x4_t accum, accum2;
100 // warning uninitialized if we use veorq_s32
101 // (alternative to below) accum = veorq_s32(accum, accum);
102 accum = vdupq_n_s32(0);
103 if (CHANNELS == 2) {
104 // (alternative to below) accum2 = veorq_s32(accum2, accum2);
105 accum2 = vdupq_n_s32(0);
106 }
107 do {
108 int16x8_t posCoef = vld1q_s16(coefsP);
109 coefsP += 8;
110 int16x8_t negCoef = vld1q_s16(coefsN);
111 coefsN += 8;
112 if (!FIXED) { // interpolate
113 int16x8_t posCoef1 = vld1q_s16(coefsP1);
114 coefsP1 += 8;
115 int16x8_t negCoef1 = vld1q_s16(coefsN1);
116 coefsN1 += 8;
117
118 posCoef1 = vsubq_s16(posCoef1, posCoef);
119 negCoef = vsubq_s16(negCoef, negCoef1);
120
121 posCoef1 = vqrdmulhq_lane_s16(posCoef1, interp, 0);
122 negCoef = vqrdmulhq_lane_s16(negCoef, interp, 0);
123
124 posCoef = vaddq_s16(posCoef, posCoef1);
125 negCoef = vaddq_s16(negCoef, negCoef1);
126 }
127 switch (CHANNELS) {
128 case 1: {
129 int16x8_t posSamp = vld1q_s16(sP);
130 int16x8_t negSamp = vld1q_s16(sN);
131 sN += 8;
132 posSamp = vrev64q_s16(posSamp);
133
134 // dot product
135 accum = vmlal_s16(accum, vget_low_s16(posSamp), vget_high_s16(posCoef)); // reversed
136 accum = vmlal_s16(accum, vget_high_s16(posSamp), vget_low_s16(posCoef)); // reversed
137 accum = vmlal_s16(accum, vget_low_s16(negSamp), vget_low_s16(negCoef));
138 accum = vmlal_s16(accum, vget_high_s16(negSamp), vget_high_s16(negCoef));
139 sP -= 8;
140 } break;
141 case 2: {
142 int16x8x2_t posSamp = vld2q_s16(sP);
143 int16x8x2_t negSamp = vld2q_s16(sN);
144 sN += 16;
145 posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
146 posSamp.val[1] = vrev64q_s16(posSamp.val[1]);
147
148 // dot product
149 accum = vmlal_s16(accum, vget_low_s16(posSamp.val[0]), vget_high_s16(posCoef)); // r
150 accum = vmlal_s16(accum, vget_high_s16(posSamp.val[0]), vget_low_s16(posCoef)); // r
151 accum2 = vmlal_s16(accum2, vget_low_s16(posSamp.val[1]), vget_high_s16(posCoef)); // r
152 accum2 = vmlal_s16(accum2, vget_high_s16(posSamp.val[1]), vget_low_s16(posCoef)); // r
153 accum = vmlal_s16(accum, vget_low_s16(negSamp.val[0]), vget_low_s16(negCoef));
154 accum = vmlal_s16(accum, vget_high_s16(negSamp.val[0]), vget_high_s16(negCoef));
155 accum2 = vmlal_s16(accum2, vget_low_s16(negSamp.val[1]), vget_low_s16(negCoef));
156 accum2 = vmlal_s16(accum2, vget_high_s16(negSamp.val[1]), vget_high_s16(negCoef));
157 sP -= 16;
158 } break;
159 }
160 } while (count -= 8);
161
162 // multiply by volume and save
163 volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
164 int32x2_t vLR = vld1_s32(volumeLR);
165 int32x2_t outSamp = vld1_s32(out);
166 // combine and funnel down accumulator
167 int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
168 if (CHANNELS == 1) {
169 // duplicate accum to both L and R
170 outAccum = vpadd_s32(outAccum, outAccum);
171 } else if (CHANNELS == 2) {
172 // accum2 contains R, fold in
173 int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
174 outAccum = vpadd_s32(outAccum, outAccum2);
175 }
176 outAccum = vqrdmulh_s32(outAccum, vLR);
177 outSamp = vqadd_s32(outSamp, outAccum);
178 vst1_s32(out, outSamp);
179 }
180
/*
 * NEON-intrinsic FIR dot product with 32-bit coefficients.
 *
 * Same contract as the 16-bit variant above, except:
 *   - coefsP/coefsN/coefsP1/coefsN1 are int32 coefficients, and
 *   - 16-bit samples are widened to 31 bits (vshll #15) so vqrdmulhq_s32
 *     can multiply them against the 32-bit coefficients.
 * lerpP: phase fraction; all 32 bits used here (vdup_n_s32), applied with
 *        vqrdmulhq_lane_s32. Ignored when FIXED is true.
 * vld1q_s32_x2 is used where the toolchain provides it (arm64 ACLE, see the
 * #undef block at the top of this file), otherwise two vld1q_s32 loads.
 */
template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessNeonIntrinsic(int32_t* out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* volumeLR,
        uint32_t lerpP,
        const int32_t* coefsP1,
        const int32_t* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

    // rewind sP to the start of the positive-side window
    sP -= CHANNELS*((STRIDE>>1)-1);
    coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
    coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);

    int32x2_t interp;
    if (!FIXED) {
        interp = vdup_n_s32(lerpP);
        coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
        coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
    }
    int32x4_t accum, accum2;
    // warning uninitialized if we use veorq_s32
    // (alternative to below) accum = veorq_s32(accum, accum);
    accum = vdupq_n_s32(0);
    if (CHANNELS == 2) {
        // (alternative to below) accum2 = veorq_s32(accum2, accum2);
        accum2 = vdupq_n_s32(0); // accum2 accumulates the right channel
    }
    do {
#ifdef vld1q_s32_x2
        int32x4x2_t posCoef = vld1q_s32_x2(coefsP);
        coefsP += 8;
        int32x4x2_t negCoef = vld1q_s32_x2(coefsN);
        coefsN += 8;
#else
        int32x4x2_t posCoef;
        posCoef.val[0] = vld1q_s32(coefsP);
        coefsP += 4;
        posCoef.val[1] = vld1q_s32(coefsP);
        coefsP += 4;
        int32x4x2_t negCoef;
        negCoef.val[0] = vld1q_s32(coefsN);
        coefsN += 4;
        negCoef.val[1] = vld1q_s32(coefsN);
        coefsN += 4;
#endif
        if (!FIXED) { // interpolate
#ifdef vld1q_s32_x2
            int32x4x2_t posCoef1 = vld1q_s32_x2(coefsP1);
            coefsP1 += 8;
            int32x4x2_t negCoef1 = vld1q_s32_x2(coefsN1);
            coefsN1 += 8;
#else
            int32x4x2_t posCoef1;
            posCoef1.val[0] = vld1q_s32(coefsP1);
            coefsP1 += 4;
            posCoef1.val[1] = vld1q_s32(coefsP1);
            coefsP1 += 4;
            int32x4x2_t negCoef1;
            negCoef1.val[0] = vld1q_s32(coefsN1);
            coefsN1 += 4;
            negCoef1.val[1] = vld1q_s32(coefsN1);
            coefsN1 += 4;
#endif

            // posCoef = posCoef + (posCoef1 - posCoef) * interp;
            // negCoef = negCoef1 + (negCoef - negCoef1) * interp (reversed)
            posCoef1.val[0] = vsubq_s32(posCoef1.val[0], posCoef.val[0]);
            posCoef1.val[1] = vsubq_s32(posCoef1.val[1], posCoef.val[1]);
            negCoef.val[0] = vsubq_s32(negCoef.val[0], negCoef1.val[0]);
            negCoef.val[1] = vsubq_s32(negCoef.val[1], negCoef1.val[1]);

            posCoef1.val[0] = vqrdmulhq_lane_s32(posCoef1.val[0], interp, 0);
            posCoef1.val[1] = vqrdmulhq_lane_s32(posCoef1.val[1], interp, 0);
            negCoef.val[0] = vqrdmulhq_lane_s32(negCoef.val[0], interp, 0);
            negCoef.val[1] = vqrdmulhq_lane_s32(negCoef.val[1], interp, 0);

            posCoef.val[0] = vaddq_s32(posCoef.val[0], posCoef1.val[0]);
            posCoef.val[1] = vaddq_s32(posCoef.val[1], posCoef1.val[1]);
            negCoef.val[0] = vaddq_s32(negCoef.val[0], negCoef1.val[0]);
            negCoef.val[1] = vaddq_s32(negCoef.val[1], negCoef1.val[1]);
        }
        switch (CHANNELS) {
        case 1: {
            int16x8_t posSamp = vld1q_s16(sP);
            int16x8_t negSamp = vld1q_s16(sN);
            sN += 8;
            posSamp = vrev64q_s16(posSamp); // reverse within each d register

            // widen samples to 31 bits for the 32x32 fixed point multiply
            int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp), 15);
            int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp), 15);
            int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp), 15);
            int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp), 15);

            // dot product (coefficient halves swapped to finish the reversal)
            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

            accum = vaddq_s32(accum, posSamp0);
            negSamp0 = vaddq_s32(negSamp0, negSamp1);
            accum = vaddq_s32(accum, posSamp1);
            accum = vaddq_s32(accum, negSamp0);

            sP -= 8; // positive side walks backwards
        } break;
        case 2: {
            int16x8x2_t posSamp = vld2q_s16(sP); // de-interleave L (val[0]) / R (val[1])
            int16x8x2_t negSamp = vld2q_s16(sN);
            sN += 16;
            posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
            posSamp.val[1] = vrev64q_s16(posSamp.val[1]);

            // left
            int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[0]), 15);
            int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[0]), 15);
            int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[0]), 15);
            int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[0]), 15);

            // dot product
            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

            accum = vaddq_s32(accum, posSamp0);
            negSamp0 = vaddq_s32(negSamp0, negSamp1);
            accum = vaddq_s32(accum, posSamp1);
            accum = vaddq_s32(accum, negSamp0);

            // right
            posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[1]), 15);
            posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[1]), 15);
            negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[1]), 15);
            negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[1]), 15);

            // dot product
            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

            accum2 = vaddq_s32(accum2, posSamp0);
            negSamp0 = vaddq_s32(negSamp0, negSamp1);
            accum2 = vaddq_s32(accum2, posSamp1);
            accum2 = vaddq_s32(accum2, negSamp0);

            sP -= 16; // positive side walks backwards (stereo frames)
        } break;
        }
    } while (count -= 8);

    // multiply by volume and save
    volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
    int32x2_t vLR = vld1_s32(volumeLR);
    int32x2_t outSamp = vld1_s32(out);
    // combine and funnel down accumulator
    int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
    if (CHANNELS == 1) {
        // duplicate accum to both L and R
        outAccum = vpadd_s32(outAccum, outAccum);
    } else if (CHANNELS == 2) {
        // accum2 contains R, fold in
        int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
        outAccum = vpadd_s32(outAccum, outAccum2);
    }
    outAccum = vqrdmulh_s32(outAccum, vLR); // apply volume
    outSamp = vqadd_s32(outSamp, outAccum); // saturating accumulate into output
    vst1_s32(out, outSamp);
}
355
/*
 * NEON-intrinsic FIR dot product, floating point variant.
 *
 * Same structure as the fixed-point variants above, but samples, coefficients,
 * volumes, and the phase fraction are all float; interpolation and volume use
 * vmla (multiply-accumulate) instead of vqrdmulh, and the final store is a
 * plain (non-saturating) accumulate into *out.
 */
template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessNeonIntrinsic(float* out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* volumeLR,
        float lerpP,
        const float* coefsP1,
        const float* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

    // rewind sP to the start of the positive-side window
    sP -= CHANNELS*((STRIDE>>1)-1);
    coefsP = (const float*)__builtin_assume_aligned(coefsP, 16);
    coefsN = (const float*)__builtin_assume_aligned(coefsN, 16);

    float32x2_t interp;
    if (!FIXED) {
        interp = vdup_n_f32(lerpP);
        coefsP1 = (const float*)__builtin_assume_aligned(coefsP1, 16);
        coefsN1 = (const float*)__builtin_assume_aligned(coefsN1, 16);
    }
    float32x4_t accum, accum2;
    // warning uninitialized if we use veorq_s32
    // (alternative to below) accum = veorq_s32(accum, accum);
    accum = vdupq_n_f32(0);
    if (CHANNELS == 2) {
        // (alternative to below) accum2 = veorq_s32(accum2, accum2);
        accum2 = vdupq_n_f32(0); // accum2 accumulates the right channel
    }
    do {
#ifdef vld1q_f32_x2
        float32x4x2_t posCoef = vld1q_f32_x2(coefsP);
        coefsP += 8;
        float32x4x2_t negCoef = vld1q_f32_x2(coefsN);
        coefsN += 8;
#else
        float32x4x2_t posCoef;
        posCoef.val[0] = vld1q_f32(coefsP);
        coefsP += 4;
        posCoef.val[1] = vld1q_f32(coefsP);
        coefsP += 4;
        float32x4x2_t negCoef;
        negCoef.val[0] = vld1q_f32(coefsN);
        coefsN += 4;
        negCoef.val[1] = vld1q_f32(coefsN);
        coefsN += 4;
#endif
        if (!FIXED) { // interpolate
#ifdef vld1q_f32_x2
            float32x4x2_t posCoef1 = vld1q_f32_x2(coefsP1);
            coefsP1 += 8;
            float32x4x2_t negCoef1 = vld1q_f32_x2(coefsN1);
            coefsN1 += 8;
#else
            float32x4x2_t posCoef1;
            posCoef1.val[0] = vld1q_f32(coefsP1);
            coefsP1 += 4;
            posCoef1.val[1] = vld1q_f32(coefsP1);
            coefsP1 += 4;
            float32x4x2_t negCoef1;
            negCoef1.val[0] = vld1q_f32(coefsN1);
            coefsN1 += 4;
            negCoef1.val[1] = vld1q_f32(coefsN1);
            coefsN1 += 4;
#endif
            // posCoef = posCoef + (posCoef1 - posCoef) * interp;
            // negCoef = negCoef1 + (negCoef - negCoef1) * interp (reversed)
            posCoef1.val[0] = vsubq_f32(posCoef1.val[0], posCoef.val[0]);
            posCoef1.val[1] = vsubq_f32(posCoef1.val[1], posCoef.val[1]);
            negCoef.val[0] = vsubq_f32(negCoef.val[0], negCoef1.val[0]);
            negCoef.val[1] = vsubq_f32(negCoef.val[1], negCoef1.val[1]);

            posCoef.val[0] = vmlaq_lane_f32(posCoef.val[0], posCoef1.val[0], interp, 0);
            posCoef.val[1] = vmlaq_lane_f32(posCoef.val[1], posCoef1.val[1], interp, 0);
            negCoef.val[0] = vmlaq_lane_f32(negCoef1.val[0], negCoef.val[0], interp, 0); // rev
            negCoef.val[1] = vmlaq_lane_f32(negCoef1.val[1], negCoef.val[1], interp, 0); // rev
        }
        switch (CHANNELS) {
        case 1: {
#ifdef vld1q_f32_x2
            float32x4x2_t posSamp = vld1q_f32_x2(sP);
            float32x4x2_t negSamp = vld1q_f32_x2(sN);
            sN += 8;
            sP -= 8; // positive side walks backwards
#else
            float32x4x2_t posSamp;
            posSamp.val[0] = vld1q_f32(sP);
            sP += 4;
            posSamp.val[1] = vld1q_f32(sP);
            sP -= 12; // net -8: positive side walks backwards
            float32x4x2_t negSamp;
            negSamp.val[0] = vld1q_f32(sN);
            sN += 4;
            negSamp.val[1] = vld1q_f32(sN);
            sN += 4;
#endif
            // effectively we want a vrev128q_f32(): vrev64 within each half,
            // then swap the halves with vcombine.
            posSamp.val[0] = vrev64q_f32(posSamp.val[0]);
            posSamp.val[1] = vrev64q_f32(posSamp.val[1]);
            posSamp.val[0] = vcombine_f32(
                    vget_high_f32(posSamp.val[0]), vget_low_f32(posSamp.val[0]));
            posSamp.val[1] = vcombine_f32(
                    vget_high_f32(posSamp.val[1]), vget_low_f32(posSamp.val[1]));

            // coefficient quads are swapped to complete the reversal
            accum = vmlaq_f32(accum, posSamp.val[0], posCoef.val[1]);
            accum = vmlaq_f32(accum, posSamp.val[1], posCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp.val[0], negCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp.val[1], negCoef.val[1]);
        } break;
        case 2: {
            // first 4 stereo frames, de-interleaved into L (val[0]) / R (val[1])
            float32x4x2_t posSamp0 = vld2q_f32(sP);
            sP += 8;
            float32x4x2_t negSamp0 = vld2q_f32(sN);
            sN += 8;
            posSamp0.val[0] = vrev64q_f32(posSamp0.val[0]);
            posSamp0.val[1] = vrev64q_f32(posSamp0.val[1]);
            posSamp0.val[0] = vcombine_f32(
                    vget_high_f32(posSamp0.val[0]), vget_low_f32(posSamp0.val[0]));
            posSamp0.val[1] = vcombine_f32(
                    vget_high_f32(posSamp0.val[1]), vget_low_f32(posSamp0.val[1]));

            // next 4 stereo frames; sP retreats 8 frames net per iteration
            float32x4x2_t posSamp1 = vld2q_f32(sP);
            sP -= 24; // net -16 floats (8 stereo frames) from loop start
            float32x4x2_t negSamp1 = vld2q_f32(sN);
            sN += 8;
            posSamp1.val[0] = vrev64q_f32(posSamp1.val[0]);
            posSamp1.val[1] = vrev64q_f32(posSamp1.val[1]);
            posSamp1.val[0] = vcombine_f32(
                    vget_high_f32(posSamp1.val[0]), vget_low_f32(posSamp1.val[0]));
            posSamp1.val[1] = vcombine_f32(
                    vget_high_f32(posSamp1.val[1]), vget_low_f32(posSamp1.val[1]));

            // Note: speed is affected by accumulation order.
            // Also, speed appears slower using vmul/vadd instead of vmla for
            // stereo case, comparable for mono.

            accum = vmlaq_f32(accum, negSamp0.val[0], negCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp1.val[0], negCoef.val[1]);
            accum2 = vmlaq_f32(accum2, negSamp0.val[1], negCoef.val[0]);
            accum2 = vmlaq_f32(accum2, negSamp1.val[1], negCoef.val[1]);

            accum = vmlaq_f32(accum, posSamp0.val[0], posCoef.val[1]); // reversed
            accum = vmlaq_f32(accum, posSamp1.val[0], posCoef.val[0]); // reversed
            accum2 = vmlaq_f32(accum2, posSamp0.val[1], posCoef.val[1]); // reversed
            accum2 = vmlaq_f32(accum2, posSamp1.val[1], posCoef.val[0]); // reversed
        } break;
        }
    } while (count -= 8);

    // multiply by volume and save
    volumeLR = (const float*)__builtin_assume_aligned(volumeLR, 8);
    float32x2_t vLR = vld1_f32(volumeLR);
    float32x2_t outSamp = vld1_f32(out);
    // combine and funnel down accumulator
    float32x2_t outAccum = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
    if (CHANNELS == 1) {
        // duplicate accum to both L and R
        outAccum = vpadd_f32(outAccum, outAccum);
    } else if (CHANNELS == 2) {
        // accum2 contains R, fold in
        float32x2_t outAccum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2));
        outAccum = vpadd_f32(outAccum, outAccum2);
    }
    outSamp = vmla_f32(outSamp, outAccum, vLR); // volume applied while accumulating
    vst1_f32(out, outSamp);
}
524
/*
 * ProcessL specialization: mono, STRIDE 16, 16-bit coefficients, no phase
 * interpolation.  Dispatches to the intrinsic implementation when available;
 * otherwise uses arm32 inline assembly (q0 = accumulator, q2/q3 = samples,
 * q8/q10 = coefficients; finished by ASSEMBLY_ACCUMULATE_MONO).
 */
template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1); // rewind to start of positive-side window
    asm (
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply (reversed)samples by coef
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed)samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

        // moving these ARM instructions before neon above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q10"
    );
#endif
}
581
/*
 * ProcessL specialization: stereo, STRIDE 16, 16-bit coefficients, no phase
 * interpolation.  Inline-assembly path uses q0 = left accumulator,
 * q4 = right accumulator, q2/q3 and q5/q6 = de-interleaved (vld2) samples,
 * q8/q10 = coefficients; finished by ASSEMBLY_ACCUMULATE_STEREO.
 */
template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1); // rewind to start of positive-side window
    asm (
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo frames
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 samples of positive left
        "vrev64.16      q3, q3                   \n"// (0 combines+) reverse positive right

        "vmlal.s16      q0, d4, d17              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply (reversed) samples left
        "vmlal.s16      q4, d6, d17              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q4, d7, d16              \n"// (1) multiply (reversed) samples right
        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

        // moving these ARM before neon seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q10"
    );
#endif
}
644
/*
 * Process specialization: mono, STRIDE 16, 16-bit coefficients WITH phase
 * interpolation (lerpP blends coefsP/coefsP1 and coefsN/coefsN1 per tap).
 * Inline-assembly path: d2[0] = lerpP, q0 = accumulator, q8/q9 and q10/q11
 * hold the two coefficient sets being interpolated with vqrdmulh.
 */
template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else

    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1); // rewind to start of positive-side window
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase S32 Q15
        "veor           q0, q0, q0               \n"// (0 - combines+) accumulator = 0

        "1:                                      \n"

        "vld1.16        {q2}, [%[sP]]            \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!           \n"// (2) load 8 16-bits mono samples
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coets

        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        "vadd.s16       q8, q8, q9               \n"// (1+2d) interpolate (step3) 1st set
        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set

        // reordering the vmal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16      q0, d4, d17              \n"// (1+0d) multiply reversed samples by coef
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples by coef
        "vmlal.s16      q0, d6, d20              \n"// (1) multiply neg samples
        "vmlal.s16      q0, d7, d21              \n"// (1) multiply neg samples

        // moving these ARM instructions before neon above seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #16        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11"
    );
#endif
}
720
/*
 * Process specialization: stereo, STRIDE 16, 16-bit coefficients WITH phase
 * interpolation.  Inline-assembly path: d2[0] = lerpP, q0/q4 = left/right
 * accumulators, q2/q3 and q5/q6 = de-interleaved (vld2) samples, q8-q11 hold
 * the two coefficient sets being interpolated with vqrdmulh.
 */
template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1); // rewind to start of positive-side window
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
        "veor           q0, q0, q0               \n"// (1) acc_L = 0
        "veor           q4, q4, q4               \n"// (0 combines+) acc_R = 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// (3+0d) load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// (3) load 8 16-bits stereo frames
        "vld1.16        {q8}, [%[coefsP0]:128]!  \n"// (1) load 8 16-bits coefs
        "vld1.16        {q9}, [%[coefsP1]:128]!  \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16        {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16        {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16       q9, q9, q8               \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16       q11, q11, q10            \n"// (1) interpolate (step1) 2nd set of coets

        "vqrdmulh.s16   q9, q9, d2[0]            \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16   q11, q11, d2[0]          \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16      q2, q2                   \n"// (1) reverse 8 samples of positive left
        "vrev64.16      q3, q3                   \n"// (1) reverse 8 samples of positive right

        "vadd.s16       q8, q8, q9               \n"// (1+1d) interpolate (step3) 1st set
        "vadd.s16       q10, q10, q11            \n"// (1+1d) interpolate (step3) 2nd set

        "vmlal.s16      q0, d4, d17              \n"// (1) multiply reversed samples left
        "vmlal.s16      q0, d5, d16              \n"// (1) multiply reversed samples left
        "vmlal.s16      q4, d6, d17              \n"// (1) multiply reversed samples right
        "vmlal.s16      q4, d7, d16              \n"// (1) multiply reversed samples right
        "vmlal.s16      q0, d10, d20             \n"// (1) multiply samples left
        "vmlal.s16      q0, d11, d21             \n"// (1) multiply samples left
        "vmlal.s16      q4, d12, d20             \n"// (1) multiply samples right
        "vmlal.s16      q4, d13, d21             \n"// (1) multiply samples right

        // moving these ARM before neon seems to be slower
        "subs           %[count], %[count], #8   \n"// (1) update loop counter
        "sub            %[sP], %[sP], #32        \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11"
    );
#endif
}
801
/*
 * ProcessL specialization: mono, STRIDE 16, 32-bit coefficients, no phase
 * interpolation.  Inline-assembly path widens the 16-bit samples to 31 bits
 * (vshll #15) so vqrdmulh can multiply them against the 32-bit coefficients;
 * q0 = accumulator, q8-q11 = coefficients, q12-q15 = widened samples/products.
 */
template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1); // rewind to start of positive-side window
    asm (
        "veor           q0, q0, q0                    \n"// result, initialize to 0

        "1:                                           \n"

        "vld1.16        {q2}, [%[sP]]                 \n"// load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!                \n"// load 8 16-bits mono samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!   \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vrev64.16      q2, q2                        \n"// reverse 8 samples of the positive side

        "vshll.s16      q12, d4, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15                  \n"// extend samples to 31 bits

        "vshll.s16      q14, d6, #15                  \n"// extend samples to 31 bits
        "vshll.s16      q15, d7, #15                  \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9                  \n"// multiply samples
        "vqrdmulh.s32   q13, q13, q8                  \n"// multiply samples
        "vqrdmulh.s32   q14, q14, q10                 \n"// multiply samples
        "vqrdmulh.s32   q15, q15, q11                 \n"// multiply samples

        "vadd.s32       q0, q0, q12                   \n"// accumulate result
        "vadd.s32       q13, q13, q14                 \n"// accumulate result
        "vadd.s32       q0, q0, q15                   \n"// accumulate result
        "vadd.s32       q0, q0, q13                   \n"// accumulate result

        "sub            %[sP], %[sP], #16             \n"// move pointer to next set of samples
        "subs           %[count], %[count], #8        \n"// update loop counter

        "bne            1b                            \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}
867
/*
 * ProcessL<2, 16>, int16_t samples, stereo: accumulate one stereo FIR
 * output frame over 'count' taps without coefficient interpolation.
 *
 * Grounded in the code below:
 *  - vld2.16 de-interleaves each 8-frame load into left (q2/q5) and
 *    right (q3/q6) channels; the left accumulator is q0, the right is q4.
 *  - 8 frames are consumed per iteration and the loop exits only at
 *    count == 0, so count is assumed a positive multiple of 8 -- TODO
 *    confirm callers guarantee this.
 *  - coefsP/coefsN must be 128-bit aligned (":128" load specifiers).
 *  - The positive side sP is walked backwards (load without writeback,
 *    then "sub %[sP], #32" = 8 stereo int16 frames); vrev64.16 reverses
 *    the positive-side samples to match the coefficient order.
 *  - vshll #15 widens to Q31 before vqrdmulh against the coefficients;
 *    the exact coefficient Q format is not visible in this chunk.
 *  - ASSEMBLY_ACCUMULATE_STEREO (defined earlier in this header) applies
 *    volumeLR and stores the frame to out.
 */
template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1); // rewind to start of the 8-frame positive window
    asm (
        "veor           q0, q0, q0               \n"// result, initialize to 0
        "veor           q4, q4, q4               \n"// result, initialize to 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// load 8 16-bits stereo frames
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!  \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vrev64.16      q2, q2                   \n"// reverse 8 samples of positive left
        "vrev64.16      q3, q3                   \n"// reverse 8 samples of positive right

        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits

        "vshll.s16      q14, d10, #15            \n"// extend samples to 31 bits
        "vshll.s16      q15, d11, #15            \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9             \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q11            \n"// multiply samples by coef

        "vadd.s32       q0, q0, q12              \n"// accumulate result
        "vadd.s32       q13, q13, q14            \n"// accumulate result
        "vadd.s32       q0, q0, q15              \n"// accumulate result
        "vadd.s32       q0, q0, q13              \n"// accumulate result

        "vshll.s16      q12, d6, #15             \n"// extend samples to 31 bits
        "vshll.s16      q13, d7, #15             \n"// extend samples to 31 bits

        "vshll.s16      q14, d12, #15            \n"// extend samples to 31 bits
        "vshll.s16      q15, d13, #15            \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9             \n"// multiply samples by coef
        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by coef
        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by coef
        "vqrdmulh.s32   q15, q15, q11            \n"// multiply samples by coef

        "vadd.s32       q4, q4, q12              \n"// accumulate result
        "vadd.s32       q13, q13, q14            \n"// accumulate result
        "vadd.s32       q4, q4, q15              \n"// accumulate result
        "vadd.s32       q4, q4, q13              \n"// accumulate result

        "subs           %[count], %[count], #8   \n"// update loop counter
        "sub            %[sP], %[sP], #32        \n"// move pointer to next set of samples

        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}
952
/*
 * Process<1, 16>, int16_t samples, mono: accumulate one FIR output frame
 * over 'count' taps, linearly interpolating between two coefficient sets
 * (coefsP/coefsN and coefsP1/coefsN1) by the phase fraction lerpP.
 *
 * Grounded in the code below:
 *  - lerpP is placed in d2[0] and the interpolation is performed per
 *    iteration in three steps: delta = c1 - c0 (vsub), scaled delta =
 *    vqrdmulh(delta, lerpP), then c = c0 + scaled delta (vadd).
 *  - 8 taps are consumed per iteration; the loop exits only at
 *    count == 0, so count is assumed a positive multiple of 8 -- TODO
 *    confirm callers guarantee this.
 *  - All four coefficient pointers must be 128-bit aligned (":128").
 *  - The positive side sP is walked backwards ("sub %[sP], #16") and its
 *    samples reversed with vrev64.16; samples are widened with vshll #15
 *    before vqrdmulh against the interpolated coefficients. The exact Q
 *    formats of lerpP and the coefficients are not visible in this chunk.
 *  - ASSEMBLY_ACCUMULATE_MONO applies volumeLR and stores to out.
 */
template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1); // rewind to start of the 8-sample positive window
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
        "veor           q0, q0, q0               \n"// result, initialize to 0

        "1:                                      \n"

        "vld1.16        {q2}, [%[sP]]            \n"// load 8 16-bits mono samples
        "vld1.16        {q3}, [%[sN]]!           \n"// load 8 16-bits mono samples
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!  \n"// load 8 32-bits coefs
        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32       q12, q12, q8             \n"// interpolate (step1)
        "vsub.s32       q13, q13, q9             \n"// interpolate (step1)
        "vsub.s32       q14, q14, q10            \n"// interpolate (step1)
        "vsub.s32       q15, q15, q11            \n"// interpolate (step1)

        "vqrdmulh.s32   q12, q12, d2[0]          \n"// interpolate (step2)
        "vqrdmulh.s32   q13, q13, d2[0]          \n"// interpolate (step2)
        "vqrdmulh.s32   q14, q14, d2[0]          \n"// interpolate (step2)
        "vqrdmulh.s32   q15, q15, d2[0]          \n"// interpolate (step2)

        "vadd.s32       q8, q8, q12              \n"// interpolate (step3)
        "vadd.s32       q9, q9, q13              \n"// interpolate (step3)
        "vadd.s32       q10, q10, q14            \n"// interpolate (step3)
        "vadd.s32       q11, q11, q15            \n"// interpolate (step3)

        "vrev64.16      q2, q2                   \n"// reverse 8 samples of the positive side

        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits

        "vshll.s16      q14, d6, #15             \n"// extend samples to 31 bits
        "vshll.s16      q15, d7, #15             \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9             \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11            \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12              \n"// accumulate result
        "vadd.s32       q13, q13, q14            \n"// accumulate result
        "vadd.s32       q0, q0, q15              \n"// accumulate result
        "vadd.s32       q0, q0, q13              \n"// accumulate result

        "sub            %[sP], %[sP], #16        \n"// move pointer to next set of samples
        "subs           %[count], %[count], #8   \n"// update loop counter

        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}
1042
/*
 * Process<2, 16>, int16_t samples, stereo: accumulate one stereo FIR
 * output frame over 'count' taps, linearly interpolating between two
 * coefficient sets (coefsP/coefsN and coefsP1/coefsN1) by lerpP.
 *
 * Grounded in the code below:
 *  - lerpP sits in d2[0]; per-iteration interpolation is delta = c1 - c0
 *    (vsub), vqrdmulh(delta, lerpP), then c0 + scaled delta (vadd).
 *  - vld2.16 de-interleaves each 8-frame load into left (q2/q5) and
 *    right (q3/q6); left accumulates in q0, right in q4. The interpolated
 *    coefficients in q8-q11 are reused for both channels.
 *  - 8 frames are consumed per iteration and the loop exits only at
 *    count == 0, so count is assumed a positive multiple of 8 -- TODO
 *    confirm callers guarantee this.
 *  - All four coefficient pointers must be 128-bit aligned (":128").
 *  - sP is walked backwards ("sub %[sP], #32" = 8 stereo int16 frames)
 *    with vrev64.16 reversing the positive-side samples; vshll #15 widens
 *    to Q31 before the coefficient multiplies. Exact Q formats of lerpP
 *    and the coefficients are not visible in this chunk.
 *  - ASSEMBLY_ACCUMULATE_STEREO applies volumeLR and stores to out.
 */
template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1); // rewind to start of the 8-frame positive window
    asm (
        "vmov.32        d2[0], %[lerpP]          \n"// load the positive phase
        "veor           q0, q0, q0               \n"// result, initialize to 0
        "veor           q4, q4, q4               \n"// result, initialize to 0

        "1:                                      \n"

        "vld2.16        {q2, q3}, [%[sP]]        \n"// load 8 16-bits stereo frames
        "vld2.16        {q5, q6}, [%[sN]]!       \n"// load 8 16-bits stereo frames
        "vld1.32        {q8, q9}, [%[coefsP0]:128]!  \n"// load 8 32-bits coefs
        "vld1.32        {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32        {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32       q12, q12, q8             \n"// interpolate (step1)
        "vsub.s32       q13, q13, q9             \n"// interpolate (step1)
        "vsub.s32       q14, q14, q10            \n"// interpolate (step1)
        "vsub.s32       q15, q15, q11            \n"// interpolate (step1)

        "vqrdmulh.s32   q12, q12, d2[0]          \n"// interpolate (step2)
        "vqrdmulh.s32   q13, q13, d2[0]          \n"// interpolate (step2)
        "vqrdmulh.s32   q14, q14, d2[0]          \n"// interpolate (step2)
        "vqrdmulh.s32   q15, q15, d2[0]          \n"// interpolate (step2)

        "vadd.s32       q8, q8, q12              \n"// interpolate (step3)
        "vadd.s32       q9, q9, q13              \n"// interpolate (step3)
        "vadd.s32       q10, q10, q14            \n"// interpolate (step3)
        "vadd.s32       q11, q11, q15            \n"// interpolate (step3)

        "vrev64.16      q2, q2                   \n"// reverse 8 samples of positive left
        "vrev64.16      q3, q3                   \n"// reverse 8 samples of positive right

        "vshll.s16      q12, d4, #15             \n"// extend samples to 31 bits
        "vshll.s16      q13, d5, #15             \n"// extend samples to 31 bits

        "vshll.s16      q14, d10, #15            \n"// extend samples to 31 bits
        "vshll.s16      q15, d11, #15            \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9             \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11            \n"// multiply samples by interpolated coef

        "vadd.s32       q0, q0, q12              \n"// accumulate result
        "vadd.s32       q13, q13, q14            \n"// accumulate result
        "vadd.s32       q0, q0, q15              \n"// accumulate result
        "vadd.s32       q0, q0, q13              \n"// accumulate result

        "vshll.s16      q12, d6, #15             \n"// extend samples to 31 bits
        "vshll.s16      q13, d7, #15             \n"// extend samples to 31 bits

        "vshll.s16      q14, d12, #15            \n"// extend samples to 31 bits
        "vshll.s16      q15, d13, #15            \n"// extend samples to 31 bits

        "vqrdmulh.s32   q12, q12, q9             \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q13, q13, q8             \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q14, q14, q10            \n"// multiply samples by interpolated coef
        "vqrdmulh.s32   q15, q15, q11            \n"// multiply samples by interpolated coef

        "vadd.s32       q4, q4, q12              \n"// accumulate result
        "vadd.s32       q13, q13, q14            \n"// accumulate result
        "vadd.s32       q4, q4, q15              \n"// accumulate result
        "vadd.s32       q4, q4, q13              \n"// accumulate result

        "subs           %[count], %[count], #8   \n"// update loop counter
        "sub            %[sP], %[sP], #32        \n"// move pointer to next set of samples

        "bne            1b                       \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out]     "=Uv" (out[0]),
          [count]   "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP]      "+r" (sP),
          [sN]      "+r" (sN)
        : [lerpP]   "r" (lerpP),
          [vLR]     "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}
1151
1152 template<>
1153 inline void ProcessL<1, 16>(float* const out,
1154 int count,
1155 const float* coefsP,
1156 const float* coefsN,
1157 const float* sP,
1158 const float* sN,
1159 const float* const volumeLR)
1160 {
1161 ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
1162 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
1163 }
1164
1165 template<>
1166 inline void ProcessL<2, 16>(float* const out,
1167 int count,
1168 const float* coefsP,
1169 const float* coefsN,
1170 const float* sP,
1171 const float* sN,
1172 const float* const volumeLR)
1173 {
1174 ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
1175 0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
1176 }
1177
1178 template<>
1179 inline void Process<1, 16>(float* const out,
1180 int count,
1181 const float* coefsP,
1182 const float* coefsN,
1183 const float* coefsP1,
1184 const float* coefsN1,
1185 const float* sP,
1186 const float* sN,
1187 float lerpP,
1188 const float* const volumeLR)
1189 {
1190 ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
1191 lerpP, coefsP1, coefsN1);
1192 }
1193
1194 template<>
1195 inline void Process<2, 16>(float* const out,
1196 int count,
1197 const float* coefsP,
1198 const float* coefsN,
1199 const float* coefsP1,
1200 const float* coefsN1,
1201 const float* sP,
1202 const float* sN,
1203 float lerpP,
1204 const float* const volumeLR)
1205 {
1206 ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
1207 lerpP, coefsP1, coefsN1);
1208 }
1209
1210 #endif //USE_NEON
1211
1212 } // namespace android
1213
1214 #endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/
1215