1 /***
2 This file is part of PulseAudio.
3
4 Copyright 2013 Peter Meerwald <p.meerwald@bct-electronic.com>
5
6 PulseAudio is free software; you can redistribute it and/or modify
7 it under the terms of the GNU Lesser General Public License as published
8 by the Free Software Foundation; either version 2.1 of the License,
9 or (at your option) any later version.
10
11 PulseAudio is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
15 ***/
16
17 #ifdef HAVE_CONFIG_H
18 #include <config.h>
19 #endif
20
21 #include <pulse/sample.h>
22 #include <pulse/xmalloc.h>
23 #include <pulsecore/log.h>
24 #include <pulsecore/macro.h>
25
26 #include "cpu-arm.h"
27 #include "remap.h"
28
29 #include <arm_neon.h>
30
/* Mono -> stereo upmix for native-endian float32 samples (NEON variant that
 * init_remap_neon() installs when the PA_CPU_ARM_CORTEX_A8 flag is set).
 *
 * Reads n samples from src and writes every sample twice to dst, so dst
 * receives 2*n samples. m is not used. */
static void remap_mono_to_stereo_float32ne_neon_a8(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    /* Vector part: load 4 samples, duplicate the register and let the
     * interleaving store emit them as 4 identical left/right pairs. */
    while (n >= 4) {
        __asm__ __volatile__ (
            "vld1.32 {q0}, [%[src]]! \n\t"
            "vmov q1, q0 \n\t"
            "vst2.32 {q0,q1}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* both advanced by the post-increment */
            : /* no input-only operands */
            : "memory", "q0", "q1"
        );
        n -= 4;
    }

    /* Scalar part: the remaining 0..3 samples. */
    while (n > 0) {
        const float sample = *src++;

        dst[1] = sample;
        dst[0] = sample;
        dst += 2;
        n--;
    }
}
49
/* Mono -> stereo upmix for native-endian float32 samples using only core ARM
 * load/store-multiple instructions (no NEON registers are touched).
 *
 * Reads n samples from src and writes every sample twice to dst. The samples
 * are moved as opaque 32-bit words. m is not used. */
static void remap_mono_to_stereo_float32ne_generic_arm(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    /* Two input words per pass: r4/r6 are loaded, copied into r5/r12 and the
     * four registers are stored back-to-back as L0 R0 L1 R1. */
    while (n >= 2) {
        __asm__ __volatile__ (
            "ldm %[src]!, {r4,r6} \n\t"
            "mov r5, r4 \n\t"

            /* r12 is used rather than r7 because r7 is reserved for the
             * frame pointer when building for Thumb. */
            "mov r12, r6 \n\t"

            "stm %[dst]!, {r4-r6,r12} \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* both advanced by the write-back */
            : /* no input-only operands */
            : "memory", "r4", "r5", "r6", "r12"
        );
        n -= 2;
    }

    /* At most one sample can be left over. */
    if (n > 0) {
        const float last = src[0];

        dst[1] = last;
        dst[0] = last;
    }
}
70
/* Mono -> stereo upmix for native-endian signed 16-bit samples.
 *
 * Reads n samples from src and writes every sample twice to dst, so dst
 * receives 2*n samples. m is not used. */
static void remap_mono_to_stereo_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    /* Vector part: 8 samples per pass, duplicated through an interleaving
     * two-register store. */
    while (n >= 8) {
        __asm__ __volatile__ (
            "vld1.16 {q0}, [%[src]]! \n\t"
            "vmov q1, q0 \n\t"
            "vst2.16 {q0,q1}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* both advanced by the post-increment */
            : /* no input-only operands */
            : "memory", "q0", "q1"
        );
        n -= 8;
    }

    /* Scalar part: the remaining 0..7 samples. */
    while (n > 0) {
        const int16_t sample = *src++;

        dst[1] = sample;
        dst[0] = sample;
        dst += 2;
        n--;
    }
}
89
/* Mono -> 4-channel upmix for native-endian float32 samples.
 *
 * Reads n samples from src and writes every sample four times to dst, so
 * dst receives 4*n samples. m is not used. */
static void remap_mono_to_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    /* Vector part: two input samples per pass, each broadcast into its own
     * quad register and stored as one 4-channel frame. */
    while (n >= 2) {
        __asm__ __volatile__ (
            "vld1.32 {d0}, [%[src]]! \n\t"
            "vdup.f32 q1, d0[0] \n\t"
            "vdup.f32 q2, d0[1] \n\t"
            "vst1.32 {q1,q2}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* both advanced by the post-increment */
            : /* no input-only operands */
            : "memory", "q0", "q1", "q2"
        );
        n -= 2;
    }

    /* At most one sample can be left over after the loop above. */
    if (n != 0) {
        const float last = src[0];

        dst[3] = last;
        dst[2] = last;
        dst[1] = last;
        dst[0] = last;
    }
}
106
/* Mono -> 4-channel upmix for native-endian signed 16-bit samples.
 *
 * Reads n samples from src and writes every sample four times to dst, so
 * dst receives 4*n samples. m is not used. */
static void remap_mono_to_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    /* Vector part: four input samples per pass. Lanes 1..3 are broadcast
     * into d1..d3 first; lane 0 is broadcast last because it overwrites d0,
     * the register still holding the loaded samples. */
    while (n >= 4) {
        __asm__ __volatile__ (
            "vld1.16 {d0}, [%[src]]! \n\t"
            "vdup.s16 d1, d0[1] \n\t"
            "vdup.s16 d2, d0[2] \n\t"
            "vdup.s16 d3, d0[3] \n\t"
            "vdup.s16 d0, d0[0] \n\t"
            "vst1.16 {d0,d1,d2,d3}, [%[dst]]!\n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* both advanced by the post-increment */
            : /* no input-only operands */
            : "memory", "d0", "d1", "d2", "d3"
        );
        n -= 4;
    }

    /* Scalar part: the remaining 0..3 samples. */
    while (n > 0) {
        const int16_t sample = *src++;

        dst[3] = sample;
        dst[2] = sample;
        dst[1] = sample;
        dst[0] = sample;
        dst += 4;
        n--;
    }
}
128
/* Stereo -> mono downmix for native-endian float32 samples.
 *
 * Writes n samples to dst; each is (left + right) * 0.5 of one stereo pair
 * read from src, so 2*n samples are consumed. m is not used. */
static void remap_stereo_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    /* The 0.5 factor, replicated into all four lanes for the vector multiply. */
    const float32x4_t half_q = vdupq_n_f32(0.5f);

    /* Vector part: the de-interleaving load puts four left samples in q0 and
     * four right samples in q1; four mono samples are produced per pass. */
    while (n >= 4) {
        __asm__ __volatile__ (
            "vld2.32 {q0,q1}, [%[src]]! \n\t"
            "vadd.f32 q0, q0, q1 \n\t"
            "vmul.f32 q0, q0, %q[halve] \n\t"
            "vst1.32 {q0}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* both advanced by the post-increment */
            : [halve] "w" (half_q)
            : "memory", "q0", "q1"
        );
        n -= 4;
    }

    /* Scalar part: the remaining 0..3 output samples. */
    while (n > 0) {
        *dst++ = (src[0] + src[1])*0.5f;
        src += 2;
        n--;
    }
}
149
/* Stereo -> mono downmix for native-endian signed 32-bit samples.
 *
 * Writes n samples to dst, each the average of one stereo pair read from
 * src (2*n samples consumed). m is not used.
 *
 * Note: the vector path averages with a rounding halving add, whereas the
 * scalar tail halves each operand with C integer division before adding, so
 * the two paths can differ in the least significant bit. */
static void remap_stereo_to_mono_s32ne_neon(pa_remap_t *m, int32_t *dst, const int32_t *src, unsigned n) {
    /* Vector part: four output samples per pass. */
    while (n >= 4) {
        __asm__ __volatile__ (
            "vld2.32 {q0,q1}, [%[src]]! \n\t"
            "vrhadd.s32 q0, q0, q1 \n\t"
            "vst1.32 {q0}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* both advanced by the post-increment */
            : /* no input-only operands */
            : "memory", "q0", "q1"
        );
        n -= 4;
    }

    /* Scalar part: the remaining 0..3 output samples. Halving before the
     * addition keeps the intermediate within the 32-bit range. */
    while (n > 0) {
        *dst++ = src[0]/2 + src[1]/2;
        src += 2;
        n--;
    }
}
168
/* Stereo -> mono downmix for native-endian signed 16-bit samples.
 *
 * Writes n samples to dst, each the average of one stereo pair read from
 * src (2*n samples consumed). m is not used.
 *
 * Note: the vector path uses a rounding halving add while the scalar tail
 * uses truncating C division, so the two can differ in the last bit. */
static void remap_stereo_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    /* Vector part: eight output samples per pass. */
    while (n >= 8) {
        __asm__ __volatile__ (
            "vld2.16 {q0,q1}, [%[src]]! \n\t"
            "vrhadd.s16 q0, q0, q1 \n\t"
            "vst1.16 {q0}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* both advanced by the post-increment */
            : /* no input-only operands */
            : "memory", "q0", "q1"
        );
        n -= 8;
    }

    /* Scalar part: the remaining 0..7 output samples; the sum is formed in
     * int after the usual integer promotions. */
    while (n > 0) {
        const int sum = src[0] + src[1];

        *dst++ = sum / 2;
        src += 2;
        n--;
    }
}
187
/* 4-channel -> mono downmix for native-endian float32 samples.
 *
 * Writes n samples to dst; each is the sum of one 4-channel frame read from
 * src multiplied by 0.25 (4*n samples consumed). m is not used. */
static void remap_ch4_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    /* The 0.25 factor, replicated into both lanes of a double register. */
    const float32x2_t quarter_d = vdup_n_f32(0.25f);

    /* Vector part: the 4-way de-interleaving load splits two frames into
     * per-channel registers d0..d3, which are summed pairwise and scaled. */
    while (n >= 2) {
        __asm__ __volatile__ (
            "vld4.32 {d0,d1,d2,d3}, [%[src]]!\n\t"
            "vadd.f32 d0, d0, d1 \n\t"
            "vadd.f32 d2, d2, d3 \n\t"
            "vadd.f32 d0, d0, d2 \n\t"
            "vmul.f32 d0, d0, %P[quart] \n\t"
            "vst1.32 {d0}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* both advanced by the post-increment */
            : [quart] "w" (quarter_d)
            : "memory", "d0", "d1", "d2", "d3"
        );
        n -= 2;
    }

    /* At most one frame can be left over. */
    if (n == 0)
        return;

    dst[0] = (src[0] + src[1] + src[2] + src[3])*0.25f;
}
207
/* 4-channel -> mono downmix for native-endian signed 16-bit samples.
 *
 * Writes n samples to dst, each the average of one 4-channel frame read
 * from src (4*n samples consumed). m is not used.
 *
 * Note: the vector path averages with two stages of rounding halving adds,
 * the scalar tail divides the int sum by 4 with truncation; results may
 * therefore differ slightly between the two paths. */
static void remap_ch4_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    /* Vector part: four frames per pass, de-interleaved into d0..d3. */
    while (n >= 4) {
        __asm__ __volatile__ (
            "vld4.16 {d0,d1,d2,d3}, [%[src]]!\n\t"
            "vrhadd.s16 d0, d0, d1 \n\t"
            "vrhadd.s16 d2, d2, d3 \n\t"
            "vrhadd.s16 d0, d0, d2 \n\t"
            "vst1.16 {d0}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* both advanced by the post-increment */
            : /* no input-only operands */
            : "memory", "d0", "d1", "d2", "d3"
        );
        n -= 4;
    }

    /* Scalar part: the remaining 0..3 frames. */
    while (n > 0) {
        const int sum = src[0] + src[1] + src[2] + src[3];

        *dst++ = sum / 4;
        src += 4;
        n--;
    }
}
228
/* General 4-channel -> 4-channel matrix remap for native-endian signed
 * 16-bit samples.
 *
 * m->state must point to four int32x4_t weight vectors as prepared by
 * init_remap_neon(): vector i holds, in lane o, the weight of input channel
 * i towards output channel o (clamped there to 0..0x10000; the final shift
 * by 16 makes 0x10000 act as unity).
 *
 * Processes n frames, one per pass: 4 samples are read from src and 4 are
 * written to dst. */
static void remap_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    const int32x4_t *weights = m->state;
    const int32x4_t f0 = weights[0];
    const int32x4_t f1 = weights[1];
    const int32x4_t f2 = weights[2];
    const int32x4_t f3 = weights[3];

    while (n > 0) {
        /* Widen the frame to 32 bit (q0 = d0:d1), then accumulate
         * in[i] * f<i> over the four input channels in q1 and narrow the
         * result back to 16 bit with a saturating shift right by 16. */
        __asm__ __volatile__ (
            "vld1.16 {d0}, [%[src]]! \n\t"
            "vmovl.s16 q0, d0 \n\t"
            "vdup.s32 q1, d0[0] \n\t"
            "vmul.s32 q1, q1, %q[f0] \n\t"
            "vdup.s32 q2, d0[1] \n\t"
            "vmla.s32 q1, q2, %q[f1] \n\t"
            "vdup.s32 q2, d1[0] \n\t"
            "vmla.s32 q1, q2, %q[f2] \n\t"
            "vdup.s32 q2, d1[1] \n\t"
            "vmla.s32 q1, q2, %q[f3] \n\t"
            "vqshrn.s32 d2, q1, #16 \n\t"
            "vst1.32 {d2}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src)
            : [f0] "w" (f0), [f1] "w" (f1), [f2] "w" (f2), [f3] "w" (f3)
            : "memory", "q0", "q1", "q2"
        );
        n--;
    }
}
253
/* General 4-channel -> 4-channel matrix remap for native-endian float32
 * samples.
 *
 * m->state must point to four float32x4_t weight vectors as prepared by
 * init_remap_neon(): vector i holds, in lane o, the weight of input channel
 * i towards output channel o.
 *
 * Processes n frames, one per pass: 4 samples are read from src and 4 are
 * written to dst. */
static void remap_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    const float32x4_t *weights = m->state;
    const float32x4_t f0 = weights[0];
    const float32x4_t f1 = weights[1];
    const float32x4_t f2 = weights[2];
    const float32x4_t f3 = weights[3];

    while (n > 0) {
        /* Load one frame into d0:d1, broadcast each input sample in turn and
         * accumulate in[i] * f<i> in q1 (= d2:d3), which is then stored. */
        __asm__ __volatile__ (
            "vld1.32 {d0,d1}, [%[src]]! \n\t"
            "vdup.f32 q1, d0[0] \n\t"
            "vmul.f32 q1, q1, %q[f0] \n\t"
            "vdup.f32 q2, d0[1] \n\t"
            "vmla.f32 q1, q2, %q[f1] \n\t"
            "vdup.f32 q2, d1[0] \n\t"
            "vmla.f32 q1, q2, %q[f2] \n\t"
            "vdup.f32 q2, d1[1] \n\t"
            "vmla.f32 q1, q2, %q[f3] \n\t"
            "vst1.32 {d2,d3}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src)
            : [f0] "w" (f0), [f1] "w" (f1), [f2] "w" (f2), [f3] "w" (f3)
            : "memory", "q0", "q1", "q2"
        );
        n--;
    }
}
276
/* Stereo -> stereo channel rearrangement for native-endian signed 16-bit
 * samples, done as a byte permutation with vtbl.
 *
 * m->state must point to the 8-byte index table built by init_remap_neon();
 * it covers two consecutive frames. Processes n frames from src into dst. */
static void remap_arrange_stereo_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    const uint8x8_t t = *(uint8x8_t *) m->state;

    /* Two frames (8 bytes) per pass. */
    while (n >= 2) {
        __asm__ __volatile__ (
            "vld1.s16 d0, [%[src]]! \n\t"
            "vtbl.8 d0, {d0}, %P[t] \n\t"
            "vst1.s16 d0, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* both advanced by the post-increment */
            : [t] "w" (t)
            : "memory", "d0"
        );
        n -= 2;
    }

    if (n == 0)
        return;

    /* One frame left: move only a single 32-bit lane in and out, so that no
     * memory beyond that frame is read or written. */
    __asm__ __volatile__ (
        "vld1.32 d0[0], [%[src]]! \n\t"
        "vtbl.8 d0, {d0}, %P[t] \n\t"
        "vst1.32 d0[0], [%[dst]]! \n\t"
        : [dst] "+r" (dst), [src] "+r" (src) /* both advanced by the post-increment */
        : [t] "w" (t)
        : "memory", "d0"
    );
}
302
/* 2-channel -> 4-channel rearrangement for native-endian signed 16-bit
 * samples, done as a byte permutation with vtbl.
 *
 * m->state must point to the 8-byte index table built by init_remap_neon().
 * Processes n frames: each pass reads one 2-sample frame (32 bits) from src
 * and writes one 4-sample frame (64 bits) to dst. */
static void remap_arrange_ch2_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    const uint8x8_t t = *(uint8x8_t *) m->state;

    while (n > 0) {
        __asm__ __volatile__ (
            "vld1.32 d0[0], [%[src]]! \n\t"
            "vtbl.8 d0, {d0}, %P[t] \n\t"
            "vst1.s16 d0, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* both advanced by the post-increment */
            : [t] "w" (t)
            : "memory", "d0"
        );
        n--;
    }
}
317
/* 4-channel -> 4-channel rearrangement for native-endian signed 16-bit
 * samples, done as a byte permutation with vtbl.
 *
 * m->state must point to the 8-byte index table built by init_remap_neon().
 * Processes n frames; each pass permutes one 4-sample (64-bit) frame. */
static void remap_arrange_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    const uint8x8_t t = *(uint8x8_t *) m->state;

    while (n > 0) {
        __asm__ __volatile__ (
            "vld1.s16 d0, [%[src]]! \n\t"
            "vtbl.8 d0, {d0}, %P[t] \n\t"
            "vst1.s16 d0, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* both advanced by the post-increment */
            : [t] "w" (t)
            : "memory", "d0"
        );
        n--;
    }
}
332
/* Stereo -> stereo channel rearrangement for native-endian float32 samples,
 * done as a byte permutation with vtbl (sample values are never interpreted).
 *
 * m->state must point to the index tables built by init_remap_neon(); only
 * the first 8-byte table is used here. Processes n frames; each pass
 * permutes one 2-sample (64-bit) frame. */
static void remap_arrange_stereo_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    const uint8x8_t t = *(uint8x8_t *) m->state;

    while (n > 0) {
        __asm__ __volatile__ (
            "vld1.f32 d0, [%[src]]! \n\t"
            "vtbl.8 d0, {d0}, %P[t] \n\t"
            "vst1.s16 {d0}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* both advanced by the post-increment */
            : [t] "w" (t)
            : "memory", "d0"
        );
        n--;
    }
}
347
348 /* Works for both S32NE and FLOAT32NE */
remap_arrange_ch2_ch4_any32ne_neon(pa_remap_t * m,float * dst,const float * src,unsigned n)349 static void remap_arrange_ch2_ch4_any32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
350 const uint8x8_t t0 = ((uint8x8_t *)m->state)[0];
351 const uint8x8_t t1 = ((uint8x8_t *)m->state)[1];
352
353 for (; n > 0; n--) {
354 __asm__ __volatile__ (
355 "vld1.f32 d0, [%[src]]! \n\t"
356 "vtbl.8 d1, {d0}, %P[t0] \n\t"
357 "vtbl.8 d2, {d0}, %P[t1] \n\t"
358 "vst1.s16 {d1,d2}, [%[dst]]! \n\t"
359 : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
360 : [t0] "w" (t0), [t1] "w" (t1) /* input operands */
361 : "memory", "d0", "d1", "d2" /* clobber list */
362 );
363 }
364 }
365
/* 4-channel -> 4-channel rearrangement for native-endian float32 samples,
 * done as a byte permutation with vtbl over a two-register (16-byte) table
 * source.
 *
 * m->state must point to the two 8-byte index tables built by
 * init_remap_neon(): the first yields output samples 0 and 1, the second
 * output samples 2 and 3. Processes n frames; each pass permutes one
 * 4-sample (128-bit) frame. */
static void remap_arrange_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    const uint8x8_t *tables = (uint8x8_t *) m->state;
    const uint8x8_t t0 = tables[0];
    const uint8x8_t t1 = tables[1];

    while (n > 0) {
        __asm__ __volatile__ (
            "vld1.f32 {d0,d1}, [%[src]]! \n\t"
            "vtbl.8 d2, {d0,d1}, %P[t0] \n\t"
            "vtbl.8 d3, {d0,d1}, %P[t1] \n\t"
            "vst1.s16 {d2,d3}, [%[dst]]! \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* both advanced by the post-increment */
            : [t0] "w" (t0), [t1] "w" (t1)
            : "memory", "d0", "d1", "d2", "d3"
        );
        n--;
    }
}
382
383 static pa_cpu_arm_flag_t arm_flags;
384
init_remap_neon(pa_remap_t * m)385 static void init_remap_neon(pa_remap_t *m) {
386 unsigned n_oc, n_ic;
387 int8_t arrange[PA_CHANNELS_MAX];
388
389 n_oc = m->o_ss.channels;
390 n_ic = m->i_ss.channels;
391
392 /* We short-circuit remap function selection for S32NE in most
393 * cases as the corresponding generic C code is performing
394 * similarly or even better. However there are a few cases where
395 * there actually is a significant improvement from using
396 * hand-crafted NEON assembly so we cannot just bail out for S32NE
397 * here. */
398 if (n_ic == 1 && n_oc == 2 &&
399 m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
400 if (m->format == PA_SAMPLE_S32NE)
401 return;
402 if (arm_flags & PA_CPU_ARM_CORTEX_A8) {
403
404 pa_log_info("Using ARM NEON/A8 mono to stereo remapping");
405 pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
406 NULL, (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_neon_a8);
407 }
408 else {
409 pa_log_info("Using ARM NEON mono to stereo remapping");
410 pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
411 NULL, (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_generic_arm);
412 }
413 } else if (n_ic == 1 && n_oc == 4 &&
414 m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000 &&
415 m->map_table_i[2][0] == 0x10000 && m->map_table_i[3][0] == 0x10000) {
416
417 if (m->format == PA_SAMPLE_S32NE)
418 return;
419 pa_log_info("Using ARM NEON mono to 4-channel remapping");
420 pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_ch4_s16ne_neon,
421 NULL, (pa_do_remap_func_t) remap_mono_to_ch4_float32ne_neon);
422 } else if (n_ic == 2 && n_oc == 1 &&
423 m->map_table_i[0][0] == 0x8000 && m->map_table_i[0][1] == 0x8000) {
424
425 pa_log_info("Using ARM NEON stereo to mono remapping");
426 pa_set_remap_func(m, (pa_do_remap_func_t) remap_stereo_to_mono_s16ne_neon,
427 (pa_do_remap_func_t) remap_stereo_to_mono_s32ne_neon,
428 (pa_do_remap_func_t) remap_stereo_to_mono_float32ne_neon);
429 } else if (n_ic == 4 && n_oc == 1 &&
430 m->map_table_i[0][0] == 0x4000 && m->map_table_i[0][1] == 0x4000 &&
431 m->map_table_i[0][2] == 0x4000 && m->map_table_i[0][3] == 0x4000) {
432
433 if (m->format == PA_SAMPLE_S32NE)
434 return;
435 pa_log_info("Using ARM NEON 4-channel to mono remapping");
436 pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_to_mono_s16ne_neon,
437 NULL, (pa_do_remap_func_t) remap_ch4_to_mono_float32ne_neon);
438 } else if (pa_setup_remap_arrange(m, arrange) &&
439 ((n_ic == 2 && n_oc == 2) ||
440 (n_ic == 2 && n_oc == 4) ||
441 (n_ic == 4 && n_oc == 4))) {
442 unsigned o;
443
444 if (n_ic == 2 && n_oc == 2) {
445 if (m->format == PA_SAMPLE_S32NE)
446 return;
447 pa_log_info("Using NEON stereo arrange remapping");
448 pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_stereo_s16ne_neon,
449 NULL, (pa_do_remap_func_t) remap_arrange_stereo_float32ne_neon);
450 } else if (n_ic == 2 && n_oc == 4) {
451 pa_log_info("Using NEON 2-channel to 4-channel arrange remapping");
452 pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch2_ch4_s16ne_neon,
453 (pa_do_remap_func_t) remap_arrange_ch2_ch4_any32ne_neon,
454 (pa_do_remap_func_t) remap_arrange_ch2_ch4_any32ne_neon);
455 } else if (n_ic == 4 && n_oc == 4) {
456 if (m->format == PA_SAMPLE_S32NE)
457 return;
458 pa_log_info("Using NEON 4-channel arrange remapping");
459 pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch4_s16ne_neon,
460 NULL, (pa_do_remap_func_t) remap_arrange_ch4_float32ne_neon);
461 }
462
463 /* setup state */
464 switch (m->format) {
465 case PA_SAMPLE_S16NE: {
466 uint8x8_t *t = m->state = pa_xnew0(uint8x8_t, 1);
467 for (o = 0; o < 4; o++) {
468 if (arrange[o % n_oc] >= 0) {
469 /* convert channel index to vtbl indices */
470 unsigned frame = o / n_oc;
471 ((uint8_t *) t)[o * 2 + 0] = (frame * n_oc + arrange[o % n_oc]) * 2 + 0;
472 ((uint8_t *) t)[o * 2 + 1] = (frame * n_oc + arrange[o % n_oc]) * 2 + 1;
473 } else {
474 /* use invalid table indices to map to 0 */
475 ((uint8_t *) t)[o * 2 + 0] = 0xff;
476 ((uint8_t *) t)[o * 2 + 1] = 0xff;
477 }
478 }
479 break;
480 }
481 case PA_SAMPLE_S32NE:
482 /* fall-through */
483 case PA_SAMPLE_FLOAT32NE: {
484 uint8x8_t *t = m->state = pa_xnew0(uint8x8_t, 2);
485 for (o = 0; o < n_oc; o++) {
486 if (arrange[o] >= 0) {
487 /* convert channel index to vtbl indices */
488 ((uint8_t *) t)[o * 4 + 0] = arrange[o] * 4 + 0;
489 ((uint8_t *) t)[o * 4 + 1] = arrange[o] * 4 + 1;
490 ((uint8_t *) t)[o * 4 + 2] = arrange[o] * 4 + 2;
491 ((uint8_t *) t)[o * 4 + 3] = arrange[o] * 4 + 3;
492 } else {
493 /* use invalid table indices to map to 0 */
494 ((uint8_t *) t)[o * 4 + 0] = 0xff;
495 ((uint8_t *) t)[o * 4 + 1] = 0xff;
496 ((uint8_t *) t)[o * 4 + 2] = 0xff;
497 ((uint8_t *) t)[o * 4 + 3] = 0xff;
498 }
499 }
500 break;
501 }
502 default:
503 pa_assert_not_reached();
504 }
505 } else if (n_ic == 4 && n_oc == 4) {
506 unsigned i, o;
507
508 if (m->format == PA_SAMPLE_S32NE)
509 return;
510 pa_log_info("Using ARM NEON 4-channel remapping");
511 pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_s16ne_neon,
512 (pa_do_remap_func_t) NULL,
513 (pa_do_remap_func_t) remap_ch4_float32ne_neon);
514
515 /* setup state */
516 switch (m->format) {
517 case PA_SAMPLE_S16NE: {
518 int32x4_t *f = m->state = pa_xnew0(int32x4_t, 4);
519 for (o = 0; o < 4; o++) {
520 for (i = 0; i < 4; i++) {
521 ((int *) &f[i])[o] = PA_CLAMP_UNLIKELY(m->map_table_i[o][i], 0, 0x10000);
522 }
523 }
524 break;
525 }
526 case PA_SAMPLE_FLOAT32NE: {
527 float32x4_t *f = m->state = pa_xnew0(float32x4_t, 4);
528 for (o = 0; o < 4; o++) {
529 for (i = 0; i < 4; i++) {
530 ((float *) &f[i])[o] = PA_CLAMP_UNLIKELY(m->map_table_f[o][i], 0.0f, 1.0f);
531 }
532 }
533 break;
534 }
535 default:
536 pa_assert_not_reached();
537 }
538 }
539 }
540
/* Entry point of the NEON remapper: remembers the detected CPU flags for
 * later use by init_remap_neon() and registers init_remap_neon() as the
 * remap initialisation function via pa_set_init_remap_func(). */
void pa_remap_func_init_neon(pa_cpu_arm_flag_t flags) {
    /* Stored first so the flags are in place before the callback is registered. */
    arm_flags = flags;

    pa_log_info("Initialising ARM NEON optimized remappers.");

    pa_set_init_remap_func((pa_init_remap_func_t) init_remap_neon);
}
546