/***
  This file is part of PulseAudio.

  Copyright 2013 Peter Meerwald <p.meerwald@bct-electronic.com>

  PulseAudio is free software; you can redistribute it and/or modify
  it under the terms of the GNU Lesser General Public License as published
  by the Free Software Foundation; either version 2.1 of the License,
  or (at your option) any later version.

  PulseAudio is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.
***/
16 
17 #ifdef HAVE_CONFIG_H
18 #include <config.h>
19 #endif
20 
21 #include <pulse/sample.h>
22 #include <pulse/xmalloc.h>
23 #include <pulsecore/log.h>
24 #include <pulsecore/macro.h>
25 
26 #include "cpu-arm.h"
27 #include "remap.h"
28 
29 #include <arm_neon.h>
30 
/* Mono -> stereo for 32 bit float samples: every input sample is written
 * twice to the output (left and right).
 *
 * m   - unused here
 * dst - receives 2 * n samples
 * src - supplies n samples
 * n   - number of frames to convert
 *
 * init_remap_neon() picks this variant when PA_CPU_ARM_CORTEX_A8 is set. */
static void remap_mono_to_stereo_float32ne_neon_a8(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    /* Vector path, four frames per iteration: load four samples into q0,
     * copy them to q1 and let vst2 interleave q0/q1 on store, which yields
     * s0 s0 s1 s1 s2 s2 s3 s3. Both pointers are post-incremented by the
     * instructions themselves. */
    for (; n >= 4; n -= 4) {
        __asm__ __volatile__ (
            "vld1.32    {q0}, [%[src]]!         \n\t"
            "vmov       q1, q0                  \n\t"
            "vst2.32    {q0,q1}, [%[dst]]!      \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : /* input operands */
            : "memory", "q0", "q1" /* clobber list */
        );
    }

    /* Scalar tail for the remaining 0..3 frames. */
    for (; n > 0; n--) {
        dst[0] = dst[1] = src[0];
        src++;
        dst += 2;
    }
}
49 
/* Mono -> stereo for 32 bit float samples using only ARM core registers
 * (ldm/stm), no NEON instructions. The samples are moved as opaque 32 bit
 * words, so no floating point arithmetic is involved.
 *
 * m   - unused here
 * dst - receives 2 * n samples
 * src - supplies n samples
 * n   - number of frames to convert
 *
 * init_remap_neon() picks this variant when PA_CPU_ARM_CORTEX_A8 is not set. */
static void remap_mono_to_stereo_float32ne_generic_arm(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    /* Two frames per iteration: load s0 -> r4 and s1 -> r6, duplicate them
     * into r5 and r12, then store r4, r5, r6, r12 = s0 s0 s1 s1. */
    for (; n >= 2; n -= 2) {
        __asm__ __volatile__ (
            "ldm        %[src]!, {r4,r6}        \n\t"
            "mov        r5, r4                  \n\t"

            /* We use r12 instead of r7 here, because r7 is reserved for the
             * frame pointer when using Thumb. */
            "mov        r12, r6                 \n\t"

            "stm        %[dst]!, {r4-r6,r12}    \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : /* input operands */
            : "memory", "r4", "r5", "r6", "r12" /* clobber list */
        );
    }

    /* At most one frame can be left over. */
    if (n > 0)
        dst[0] = dst[1] = src[0];
}
70 
/* Mono -> stereo for signed 16 bit samples: every input sample is written
 * twice to the output.
 *
 * m   - unused here
 * dst - receives 2 * n samples
 * src - supplies n samples
 * n   - number of frames to convert */
static void remap_mono_to_stereo_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    /* Vector path, eight frames per iteration: q0 holds eight samples, q1 is
     * a copy, and vst2.16 interleaves the two registers on store. */
    for (; n >= 8; n -= 8) {
        __asm__ __volatile__ (
            "vld1.16    {q0}, [%[src]]!         \n\t"
            "vmov       q1, q0                  \n\t"
            "vst2.16    {q0,q1}, [%[dst]]!      \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : /* input operands */
            : "memory", "q0", "q1" /* clobber list */
        );
    }

    /* Scalar tail for the remaining 0..7 frames. */
    for (; n > 0; n--) {
        dst[0] = dst[1] = src[0];
        src++;
        dst += 2;
    }
}
89 
/* Mono -> 4 channels for 32 bit float samples: every input sample is
 * replicated into all four output channels.
 *
 * m   - unused here
 * dst - receives 4 * n samples
 * src - supplies n samples
 * n   - number of frames to convert */
static void remap_mono_to_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    /* Two frames per iteration: d0 holds two samples, vdup broadcasts lane 0
     * into q1 and lane 1 into q2, and both quads are stored back to back. */
    for (; n >= 2; n -= 2) {
        __asm__ __volatile__ (
            "vld1.32    {d0}, [%[src]]!         \n\t"
            "vdup.f32   q1, d0[0]               \n\t"
            "vdup.f32   q2, d0[1]               \n\t"
            "vst1.32    {q1,q2}, [%[dst]]!      \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : /* input operands */
            : "memory", "q0", "q1", "q2" /* clobber list */
        );
    }

    /* At most one frame can be left over. Test n without modifying it, as
     * the sibling functions do, instead of post-decrementing an unsigned
     * counter that may already be zero. */
    if (n > 0)
        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
}
106 
/* Mono -> 4 channels for signed 16 bit samples: every input sample is
 * replicated into all four output channels.
 *
 * m   - unused here
 * dst - receives 4 * n samples
 * src - supplies n samples
 * n   - number of frames to convert */
static void remap_mono_to_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    /* Four frames per iteration: d0 holds four samples; lanes 1..3 are
     * broadcast into d1..d3 first and lane 0 into d0 last, because that
     * final vdup overwrites the source register. */
    for (; n >= 4; n -= 4) {
        __asm__ __volatile__ (
            "vld1.16    {d0}, [%[src]]!         \n\t"
            "vdup.s16   d1, d0[1]               \n\t"
            "vdup.s16   d2, d0[2]               \n\t"
            "vdup.s16   d3, d0[3]               \n\t"
            "vdup.s16   d0, d0[0]               \n\t"
            "vst1.16    {d0,d1,d2,d3}, [%[dst]]!\n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : /* input operands */
            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
        );
    }

    /* Scalar tail for the remaining 0..3 frames. */
    for (; n > 0; n--) {
        dst[0] = dst[1] = dst[2] = dst[3] = src[0];
        src++;
        dst += 4;
    }
}
128 
/* Stereo -> mono for 32 bit float samples: each output sample is the
 * average of the two input channels, (left + right) * 0.5.
 *
 * m   - unused here
 * dst - receives n samples
 * src - supplies 2 * n samples
 * n   - number of frames to convert */
static void remap_stereo_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    /* 0.5 in every lane; handed to the asm block as a NEON register operand. */
    const float32x4_t halve = vdupq_n_f32(0.5f);

    /* Four frames per iteration: vld2 de-interleaves the two channels into
     * q0 and q1, which are then summed and scaled. */
    for (; n >= 4; n -= 4) {
        __asm__ __volatile__ (
            "vld2.32    {q0,q1}, [%[src]]!      \n\t"
            "vadd.f32   q0, q0, q1              \n\t"
            "vmul.f32   q0, q0, %q[halve]       \n\t"
            "vst1.32    {q0}, [%[dst]]!         \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : [halve] "w" (halve) /* input operands */
            : "memory", "q0", "q1" /* clobber list */
        );
    }

    /* Scalar tail for the remaining 0..3 frames. */
    for (; n > 0; n--) {
        dst[0] = (src[0] + src[1])*0.5f;
        src += 2;
        dst++;
    }
}
149 
/* Stereo -> mono for signed 32 bit samples: each output sample is the
 * average of the two input channels.
 *
 * m   - unused here
 * dst - receives n samples
 * src - supplies 2 * n samples
 * n   - number of frames to convert */
static void remap_stereo_to_mono_s32ne_neon(pa_remap_t *m, int32_t *dst, const int32_t *src, unsigned n) {
    /* Four frames per iteration: vld2 de-interleaves the two channels into
     * q0 and q1; vrhadd computes the rounding halving add (a + b + 1) >> 1
     * without overflowing the 32 bit lanes. */
    for (; n >= 4; n -= 4) {
        __asm__ __volatile__ (
            "vld2.32    {q0,q1}, [%[src]]!      \n\t"
            "vrhadd.s32 q0, q0, q1              \n\t"
            "vst1.32    {q0}, [%[dst]]!         \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : /* input operands */
            : "memory", "q0", "q1" /* clobber list */
        );
    }

    /* Scalar tail for the remaining 0..3 frames. Each operand is halved
     * before the addition so the sum cannot overflow.
     * NOTE(review): this truncates each half toward zero while the vector
     * path rounds, so tail frames can differ from vector frames by one LSB.
     * Left as is; confirm whether that is acceptable. */
    for (; n > 0; n--) {
        dst[0] = src[0]/2 + src[1]/2;
        src += 2;
        dst++;
    }
}
168 
/* Stereo -> mono for signed 16 bit samples: each output sample is the
 * average of the two input channels.
 *
 * m   - unused here
 * dst - receives n samples
 * src - supplies 2 * n samples
 * n   - number of frames to convert */
static void remap_stereo_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    /* Eight frames per iteration: vld2 de-interleaves the two channels into
     * q0 and q1; vrhadd computes the rounding halving add (a + b + 1) >> 1. */
    for (; n >= 8; n -= 8) {
        __asm__ __volatile__ (
            "vld2.16    {q0,q1}, [%[src]]!      \n\t"
            "vrhadd.s16 q0, q0, q1              \n\t"
            "vst1.16    {q0}, [%[dst]]!         \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : /* input operands */
            : "memory", "q0", "q1" /* clobber list */
        );
    }

    /* Scalar tail for the remaining 0..7 frames. The operands are promoted
     * to int, so the sum cannot overflow.
     * NOTE(review): the division truncates toward zero while the vector
     * path rounds, so tail frames can differ from vector frames by one LSB. */
    for (; n > 0; n--) {
        dst[0] = (src[0] + src[1])/2;
        src += 2;
        dst++;
    }
}
187 
/* 4 channels -> mono for 32 bit float samples: each output sample is the
 * average of the four input channels, (c0 + c1 + c2 + c3) * 0.25.
 *
 * m   - unused here
 * dst - receives n samples
 * src - supplies 4 * n samples
 * n   - number of frames to convert */
static void remap_ch4_to_mono_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    /* 0.25 in both lanes; handed to the asm block as a NEON register operand. */
    const float32x2_t quart = vdup_n_f32(0.25f);

    /* Two frames per iteration: vld4 de-interleaves the four channels into
     * d0..d3 (two frames per register); the channels are summed pairwise
     * and the result is scaled. */
    for (; n >= 2; n -= 2) {
        __asm__ __volatile__ (
            "vld4.32    {d0,d1,d2,d3}, [%[src]]!\n\t"
            "vadd.f32   d0, d0, d1              \n\t"
            "vadd.f32   d2, d2, d3              \n\t"
            "vadd.f32   d0, d0, d2              \n\t"
            "vmul.f32   d0, d0, %P[quart]       \n\t"
            "vst1.32    {d0}, [%[dst]]!         \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : [quart] "w" (quart) /* input operands */
            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
        );
    }

    /* At most one frame can be left over. */
    if (n > 0)
        dst[0] = (src[0] + src[1] + src[2] + src[3])*0.25f;
}
207 
/* 4 channels -> mono for signed 16 bit samples: each output sample is the
 * average of the four input channels.
 *
 * m   - unused here
 * dst - receives n samples
 * src - supplies 4 * n samples
 * n   - number of frames to convert */
static void remap_ch4_to_mono_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    /* Four frames per iteration: vld4 de-interleaves the four channels into
     * d0..d3; two levels of rounding halving adds (vrhadd) produce the
     * average without leaving the 16 bit lanes. */
    for (; n >= 4; n -= 4) {
        __asm__ __volatile__ (
            "vld4.16    {d0,d1,d2,d3}, [%[src]]!\n\t"
            "vrhadd.s16 d0, d0, d1              \n\t"
            "vrhadd.s16 d2, d2, d3              \n\t"
            "vrhadd.s16 d0, d0, d2              \n\t"
            "vst1.16    {d0}, [%[dst]]!         \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : /* input operands */
            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
        );
    }

    /* Scalar tail for the remaining 0..3 frames. The operands are promoted
     * to int, so the sum cannot overflow.
     * NOTE(review): the division truncates toward zero while the vector
     * path rounds at each halving step, so results may differ slightly. */
    for (; n > 0; n--) {
        dst[0] = (src[0] + src[1] + src[2] + src[3])/4;
        src += 4;
        dst++;
    }
}
228 
/* General 4 channel -> 4 channel matrix remap for signed 16 bit samples.
 *
 * m   - m->state must point to four int32x4_t coefficient vectors as set
 *       up by init_remap_neon(): vector i holds, per output channel lane,
 *       the fixed-point weight of input channel i (clamped to 0..0x10000)
 * dst - receives 4 * n samples
 * src - supplies 4 * n samples
 * n   - number of frames to convert */
static void remap_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    int32x4_t *f = m->state;
    const int32x4_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3];

    /* One frame per iteration:
     *  - load four s16 samples and widen them to s32 in q0 (d0/d1),
     *  - broadcast each input sample in turn and multiply-accumulate it
     *    with its coefficient vector into q1,
     *  - shift right by 16 with saturation to narrow back to s16 (d2),
     *  - store the 8 result bytes. */
    for (; n > 0; n--) {
        __asm__ __volatile__ (
            "vld1.16    {d0}, [%[src]]!         \n\t"
            "vmovl.s16  q0, d0                  \n\t"
            "vdup.s32   q1, d0[0]               \n\t"
            "vmul.s32   q1, q1, %q[f0]          \n\t"
            "vdup.s32   q2, d0[1]               \n\t"
            "vmla.s32   q1, q2, %q[f1]          \n\t"
            "vdup.s32   q2, d1[0]               \n\t"
            "vmla.s32   q1, q2, %q[f2]          \n\t"
            "vdup.s32   q2, d1[1]               \n\t"
            "vmla.s32   q1, q2, %q[f3]          \n\t"
            "vqshrn.s32  d2, q1, #16            \n\t"
            "vst1.32    {d2}, [%[dst]]!         \n\t"
            : [dst] "+r" (dst), [src] "+r" (src)
            : [f0] "w" (f0), [f1] "w" (f1), [f2] "w" (f2), [f3] "w" (f3)
            : "memory", "q0", "q1", "q2"
        );
    }
}
253 
/* General 4 channel -> 4 channel matrix remap for 32 bit float samples.
 *
 * m   - m->state must point to four float32x4_t coefficient vectors as set
 *       up by init_remap_neon(): vector i holds, per output channel lane,
 *       the weight of input channel i (clamped to 0.0..1.0)
 * dst - receives 4 * n samples
 * src - supplies 4 * n samples
 * n   - number of frames to convert */
static void remap_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    float32x4_t *f = m->state;
    const float32x4_t f0 = f[0], f1 = f[1], f2 = f[2], f3 = f[3];

    /* One frame per iteration: load four samples into q0 (d0/d1), broadcast
     * each input sample in turn and multiply-accumulate it with its
     * coefficient vector into q1 (d2/d3), then store the four results. */
    for (; n > 0; n--) {
        __asm__ __volatile__ (
            "vld1.32    {d0,d1}, [%[src]]!      \n\t"
            "vdup.f32   q1, d0[0]               \n\t"
            "vmul.f32   q1, q1, %q[f0]          \n\t"
            "vdup.f32   q2, d0[1]               \n\t"
            "vmla.f32   q1, q2, %q[f1]          \n\t"
            "vdup.f32   q2, d1[0]               \n\t"
            "vmla.f32   q1, q2, %q[f2]          \n\t"
            "vdup.f32   q2, d1[1]               \n\t"
            "vmla.f32   q1, q2, %q[f3]          \n\t"
            "vst1.32    {d2,d3}, [%[dst]]!      \n\t"
            : [dst] "+r" (dst), [src] "+r" (src)
            : [f0] "w" (f0), [f1] "w" (f1), [f2] "w" (f2), [f3] "w" (f3)
            : "memory", "q0", "q1", "q2"
        );
    }
}
276 
/* Stereo -> stereo channel re-arrangement (swap/copy/mute) for signed
 * 16 bit samples, implemented as a byte permutation with vtbl.
 *
 * m   - m->state must point to the 8 byte vtbl index table built by
 *       init_remap_neon() for S16NE; it covers two consecutive frames
 * dst - receives 2 * n samples
 * src - supplies 2 * n samples
 * n   - number of frames to convert */
static void remap_arrange_stereo_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    const uint8x8_t t = ((uint8x8_t *) m->state)[0];

    /* Two frames (8 bytes) per iteration: load, permute bytes via the
     * table, store. */
    for (; n >= 2; n -= 2) {
        __asm__ __volatile__ (
            "vld1.s16   d0, [%[src]]!           \n\t"
            "vtbl.8     d0, {d0}, %P[t]         \n\t"
            "vst1.s16   d0, [%[dst]]!           \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : [t] "w" (t) /* input operands */
            : "memory", "d0" /* clobber list */
        );
    }

    /* At most one frame is left: load only 32 bits (one stereo frame) into
     * lane 0, permute, and store only lane 0 so that nothing is read or
     * written beyond the buffers. */
    if (n > 0) {
        __asm__ __volatile__ (
            "vld1.32   d0[0], [%[src]]!         \n\t"
            "vtbl.8    d0, {d0}, %P[t]          \n\t"
            "vst1.32   d0[0], [%[dst]]!         \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : [t] "w" (t) /* input operands */
            : "memory", "d0" /* clobber list */
        );
    }
}
302 
/* 2 channel -> 4 channel re-arrangement for signed 16 bit samples,
 * implemented as a byte permutation with vtbl.
 *
 * m   - m->state must point to the 8 byte vtbl index table built by
 *       init_remap_neon() for S16NE
 * dst - receives 4 * n samples
 * src - supplies 2 * n samples
 * n   - number of frames to convert */
static void remap_arrange_ch2_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    const uint8x8_t t = ((uint8x8_t *) m->state)[0];

    /* One frame per iteration: load 32 bits (two s16 samples) into lane 0
     * of d0, expand them to four samples via the table, store 8 bytes. */
    for (; n > 0; n--) {
        __asm__ __volatile__ (
            "vld1.32    d0[0], [%[src]]!           \n\t"
            "vtbl.8     d0, {d0}, %P[t]            \n\t"
            "vst1.s16   d0, [%[dst]]!              \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : [t] "w" (t) /* input operands */
            : "memory", "d0" /* clobber list */
        );
    }
}
317 
/* 4 channel -> 4 channel re-arrangement for signed 16 bit samples,
 * implemented as a byte permutation with vtbl.
 *
 * m   - m->state must point to the 8 byte vtbl index table built by
 *       init_remap_neon() for S16NE
 * dst - receives 4 * n samples
 * src - supplies 4 * n samples
 * n   - number of frames to convert */
static void remap_arrange_ch4_s16ne_neon(pa_remap_t *m, int16_t *dst, const int16_t *src, unsigned n) {
    const uint8x8_t t = ((uint8x8_t *) m->state)[0];

    /* One frame (four s16 samples, 8 bytes) per iteration: load, permute
     * bytes via the table, store. */
    for (; n > 0; n--) {
        __asm__ __volatile__ (
            "vld1.s16   d0, [%[src]]!           \n\t"
            "vtbl.8     d0, {d0}, %P[t]         \n\t"
            "vst1.s16   d0, [%[dst]]!           \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : [t] "w" (t) /* input operands */
            : "memory", "d0" /* clobber list */
        );
    }
}
332 
/* Stereo -> stereo channel re-arrangement for 32 bit float samples,
 * implemented as a byte permutation with vtbl (no arithmetic is done on
 * the sample values).
 *
 * m   - m->state must point to the vtbl index tables built by
 *       init_remap_neon() for 32 bit formats; only the first is used
 * dst - receives 2 * n samples
 * src - supplies 2 * n samples
 * n   - number of frames to convert */
static void remap_arrange_stereo_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    const uint8x8_t t = ((uint8x8_t *)m->state)[0];

    /* One frame (two floats, 8 bytes) per iteration: load, permute bytes
     * via the table, store. */
    for (; n > 0; n--) {
        __asm__ __volatile__ (
            "vld1.f32   d0, [%[src]]!           \n\t"
            "vtbl.8     d0, {d0}, %P[t]         \n\t"
            "vst1.s16   {d0}, [%[dst]]!         \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : [t] "w" (t) /* input operands */
            : "memory", "d0" /* clobber list */
        );
    }
}
347 
348 /* Works for both S32NE and FLOAT32NE */
remap_arrange_ch2_ch4_any32ne_neon(pa_remap_t * m,float * dst,const float * src,unsigned n)349 static void remap_arrange_ch2_ch4_any32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
350     const uint8x8_t t0 = ((uint8x8_t *)m->state)[0];
351     const uint8x8_t t1 = ((uint8x8_t *)m->state)[1];
352 
353     for (; n > 0; n--) {
354         __asm__ __volatile__ (
355             "vld1.f32   d0, [%[src]]!           \n\t"
356             "vtbl.8     d1, {d0}, %P[t0]        \n\t"
357             "vtbl.8     d2, {d0}, %P[t1]        \n\t"
358             "vst1.s16   {d1,d2}, [%[dst]]!      \n\t"
359             : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
360             : [t0] "w" (t0), [t1] "w" (t1) /* input operands */
361             : "memory", "d0", "d1", "d2" /* clobber list */
362         );
363     }
364 }
365 
/* 4 channel -> 4 channel re-arrangement for 32 bit float samples,
 * implemented as a byte permutation with vtbl.
 *
 * m   - m->state must point to the two 8 byte vtbl index tables built by
 *       init_remap_neon() for 32 bit formats: table 0 selects output
 *       channels 0/1, table 1 selects output channels 2/3
 * dst - receives 4 * n samples
 * src - supplies 4 * n samples
 * n   - number of frames to convert */
static void remap_arrange_ch4_float32ne_neon(pa_remap_t *m, float *dst, const float *src, unsigned n) {
    const uint8x8_t t0 = ((uint8x8_t *)m->state)[0];
    const uint8x8_t t1 = ((uint8x8_t *)m->state)[1];

    /* One frame per iteration: load four samples (16 bytes) into d0/d1,
     * look up each output half from the 16 byte source table {d0,d1},
     * store 16 bytes. */
    for (; n > 0; n--) {
        __asm__ __volatile__ (
            "vld1.f32   {d0,d1}, [%[src]]!      \n\t"
            "vtbl.8     d2, {d0,d1}, %P[t0]     \n\t"
            "vtbl.8     d3, {d0,d1}, %P[t1]     \n\t"
            "vst1.s16   {d2,d3}, [%[dst]]!      \n\t"
            : [dst] "+r" (dst), [src] "+r" (src) /* output operands */
            : [t0] "w" (t0), [t1] "w" (t1) /* input operands */
            : "memory", "d0", "d1", "d2", "d3" /* clobber list */
        );
    }
}
382 
383 static pa_cpu_arm_flag_t arm_flags;
384 
/* Inspect the remapper m and, when one of the specialised cases below
 * matches, install the corresponding NEON/ARM remap functions via
 * pa_set_remap_func() and allocate any per-remapper state in m->state.
 *
 * The three function arguments of pa_set_remap_func() appear to be the
 * S16NE, S32NE and FLOAT32NE handlers, in that order (judging by the
 * functions passed here); NULL is passed where no handler exists.
 *
 * Map table values are fixed-point gains where 0x10000 corresponds to 1.0
 * (compare the clamp ranges used for the S16NE and FLOAT32NE state below),
 * so 0x8000 is 0.5 and 0x4000 is 0.25.
 *
 * If no case matches, or the format is S32NE and no S32NE handler is
 * offered for the matching case, the function returns without touching m. */
static void init_remap_neon(pa_remap_t *m) {
    unsigned n_oc, n_ic;
    int8_t arrange[PA_CHANNELS_MAX];

    n_oc = m->o_ss.channels;
    n_ic = m->i_ss.channels;

    /* We short-circuit remap function selection for S32NE in most
     * cases as the corresponding generic C code is performing
     * similarly or even better. However there are a few cases where
     * there actually is a significant improvement from using
     * hand-crafted NEON assembly so we cannot just bail out for S32NE
     * here. */
    if (n_ic == 1 && n_oc == 2 &&
            m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000) {
        /* Mono duplicated at full gain into both stereo channels. */
        if (m->format == PA_SAMPLE_S32NE)
            return;
        if (arm_flags & PA_CPU_ARM_CORTEX_A8) {

            pa_log_info("Using ARM NEON/A8 mono to stereo remapping");
            pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
                NULL, (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_neon_a8);
        }
        else {
            pa_log_info("Using ARM NEON mono to stereo remapping");
            pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_stereo_s16ne_neon,
                NULL, (pa_do_remap_func_t) remap_mono_to_stereo_float32ne_generic_arm);
        }
    } else if (n_ic == 1 && n_oc == 4 &&
            m->map_table_i[0][0] == 0x10000 && m->map_table_i[1][0] == 0x10000 &&
            m->map_table_i[2][0] == 0x10000 && m->map_table_i[3][0] == 0x10000) {

        /* Mono duplicated at full gain into all four channels. */
        if (m->format == PA_SAMPLE_S32NE)
            return;
        pa_log_info("Using ARM NEON mono to 4-channel remapping");
        pa_set_remap_func(m, (pa_do_remap_func_t) remap_mono_to_ch4_s16ne_neon,
            NULL, (pa_do_remap_func_t) remap_mono_to_ch4_float32ne_neon);
    } else if (n_ic == 2 && n_oc == 1 &&
            m->map_table_i[0][0] == 0x8000 && m->map_table_i[0][1] == 0x8000) {

        /* Stereo averaged into mono; this case also has an S32NE handler. */
        pa_log_info("Using ARM NEON stereo to mono remapping");
        pa_set_remap_func(m, (pa_do_remap_func_t) remap_stereo_to_mono_s16ne_neon,
            (pa_do_remap_func_t) remap_stereo_to_mono_s32ne_neon,
            (pa_do_remap_func_t) remap_stereo_to_mono_float32ne_neon);
    } else if (n_ic == 4 && n_oc == 1 &&
            m->map_table_i[0][0] == 0x4000 && m->map_table_i[0][1] == 0x4000 &&
            m->map_table_i[0][2] == 0x4000 && m->map_table_i[0][3] == 0x4000) {

        /* Four channels averaged into mono. */
        if (m->format == PA_SAMPLE_S32NE)
            return;
        pa_log_info("Using ARM NEON 4-channel to mono remapping");
        pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_to_mono_s16ne_neon,
            NULL, (pa_do_remap_func_t) remap_ch4_to_mono_float32ne_neon);
    } else if (pa_setup_remap_arrange(m, arrange) &&
        ((n_ic == 2 && n_oc == 2) ||
         (n_ic == 2 && n_oc == 4) ||
         (n_ic == 4 && n_oc == 4))) {
        /* Pure re-arrangement. pa_setup_remap_arrange() presumably fills
         * arrange[o] with the input channel feeding output channel o, or a
         * negative value for a silent output (that is how the values are
         * used below) - confirm against its definition. */
        unsigned o;

        if (n_ic == 2 && n_oc == 2) {
            if (m->format == PA_SAMPLE_S32NE)
                return;
            pa_log_info("Using NEON stereo arrange remapping");
            pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_stereo_s16ne_neon,
                NULL, (pa_do_remap_func_t) remap_arrange_stereo_float32ne_neon);
        } else if (n_ic == 2 && n_oc == 4) {
            /* The 32 bit function only moves bytes, so it serves both
             * S32NE and FLOAT32NE. */
            pa_log_info("Using NEON 2-channel to 4-channel arrange remapping");
            pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch2_ch4_s16ne_neon,
                (pa_do_remap_func_t) remap_arrange_ch2_ch4_any32ne_neon,
                (pa_do_remap_func_t) remap_arrange_ch2_ch4_any32ne_neon);
        } else if (n_ic == 4 && n_oc == 4) {
            if (m->format == PA_SAMPLE_S32NE)
                return;
            pa_log_info("Using NEON 4-channel arrange remapping");
            pa_set_remap_func(m, (pa_do_remap_func_t) remap_arrange_ch4_s16ne_neon,
                NULL, (pa_do_remap_func_t) remap_arrange_ch4_float32ne_neon);
        }

        /* setup state: byte index tables for the vtbl instruction */
        switch (m->format) {
        case PA_SAMPLE_S16NE: {
            /* One 8 byte table holding four 16 bit output slots. With two
             * output channels the table spans two frames, hence the
             * frame/offset arithmetic. */
            uint8x8_t *t = m->state = pa_xnew0(uint8x8_t, 1);
            for (o = 0; o < 4; o++) {
                if (arrange[o % n_oc] >= 0) {
                    /* convert channel index to vtbl indices */
                    unsigned frame = o / n_oc;
                    ((uint8_t *) t)[o * 2 + 0] = (frame * n_oc + arrange[o % n_oc]) * 2 + 0;
                    ((uint8_t *) t)[o * 2 + 1] = (frame * n_oc + arrange[o % n_oc]) * 2 + 1;
                } else {
                    /* use invalid table indices to map to 0 */
                    ((uint8_t *) t)[o * 2 + 0] = 0xff;
                    ((uint8_t *) t)[o * 2 + 1] = 0xff;
                }
            }
            break;
        }
        case PA_SAMPLE_S32NE:
                /* fall-through */
        case PA_SAMPLE_FLOAT32NE: {
            /* Two 8 byte tables (16 bytes): four bytes per output channel,
             * one frame per table set. */
            uint8x8_t *t = m->state = pa_xnew0(uint8x8_t, 2);
            for (o = 0; o < n_oc; o++) {
                if (arrange[o] >= 0) {
                    /* convert channel index to vtbl indices */
                    ((uint8_t *) t)[o * 4 + 0] = arrange[o] * 4 + 0;
                    ((uint8_t *) t)[o * 4 + 1] = arrange[o] * 4 + 1;
                    ((uint8_t *) t)[o * 4 + 2] = arrange[o] * 4 + 2;
                    ((uint8_t *) t)[o * 4 + 3] = arrange[o] * 4 + 3;
                } else {
                    /* use invalid table indices to map to 0 */
                    ((uint8_t *) t)[o * 4 + 0] = 0xff;
                    ((uint8_t *) t)[o * 4 + 1] = 0xff;
                    ((uint8_t *) t)[o * 4 + 2] = 0xff;
                    ((uint8_t *) t)[o * 4 + 3] = 0xff;
                }
            }
            break;
        }
        default:
            pa_assert_not_reached();
        }
    } else if (n_ic == 4 && n_oc == 4) {
        /* General 4x4 mixing matrix. */
        unsigned i, o;

        if (m->format == PA_SAMPLE_S32NE)
            return;
        pa_log_info("Using ARM NEON 4-channel remapping");
        pa_set_remap_func(m, (pa_do_remap_func_t) remap_ch4_s16ne_neon,
            (pa_do_remap_func_t) NULL,
            (pa_do_remap_func_t) remap_ch4_float32ne_neon);

        /* setup state: the matrix is stored transposed, i.e. vector i
         * holds the weights of input channel i, one lane per output
         * channel, which is the layout the remap functions multiply by. */
        switch (m->format) {
        case PA_SAMPLE_S16NE: {
            int32x4_t *f = m->state = pa_xnew0(int32x4_t, 4);
            for (o = 0; o < 4; o++) {
                for (i = 0; i < 4; i++) {
                    ((int *) &f[i])[o] = PA_CLAMP_UNLIKELY(m->map_table_i[o][i], 0, 0x10000);
                }
            }
            break;
        }
        case PA_SAMPLE_FLOAT32NE: {
            float32x4_t *f = m->state = pa_xnew0(float32x4_t, 4);
            for (o = 0; o < 4; o++) {
                for (i = 0; i < 4; i++) {
                    ((float *) &f[i])[o] = PA_CLAMP_UNLIKELY(m->map_table_f[o][i], 0.0f, 1.0f);
                }
            }
            break;
        }
        default:
            pa_assert_not_reached();
        }
    }
}
540 
/* Register the NEON remapper initialiser.
 *
 * flags - CPU feature flags; stored in arm_flags so that init_remap_neon()
 *         can later test PA_CPU_ARM_CORTEX_A8.
 *
 * After this call init_remap_neon() is installed through
 * pa_set_init_remap_func(). */
void pa_remap_func_init_neon(pa_cpu_arm_flag_t flags) {
    pa_log_info("Initialising ARM NEON optimized remappers.");
    arm_flags = flags;
    pa_set_init_remap_func((pa_init_remap_func_t) init_remap_neon);
}
546