1 /* Copyright 2013 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6 #include "dsp_util.h"
7
/* Type-generic maximum of two values.
 * Built on the GNU statement-expression and __typeof__ extensions so that
 * each argument is evaluated exactly once.
 */
#ifndef max
#define max(a, b)					\
	({						\
		__typeof__(a) _max_a = (a);		\
		__typeof__(b) _max_b = (b);		\
		_max_a > _max_b ? _max_a : _max_b;	\
	})
#endif

/* Type-generic minimum of two values; same single-evaluation guarantee
 * as max().
 */
#ifndef min
#define min(a, b)					\
	({						\
		__typeof__(a) _min_a = (a);		\
		__typeof__(b) _min_b = (b);		\
		_min_a < _min_b ? _min_a : _min_b;	\
	})
#endif
19
20 #undef deinterleave_stereo
21 #undef interleave_stereo
22
23 /* Converts shorts in range of -32768 to 32767 to floats in range of
24 * -1.0f to 1.0f.
25 * scvtf instruction accepts fixed point ints, so sxtl is used to lengthen
26 * shorts to int with sign extension.
27 */
28 #ifdef __aarch64__
deinterleave_stereo(int16_t * input,float * output1,float * output2,int frames)29 static void deinterleave_stereo(int16_t *input, float *output1,
30 float *output2, int frames)
31 {
32 int chunk = frames >> 3;
33 frames &= 7;
34 /* Process 8 frames (16 samples) each loop. */
35 /* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
36 if (chunk) {
37 __asm__ __volatile__ (
38 "1: \n"
39 "ld2 {v2.8h, v3.8h}, [%[input]], #32 \n"
40 "subs %w[chunk], %w[chunk], #1 \n"
41 "sxtl v0.4s, v2.4h \n"
42 "sxtl2 v1.4s, v2.8h \n"
43 "sxtl v2.4s, v3.4h \n"
44 "sxtl2 v3.4s, v3.8h \n"
45 "scvtf v0.4s, v0.4s, #15 \n"
46 "scvtf v1.4s, v1.4s, #15 \n"
47 "scvtf v2.4s, v2.4s, #15 \n"
48 "scvtf v3.4s, v3.4s, #15 \n"
49 "st1 {v0.4s, v1.4s}, [%[output1]], #32 \n"
50 "st1 {v2.4s, v3.4s}, [%[output2]], #32 \n"
51 "b.ne 1b \n"
52 : /* output */
53 [chunk]"+r"(chunk),
54 [input]"+r"(input),
55 [output1]"+r"(output1),
56 [output2]"+r"(output2)
57 : /* input */
58 : /* clobber */
59 "v0", "v1", "v2", "v3", "memory", "cc"
60 );
61 }
62
63 /* The remaining samples. */
64 while (frames--) {
65 *output1++ = *input++ / 32768.0f;
66 *output2++ = *input++ / 32768.0f;
67 }
68 }
69 #define deinterleave_stereo deinterleave_stereo
70
71 /* Converts floats in range of -1.0f to 1.0f to shorts in range of
72 * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away
73 * from zero.
74 * Rounding is achieved by using fcvtas instruction. (a = away)
75 * The float scaled to a range of -32768 to 32767 by adding 15 to the exponent.
76 * Add to exponent is equivalent to multiply for exponent range of 0 to 239,
77 * which is 2.59 * 10^33. A signed saturating add (sqadd) limits exponents
78 * from 240 to 255 to clamp to 255.
79 * For very large values, beyond +/- 2 billion, fcvtas will clamp the result
80 * to the min or max value that fits an int.
81 * For other values, sqxtn clamps the output to -32768 to 32767 range.
82 */
interleave_stereo(float * input1,float * input2,int16_t * output,int frames)83 static void interleave_stereo(float *input1, float *input2,
84 int16_t *output, int frames)
85 {
86 /* Process 4 frames (8 samples) each loop. */
87 /* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
88 int chunk = frames >> 2;
89 frames &= 3;
90
91 if (chunk) {
92 __asm__ __volatile__ (
93 "dup v2.4s, %w[scale] \n"
94 "1: \n"
95 "ld1 {v0.4s}, [%[input1]], #16 \n"
96 "ld1 {v1.4s}, [%[input2]], #16 \n"
97 "subs %w[chunk], %w[chunk], #1 \n"
98 "sqadd v0.4s, v0.4s, v2.4s \n"
99 "sqadd v1.4s, v1.4s, v2.4s \n"
100 "fcvtas v0.4s, v0.4s \n"
101 "fcvtas v1.4s, v1.4s \n"
102 "sqxtn v0.4h, v0.4s \n"
103 "sqxtn v1.4h, v1.4s \n"
104 "st2 {v0.4h, v1.4h}, [%[output]], #16 \n"
105 "b.ne 1b \n"
106 : /* output */
107 [chunk]"+r"(chunk),
108 [input1]"+r"(input1),
109 [input2]"+r"(input2),
110 [output]"+r"(output)
111 : /* input */
112 [scale]"r"(15 << 23)
113 : /* clobber */
114 "v0", "v1", "v2", "memory", "cc"
115 );
116 }
117
118 /* The remaining samples */
119 while (frames--) {
120 float f;
121 f = *input1++ * 32768.0f;
122 f += (f >= 0) ? 0.5f : -0.5f;
123 *output++ = max(-32768, min(32767, (int)(f)));
124 f = *input2++ * 32768.0f;
125 f += (f >= 0) ? 0.5f : -0.5f;
126 *output++ = max(-32768, min(32767, (int)(f)));
127 }
128 }
129 #define interleave_stereo interleave_stereo
130 #endif
131
132 #ifdef __ARM_NEON__
133 #include <arm_neon.h>
134
deinterleave_stereo(int16_t * input,float * output1,float * output2,int frames)135 static void deinterleave_stereo(int16_t *input, float *output1,
136 float *output2, int frames)
137 {
138 /* Process 8 frames (16 samples) each loop. */
139 /* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
140 int chunk = frames >> 3;
141 frames &= 7;
142 if (chunk) {
143 __asm__ __volatile__ (
144 "1: \n"
145 "vld2.16 {d0-d3}, [%[input]]! \n"
146 "subs %[chunk], #1 \n"
147 "vmovl.s16 q3, d3 \n"
148 "vmovl.s16 q2, d2 \n"
149 "vmovl.s16 q1, d1 \n"
150 "vmovl.s16 q0, d0 \n"
151 "vcvt.f32.s32 q3, q3, #15 \n"
152 "vcvt.f32.s32 q2, q2, #15 \n"
153 "vcvt.f32.s32 q1, q1, #15 \n"
154 "vcvt.f32.s32 q0, q0, #15 \n"
155 "vst1.32 {d4-d7}, [%[output2]]! \n"
156 "vst1.32 {d0-d3}, [%[output1]]! \n"
157 "bne 1b \n"
158 : /* output */
159 [chunk]"+r"(chunk),
160 [input]"+r"(input),
161 [output1]"+r"(output1),
162 [output2]"+r"(output2)
163 : /* input */
164 : /* clobber */
165 "q0", "q1", "q2", "q3", "memory", "cc"
166 );
167 }
168
169 /* The remaining samples. */
170 while (frames--) {
171 *output1++ = *input++ / 32768.0f;
172 *output2++ = *input++ / 32768.0f;
173 }
174 }
175 #define deinterleave_stereo deinterleave_stereo
176
/* Converts floats in range of -1.0f to 1.0f to shorts in range of
 * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away
 * from zero, and interleaves the two channels (ARM NEON implementation).
 *
 * Rounding is achieved by adding 0.5 or -0.5 adjusted for fixed point
 * precision, and then converting float to fixed point using the vcvt
 * instruction, which truncates toward zero.
 * For very large values, beyond +/- 2 billion, vcvt will clamp the result
 * to the min or max value that fits an int.
 * For other values, vqmovn clamps the output to -32768 to 32767 range.
 *
 * input1 - first channel, frames floats are read.
 * input2 - second channel, frames floats are read.
 * output - receives 2 * frames interleaved samples.
 * frames - number of frames to convert; nothing is done when it is zero or
 *          negative.
 */
static void interleave_stereo(float *input1, float *input2,
			      int16_t *output, int frames)
{
	/* Rounding offsets, pre-divided by 32768 because they are added
	 * before the fixed point conversion applies the scale.
	 */
	float32x4_t pos = vdupq_n_f32(0.5f / 32768.0f);
	float32x4_t neg = vdupq_n_f32(-0.5f / 32768.0f);
	int chunk;

	/* The SIMD loop below only terminates when its counter reaches
	 * exactly zero, so a negative frame count must never get there.
	 */
	if (frames <= 0)
		return;

	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	chunk = frames >> 2;
	frames &= 3;

	if (chunk) {
		__asm__ __volatile__ (
			"veor q0, q0, q0 \n"
			"1: \n"
			"vld1.32 {d2-d3}, [%[input1]]! \n"
			"vld1.32 {d4-d5}, [%[input2]]! \n"
			"subs %[chunk], #1 \n"
			/* We try to round to the nearest number by adding 0.5
			 * to positive input, and adding -0.5 to the negative
			 * input, then truncate.
			 */
			"vcgt.f32 q3, q1, q0 \n"
			"vcgt.f32 q4, q2, q0 \n"
			"vbsl q3, %q[pos], %q[neg] \n"
			"vbsl q4, %q[pos], %q[neg] \n"
			"vadd.f32 q1, q1, q3 \n"
			"vadd.f32 q2, q2, q4 \n"
			"vcvt.s32.f32 q1, q1, #15 \n"
			"vcvt.s32.f32 q2, q2, #15 \n"
			"vqmovn.s32 d2, q1 \n"
			"vqmovn.s32 d3, q2 \n"
			"vst2.16 {d2-d3}, [%[output]]! \n"
			"bne 1b \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input1]"+r"(input1),
			  [input2]"+r"(input2),
			  [output]"+r"(output)
			: /* input */
			  [pos]"w"(pos),
			  [neg]"w"(neg)
			: /* clobber */
			  "q0", "q1", "q2", "q3", "q4", "memory", "cc"
			);
	}

	/* The remaining (at most 3) frames.
	 * The clamp is done on the float before converting, because
	 * converting an out-of-range float to an integer is undefined. The
	 * negated comparison also routes NaN to the lower bound.
	 */
	while (frames--) {
		float f;

		f = *input1++ * 32768.0f;
		f += (f >= 0) ? 0.5f : -0.5f;
		*output++ = (f >= 32767.0f) ? 32767 :
			    !(f > -32768.0f) ? -32768 : (int16_t)f;

		f = *input2++ * 32768.0f;
		f += (f >= 0) ? 0.5f : -0.5f;
		*output++ = (f >= 32767.0f) ? 32767 :
			    !(f > -32768.0f) ? -32768 : (int16_t)f;
	}
}
244 #define interleave_stereo interleave_stereo
245 #endif
246
247 #ifdef __SSE3__
248 #include <emmintrin.h>
249
/* Converts shorts in range of -32768 to 32767 to floats in range of
 * -1.0f to 1.0f, splitting interleaved stereo into two planar channels
 * (SSE3 implementation).
 *
 * pslld and psrad shifts are used to isolate the low and high word of each
 * 32-bit lane, but each ends up in a different range:
 * The low word is shifted to the high bits in range 0x80000000 .. 0x7fff0000.
 * The high word is shifted to the low bits in range 0xffff8000 .. 0x00007fff.
 * cvtdq2ps converts ints to floats as is.
 * mulps is used to normalize the range of the low and high words, adjusting
 * for high and low words being in different range (2^-31 versus 2^-15).
 *
 * input   - interleaved samples L0 R0 L1 R1 ...; 2 * frames values are read.
 * output1 - receives the first sample of every frame, scaled by 1/32768.
 * output2 - receives the second sample of every frame, scaled by 1/32768.
 * frames  - number of stereo frames to convert; nothing is done when it is
 *           zero or negative.
 */
static void deinterleave_stereo(int16_t *input, float *output1,
				float *output2, int frames)
{
	int chunk;

	/* The SIMD loop below only terminates when its counter reaches
	 * exactly zero, so a negative frame count must never get there.
	 */
	if (frames <= 0)
		return;

	/* Process 8 frames (16 samples) each loop. */
	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
	chunk = frames >> 3;
	frames &= 7;
	if (chunk) {
		__asm__ __volatile__ (
			"1: \n"
			"lddqu (%[input]), %%xmm0 \n"
			"lddqu 16(%[input]), %%xmm1 \n"
			"add $32, %[input] \n"
			"movdqa %%xmm0, %%xmm2 \n"
			"movdqa %%xmm1, %%xmm3 \n"
			/* xmm0/xmm1: low words (first channel) << 16 */
			"pslld $16, %%xmm0 \n"
			"pslld $16, %%xmm1 \n"
			/* xmm2/xmm3: high words (second channel),
			 * sign-extended
			 */
			"psrad $16, %%xmm2 \n"
			"psrad $16, %%xmm3 \n"
			"cvtdq2ps %%xmm0, %%xmm0 \n"
			"cvtdq2ps %%xmm1, %%xmm1 \n"
			"cvtdq2ps %%xmm2, %%xmm2 \n"
			"cvtdq2ps %%xmm3, %%xmm3 \n"
			"mulps %[scale_2_n31], %%xmm0 \n"
			"mulps %[scale_2_n31], %%xmm1 \n"
			"mulps %[scale_2_n15], %%xmm2 \n"
			"mulps %[scale_2_n15], %%xmm3 \n"
			"movdqu %%xmm0, (%[output1]) \n"
			"movdqu %%xmm1, 16(%[output1]) \n"
			"movdqu %%xmm2, (%[output2]) \n"
			"movdqu %%xmm3, 16(%[output2]) \n"
			"add $32, %[output1] \n"
			"add $32, %[output2] \n"
			"sub $1, %[chunk] \n"
			"jnz 1b \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input]"+r"(input),
			  [output1]"+r"(output1),
			  [output2]"+r"(output2)
			: /* input */
			  [scale_2_n31]"x"(_mm_set1_ps(1.0f/(1<<15)/(1<<16))),
			  [scale_2_n15]"x"(_mm_set1_ps(1.0f/(1<<15)))
			: /* clobber */
			  "xmm0", "xmm1", "xmm2", "xmm3", "memory", "cc"
			);
	}

	/* The remaining (at most 7) frames. */
	while (frames--) {
		*output1++ = *input++ / 32768.0f;
		*output2++ = *input++ / 32768.0f;
	}
}
314 #define deinterleave_stereo deinterleave_stereo
315
/* Converts floats in range of -1.0f to 1.0f to shorts in range of
 * -32768 to 32767 and interleaves the two channels (SSE3 implementation).
 *
 * The float is scaled by 32768 by adding 15 to its exponent field, done
 * with a saturating word add (paddsw) of (15 << 23) on the raw bit pattern.
 * NOTE(review): for negative inputs the upper word is a negative integer,
 * so paddsw does not saturate; magnitudes of roughly 2^114 and above look
 * like they wrap to a tiny positive value - confirm whether that matters.
 * cvtps2dq rounds according to MXCSR, i.e. to nearest with ties (0.5)
 * rounding to even unless the rounding mode was changed. The scalar tail
 * loop rounds ties away from zero instead.
 * Scaled values above 32767 are clamped with minps before the conversion.
 * Without that clamp cvtps2dq produces 0x80000000 for values beyond +2
 * billion, which packssdw would then turn into -32768, flipping the sign
 * of a clipped positive sample.
 * For very large negative values cvtps2dq still produces 0x80000000 and
 * packssdw clamps it to -32768, which is the wanted result.
 *
 * input1 - first channel, frames floats are read.
 * input2 - second channel, frames floats are read.
 * output - receives 2 * frames interleaved samples.
 * frames - number of frames to convert; nothing is done when it is zero or
 *          negative.
 */
static void interleave_stereo(float *input1, float *input2,
			      int16_t *output, int frames)
{
	int chunk;

	/* The SIMD loop below only terminates when its counter reaches
	 * exactly zero, so a negative frame count must never get there.
	 */
	if (frames <= 0)
		return;

	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	chunk = frames >> 2;
	frames &= 3;

	if (chunk) {
		__asm__ __volatile__ (
			"1: \n"
			"lddqu (%[input1]), %%xmm0 \n"
			"lddqu (%[input2]), %%xmm2 \n"
			"add $16, %[input1] \n"
			"add $16, %[input2] \n"
			"movaps %%xmm0, %%xmm1 \n"
			/* xmm0 = L0 R0 L1 R1, xmm1 = L2 R2 L3 R3 */
			"unpcklps %%xmm2, %%xmm0 \n"
			"unpckhps %%xmm2, %%xmm1 \n"
			"paddsw %[scale_2_15], %%xmm0 \n"
			"paddsw %[scale_2_15], %%xmm1 \n"
			/* Keep large positive values inside int range. */
			"minps %[clamp_large], %%xmm0 \n"
			"minps %[clamp_large], %%xmm1 \n"
			"cvtps2dq %%xmm0, %%xmm0 \n"
			"cvtps2dq %%xmm1, %%xmm1 \n"
			"packssdw %%xmm1, %%xmm0 \n"
			"movdqu %%xmm0, (%[output]) \n"
			"add $16, %[output] \n"
			"sub $1, %[chunk] \n"
			"jnz 1b \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input1]"+r"(input1),
			  [input2]"+r"(input2),
			  [output]"+r"(output)
			: /* input */
			  [scale_2_15]"x"(_mm_set1_epi32(15 << 23)),
			  [clamp_large]"x"(_mm_set1_ps(32767.0f))
			: /* clobber */
			  "xmm0", "xmm1", "xmm2", "memory", "cc"
			);
	}

	/* The remaining (at most 3) frames.
	 * The clamp is done on the float before converting, because
	 * converting an out-of-range float to an integer is undefined. The
	 * negated comparison also routes NaN to the lower bound.
	 */
	while (frames--) {
		float f;

		f = *input1++ * 32768.0f;
		f += (f >= 0) ? 0.5f : -0.5f;
		*output++ = (f >= 32767.0f) ? 32767 :
			    !(f > -32768.0f) ? -32768 : (int16_t)f;

		f = *input2++ * 32768.0f;
		f += (f >= 0) ? 0.5f : -0.5f;
		*output++ = (f >= 32767.0f) ? 32767 :
			    !(f > -32768.0f) ? -32768 : (int16_t)f;
	}
}
373 #define interleave_stereo interleave_stereo
374 #endif
375
/* Converts interleaved S16 samples to planar floats scaled by 1/32768.
 *
 * input    - interleaved samples, frame after frame; channels * frames
 *            values are read.
 * output   - array of `channels` pointers, each to a buffer receiving
 *            `frames` floats for that channel.
 * channels - number of channels per frame.
 * frames   - number of frames to convert.
 *
 * When an optimized deinterleave_stereo() was compiled in, the two-channel
 * case is delegated to it. All other cases use the plain loop below, which
 * does nothing for non-positive channel or frame counts.
 */
void dsp_util_deinterleave(int16_t *input, float *const *output, int channels,
			   int frames)
{
	int frame, ch;

#ifdef deinterleave_stereo
	if (channels == 2) {
		deinterleave_stereo(input, output[0], output[1], frames);
		return;
	}
#endif

	/* Index the destination buffers directly instead of keeping a
	 * per-channel cursor in a variable-length array: a VLA sized by
	 * `channels` is undefined for channels <= 0 and puts an unbounded
	 * amount of data on the stack.
	 */
	for (frame = 0; frame < frames; frame++)
		for (ch = 0; ch < channels; ch++)
			output[ch][frame] = *input++ / 32768.0f;
}
396
/* Converts planar floats to interleaved S16 samples.
 *
 * Each float is scaled by 32768, rounded to nearest with ties rounding
 * away from zero, and clamped to the -32768 to 32767 range.
 *
 * input    - array of `channels` pointers, each to a buffer holding
 *            `frames` floats for that channel.
 * output   - receives channels * frames interleaved samples.
 * channels - number of channels per frame.
 * frames   - number of frames to convert.
 *
 * When an optimized interleave_stereo() was compiled in, the two-channel
 * case is delegated to it. All other cases use the plain loop below, which
 * does nothing for non-positive channel or frame counts.
 */
void dsp_util_interleave(float *const *input, int16_t *output, int channels,
			 int frames)
{
	int frame, ch;

#ifdef interleave_stereo
	if (channels == 2) {
		interleave_stereo(input[0], input[1], output, frames);
		return;
	}
#endif

	/* The source buffers are indexed directly instead of keeping a
	 * per-channel cursor in a variable-length array: a VLA sized by
	 * `channels` is undefined for channels <= 0 and puts an unbounded
	 * amount of data on the stack.
	 */
	for (frame = 0; frame < frames; frame++)
		for (ch = 0; ch < channels; ch++) {
			float f = input[ch][frame] * 32768.0f;

			f += (f >= 0) ? 0.5f : -0.5f;
			/* Clamp before converting: converting an
			 * out-of-range float to an integer is undefined.
			 * The negated comparison also routes NaN to the
			 * lower bound.
			 */
			*output++ = (f >= 32767.0f) ? 32767 :
				    !(f > -32768.0f) ? -32768 : (int16_t)f;
		}
}
420
/* Switches the floating point unit of the calling thread into
 * flush-to-zero mode, so that denormal numbers are replaced by zero.
 *
 * Takes no arguments and returns nothing. On architectures other than
 * x86, AArch64 and 32-bit ARM the function does nothing and a compile-time
 * warning is emitted.
 */
void dsp_enable_flush_denormal_to_zero(void)
{
#if defined(__i386__) || defined(__x86_64__)
	/* MXCSR: 0x8000 is FTZ (flush denormal results to zero) and 0x0040
	 * is DAZ (treat denormal operands as zero).
	 */
	unsigned int mxcsr;
	mxcsr = __builtin_ia32_stmxcsr();
	__builtin_ia32_ldmxcsr(mxcsr | 0x8040);
#elif defined(__aarch64__)
	/* FPCR bit 24 is FZ (flush-to-zero). */
	uint64_t cw;
	__asm__ __volatile__ (
		"mrs %0, fpcr \n"
		"orr %0, %0, #0x1000000 \n"
		"msr fpcr, %0 \n"
		"isb \n"
		: "=r"(cw) :: "memory");
#elif defined(__arm__)
	/* FPSCR bit 24 is FZ (flush-to-zero). */
	uint32_t cw;
	__asm__ __volatile__ (
		"vmrs %0, fpscr \n"
		"orr %0, %0, #0x1000000 \n"
		"vmsr fpscr, %0 \n"
		: "=r"(cw) :: "memory");
#else
#warning "Don't know how to disable denorms. Performance may suffer."
#endif
}
446