• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2013 The Chromium OS Authors. All rights reserved.
2  * Use of this source code is governed by a BSD-style license that can be
3  * found in the LICENSE file.
4  */
5 
6 #include "dsp_util.h"
7 
/* Larger of two values. Each argument is evaluated exactly once, so
 * arguments with side effects are safe. Relies on the GNU statement
 * expression and __typeof__ extensions.
 */
#ifndef max
#define max(a, b)					\
	({						\
		__typeof__(a) _max_lhs = (a);		\
		__typeof__(b) _max_rhs = (b);		\
		(_max_rhs < _max_lhs) ? _max_lhs : _max_rhs;	\
	})
#endif
13 
/* Smaller of two values. Each argument is evaluated exactly once, so
 * arguments with side effects are safe. Relies on the GNU statement
 * expression and __typeof__ extensions.
 */
#ifndef min
#define min(a, b)					\
	({						\
		__typeof__(a) _min_lhs = (a);		\
		__typeof__(b) _min_rhs = (b);		\
		(_min_rhs > _min_lhs) ? _min_lhs : _min_rhs;	\
	})
#endif
19 
20 #undef deinterleave_stereo
21 #undef interleave_stereo
22 
23 /* Converts shorts in range of -32768 to 32767 to floats in range of
24  * -1.0f to 1.0f.
25  * scvtf instruction accepts fixed point ints, so sxtl is used to lengthen
26  * shorts to int with sign extension.
27  */
28 #ifdef __aarch64__
/* AArch64 version: splits interleaved 16-bit stereo samples into two float
 * channels, scaling every sample by 1/32768.
 *
 * input holds the frames as L0 R0 L1 R1 ...; two samples are read per frame.
 * output1 receives the first sample of each frame and output2 the second.
 * frames is the number of stereo frames to convert.
 */
static void deinterleave_stereo(int16_t *input, float *output1,
				float *output2, int frames)
{
	/* chunk counts whole groups of 8 frames for the assembly loop; the
	 * low three bits of frames are left for the scalar loop at the end.
	 */
	int chunk = frames >> 3;
	frames &= 7;
	/* Process 8 frames (16 samples) each loop. */
	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
	/* The assembly loop decrements chunk before testing it, so it always
	 * runs at least once and must be skipped when chunk is 0.
	 */
	if (chunk) {
		__asm__ __volatile__ (
			"1:                                         \n"
			/* De-interleaving load: v2 = L0..L7, v3 = R0..R7. */
			"ld2  {v2.8h, v3.8h}, [%[input]], #32       \n"
			"subs %w[chunk], %w[chunk], #1              \n"
			/* Sign-extend to 32 bits. v2 and v3 are overwritten
			 * only by the instructions that read them last.
			 */
			"sxtl   v0.4s, v2.4h                        \n"
			"sxtl2  v1.4s, v2.8h                        \n"
			"sxtl   v2.4s, v3.4h                        \n"
			"sxtl2  v3.4s, v3.8h                        \n"
			/* Fixed point (15 fractional bits) to float, which
			 * divides by 32768.
			 */
			"scvtf  v0.4s, v0.4s, #15                   \n"
			"scvtf  v1.4s, v1.4s, #15                   \n"
			"scvtf  v2.4s, v2.4s, #15                   \n"
			"scvtf  v3.4s, v3.4s, #15                   \n"
			"st1    {v0.4s, v1.4s}, [%[output1]], #32   \n"
			"st1    {v2.4s, v3.4s}, [%[output2]], #32   \n"
			"b.ne   1b                                  \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input]"+r"(input),
			  [output1]"+r"(output1),
			  [output2]"+r"(output2)
			: /* input */
			: /* clobber */
			  "v0", "v1", "v2", "v3", "memory", "cc"
			);
	}

	/* The remaining samples. */
	/* The pointers were advanced by the assembly above, so this continues
	 * where the vector loop stopped.
	 */
	while (frames--) {
		*output1++ = *input++ / 32768.0f;
		*output2++ = *input++ / 32768.0f;
	}
}
69 #define deinterleave_stereo deinterleave_stereo
70 
/* Converts floats in range of -1.0f to 1.0f to shorts in range of
 * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away
 * from zero.
 * Rounding is achieved by using the fcvtas instruction. (a = away)
 * The float is scaled to a range of -32768 to 32767 by adding 15 to the
 * exponent.
 * Adding to the exponent is equivalent to multiplying for the exponent range
 * of 0 to 239, which is 2.59 * 10^33.  A signed saturating add (sqadd) limits
 * exponents from 240 to 255 to clamp to 255.
 * For very large values, beyond +/- 2 billion, fcvtas will clamp the result
 * to the min or max value that fits an int.
 * For other values, sqxtn clamps the output to -32768 to 32767 range.
 *
 * input1 and input2 supply one float per frame each; output receives two
 * shorts per frame, the input1 sample first. frames is the number of frames.
 */
static void interleave_stereo(float *input1, float *input2,
			      int16_t *output, int frames)
{
	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	int chunk = frames >> 2;
	frames &= 3;

	/* The assembly loop decrements chunk before testing it, so it always
	 * runs at least once and must be skipped when chunk is 0.
	 */
	if (chunk) {
		__asm__ __volatile__ (
			/* v2 holds the exponent increment in every lane. */
			"dup    v2.4s, %w[scale]                    \n"
			"1:                                         \n"
			"ld1    {v0.4s}, [%[input1]], #16           \n"
			"ld1    {v1.4s}, [%[input2]], #16           \n"
			"subs   %w[chunk], %w[chunk], #1            \n"
			/* Integer add on the float bit patterns. */
			"sqadd  v0.4s, v0.4s, v2.4s                 \n"
			"sqadd  v1.4s, v1.4s, v2.4s                 \n"
			"fcvtas v0.4s, v0.4s                        \n"
			"fcvtas v1.4s, v1.4s                        \n"
			"sqxtn  v0.4h, v0.4s                        \n"
			"sqxtn  v1.4h, v1.4s                        \n"
			/* Interleaving store of the two channels. */
			"st2    {v0.4h, v1.4h}, [%[output]], #16    \n"
			"b.ne   1b                                  \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input1]"+r"(input1),
			  [input2]"+r"(input2),
			  [output]"+r"(output)
			: /* input */
			  /* 15 placed in the exponent field of a float. */
			  [scale]"r"(15 << 23)
			: /* clobber */
			  "v0", "v1", "v2", "memory", "cc"
			);
	}

	/* The remaining samples */
	/* Scalar equivalent: scale, add +/-0.5 so that the truncating cast
	 * rounds ties away from zero, then clamp to the 16-bit range.
	 */
	while (frames--) {
		float f;
		f = *input1++ * 32768.0f;
		f += (f >= 0) ? 0.5f : -0.5f;
		*output++ = max(-32768, min(32767, (int)(f)));
		f = *input2++ * 32768.0f;
		f += (f >= 0) ? 0.5f : -0.5f;
		*output++ = max(-32768, min(32767, (int)(f)));
	}
}
129 #define interleave_stereo interleave_stereo
130 #endif
131 
132 #ifdef __ARM_NEON__
133 #include <arm_neon.h>
134 
/* ARM NEON version: splits interleaved 16-bit stereo samples into two float
 * channels, scaling every sample by 1/32768.
 *
 * input holds the frames as L0 R0 L1 R1 ...; two samples are read per frame.
 * output1 receives the first sample of each frame and output2 the second.
 * frames is the number of stereo frames to convert.
 */
static void deinterleave_stereo(int16_t *input, float *output1,
				float *output2, int frames)
{
	/* Process 8 frames (16 samples) each loop. */
	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
	int chunk = frames >> 3;
	frames &= 7;
	/* The assembly loop decrements chunk before testing it, so it always
	 * runs at least once and must be skipped when chunk is 0.
	 */
	if (chunk) {
		__asm__ __volatile__ (
			"1:					    \n"
			/* De-interleaving load: d0-d1 = L0..L7,
			 * d2-d3 = R0..R7.
			 */
			"vld2.16 {d0-d3}, [%[input]]!		    \n"
			"subs %[chunk], #1			    \n"
			/* Widen from the highest source register down: q1
			 * and q0 overlap d2-d3 and d0-d1, so each source is
			 * read before it is overwritten.
			 */
			"vmovl.s16 q3, d3			    \n"
			"vmovl.s16 q2, d2			    \n"
			"vmovl.s16 q1, d1			    \n"
			"vmovl.s16 q0, d0			    \n"
			/* Fixed point (15 fractional bits) to float, which
			 * divides by 32768.
			 */
			"vcvt.f32.s32 q3, q3, #15		    \n"
			"vcvt.f32.s32 q2, q2, #15		    \n"
			"vcvt.f32.s32 q1, q1, #15		    \n"
			"vcvt.f32.s32 q0, q0, #15		    \n"
			"vst1.32 {d4-d7}, [%[output2]]!		    \n"
			"vst1.32 {d0-d3}, [%[output1]]!		    \n"
			"bne 1b					    \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input]"+r"(input),
			  [output1]"+r"(output1),
			  [output2]"+r"(output2)
			: /* input */
			: /* clobber */
			  "q0", "q1", "q2", "q3", "memory", "cc"
			);
	}

	/* The remaining samples. */
	/* The pointers were advanced by the assembly above, so this continues
	 * where the vector loop stopped.
	 */
	while (frames--) {
		*output1++ = *input++ / 32768.0f;
		*output2++ = *input++ / 32768.0f;
	}
}
175 #define deinterleave_stereo deinterleave_stereo
176 
/* Converts floats in range of -1.0f to 1.0f to shorts in range of
 * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away
 * from zero.
 * Rounding is achieved by adding 0.5 or -0.5 adjusted for fixed point
 * precision, and then converting float to fixed point using the vcvt
 * instruction, which truncates toward zero.
 * For very large values, beyond +/- 2 billion, vcvt will clamp the result
 * to the min or max value that fits an int.
 * For other values, vqmovn clamps the output to -32768 to 32767 range.
 *
 * input1 and input2 supply one float per frame each; output receives two
 * shorts per frame, the input1 sample first. frames is the number of frames.
 */
static void interleave_stereo(float *input1, float *input2,
			      int16_t *output, int frames)
{
	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	/* Rounding biases, pre-divided by 32768 because they are added
	 * before the conversion scales the value up.
	 */
	float32x4_t pos = vdupq_n_f32(0.5f / 32768.0f);
	float32x4_t neg = vdupq_n_f32(-0.5f / 32768.0f);
	int chunk = frames >> 2;
	frames &= 3;

	/* The assembly loop decrements chunk before testing it, so it always
	 * runs at least once and must be skipped when chunk is 0.
	 */
	if (chunk) {
		__asm__ __volatile__ (
			/* q0 = 0.0 in every lane, used for the sign test. */
			"veor q0, q0, q0			    \n"
			"1:					    \n"
			"vld1.32 {d2-d3}, [%[input1]]!		    \n"
			"vld1.32 {d4-d5}, [%[input2]]!		    \n"
			"subs %[chunk], #1			    \n"
			/* We try to round to the nearest number by adding 0.5
			 * to positive input, and adding -0.5 to the negative
			 * input, then truncate.
			 */
			"vcgt.f32 q3, q1, q0			    \n"
			"vcgt.f32 q4, q2, q0			    \n"
			/* Per lane: take pos where the compare was true,
			 * neg otherwise.
			 */
			"vbsl q3, %q[pos], %q[neg]		    \n"
			"vbsl q4, %q[pos], %q[neg]		    \n"
			"vadd.f32 q1, q1, q3			    \n"
			"vadd.f32 q2, q2, q4			    \n"
			/* Float to fixed point with 15 fractional bits. */
			"vcvt.s32.f32 q1, q1, #15		    \n"
			"vcvt.s32.f32 q2, q2, #15		    \n"
			"vqmovn.s32 d2, q1			    \n"
			"vqmovn.s32 d3, q2			    \n"
			/* Interleaving store of the two channels. */
			"vst2.16 {d2-d3}, [%[output]]!		    \n"
			"bne 1b					    \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input1]"+r"(input1),
			  [input2]"+r"(input2),
			  [output]"+r"(output)
			: /* input */
			  [pos]"w"(pos),
			  [neg]"w"(neg)
			: /* clobber */
			  "q0", "q1", "q2", "q3", "q4", "memory", "cc"
			);
	}

	/* The remaining samples */
	/* Scalar equivalent: scale, add +/-0.5 so that the truncating cast
	 * rounds ties away from zero, then clamp to the 16-bit range.
	 */
	while (frames--) {
		float f;
		f = *input1++ * 32768.0f;
		f += (f >= 0) ? 0.5f : -0.5f;
		*output++ = max(-32768, min(32767, (int)(f)));
		f = *input2++ * 32768.0f;
		f += (f >= 0) ? 0.5f : -0.5f;
		*output++ = max(-32768, min(32767, (int)(f)));
	}
}
244 #define interleave_stereo interleave_stereo
245 #endif
246 
247 #ifdef __SSE3__
248 #include <emmintrin.h>
249 
/* SSE version: splits interleaved 16-bit stereo samples (-32768 to 32767)
 * into two float channels in the range -1.0f to 1.0f.
 *
 * Each 32-bit lane of a loaded vector holds one frame, with the first sample
 * in the low word and the second sample in the high word:
 *  - pslld moves the low word into the high bits, leaving a value that is
 *    65536 times the sample.
 *  - psrad moves the high word into the low bits, leaving the plain sample.
 * cvtdq2ps converts both integers as they are, and mulps applies a different
 * factor to each (2^-31 and 2^-15) so both land in the same float range.
 *
 * output1 receives the first sample of each frame, output2 the second.
 */
static void deinterleave_stereo(int16_t *input, float *output1,
				float *output2, int frames)
{
	/* Normalizing factors for the low-word and high-word paths. */
	const __m128 low_word_scale = _mm_set1_ps(1.0f/(1<<15)/(1<<16));
	const __m128 high_word_scale = _mm_set1_ps(1.0f/(1<<15));
	/* Whole groups of 8 frames (16 samples) go through the vector loop:
	 * L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3...
	 */
	int groups = frames >> 3;
	int leftover = frames & 7;
	int i;

	/* The loop tests its counter after decrementing it, so it must not be
	 * entered with a zero count.
	 */
	if (groups != 0) {
		__asm__ __volatile__ (
			"1:                                         \n"
			"lddqu (%[input]), %%xmm0                   \n"
			"lddqu 16(%[input]), %%xmm1                 \n"
			"add $32, %[input]                          \n"
			"movdqa %%xmm0, %%xmm2                      \n"
			"movdqa %%xmm1, %%xmm3                      \n"
			"pslld $16, %%xmm0                          \n"
			"pslld $16, %%xmm1                          \n"
			"psrad $16, %%xmm2                          \n"
			"psrad $16, %%xmm3                          \n"
			"cvtdq2ps %%xmm0, %%xmm0                    \n"
			"cvtdq2ps %%xmm1, %%xmm1                    \n"
			"cvtdq2ps %%xmm2, %%xmm2                    \n"
			"cvtdq2ps %%xmm3, %%xmm3                    \n"
			"mulps %[scale_2_n31], %%xmm0               \n"
			"mulps %[scale_2_n31], %%xmm1               \n"
			"mulps %[scale_2_n15], %%xmm2               \n"
			"mulps %[scale_2_n15], %%xmm3               \n"
			"movdqu %%xmm0, (%[output1])                \n"
			"movdqu %%xmm1, 16(%[output1])              \n"
			"movdqu %%xmm2, (%[output2])                \n"
			"movdqu %%xmm3, 16(%[output2])              \n"
			"add $32, %[output1]                        \n"
			"add $32, %[output2]                        \n"
			"sub $1, %[chunk]                           \n"
			"jnz 1b                                     \n"
			: /* output */
			  [chunk]"+r"(groups),
			  [input]"+r"(input),
			  [output1]"+r"(output1),
			  [output2]"+r"(output2)
			: /* input */
			  [scale_2_n31]"x"(low_word_scale),
			  [scale_2_n15]"x"(high_word_scale)
			: /* clobber */
			  "xmm0", "xmm1", "xmm2", "xmm3", "memory", "cc"
			);
	}

	/* Scalar conversion of the frames that did not fill a group. The
	 * pointers already point past everything the vector loop consumed.
	 */
	for (i = 0; i < leftover; i++) {
		output1[i] = input[2 * i] / 32768.0f;
		output2[i] = input[2 * i + 1] / 32768.0f;
	}
}
314 #define deinterleave_stereo deinterleave_stereo
315 
/* Scalar conversion used for the frames left over after the vector loop.
 * Scales by 32768, rounds to nearest with ties away from zero, and clamps to
 * -32768 .. 32767. The clamp is done on the float so that the integer
 * conversion never sees a value it cannot represent; NaN is sent to 32767,
 * which is what the minps clamp in the vector loop produces for it.
 */
static int16_t interleave_stereo_tail_sample(float sample)
{
	float f = sample * 32768.0f;

	f += (f >= 0) ? 0.5f : -0.5f;
	/* Negated comparison on purpose: it is also true for NaN. */
	if (!(f < 32767.0f))
		return 32767;
	if (f <= -32768.0f)
		return -32768;
	return (int16_t)f;
}

/* Converts floats in range of -1.0f to 1.0f to shorts in range of
 * -32768 to 32767. The vector loop rounds to nearest with ties (0.5)
 * rounding to even (cvtps2dq under the default rounding mode); the scalar
 * tail rounds ties away from zero.
 * The float is scaled by adding 15 to its exponent with paddsw.
 * Positive values are limited to 32767.0f with minps before the conversion,
 * so large positive inputs saturate to 32767 instead of turning into
 * 0x80000000 in cvtps2dq and coming out as -32768. minps returns its source
 * operand when the other one is NaN, so NaN also becomes 32767.
 * For very large negative values cvtps2dq produces 0x80000000 and packssdw
 * clamps that to -32768.
 *
 * input1 and input2 supply one float per frame each; output receives two
 * shorts per frame, the input1 sample first. frames is the number of frames.
 */
static void interleave_stereo(float *input1, float *input2,
			      int16_t *output, int frames)
{
	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	int chunk = frames >> 2;
	frames &= 3;

	/* The assembly loop decrements chunk before testing it, so it always
	 * runs at least once and must be skipped when chunk is 0.
	 */
	if (chunk) {
		__asm__ __volatile__ (
			"1:                                         \n"
			"lddqu (%[input1]), %%xmm0                  \n"
			"lddqu (%[input2]), %%xmm2                  \n"
			"add $16, %[input1]                         \n"
			"add $16, %[input2]                         \n"
			"movaps %%xmm0, %%xmm1                      \n"
			"unpcklps %%xmm2, %%xmm0                    \n"
			"unpckhps %%xmm2, %%xmm1                    \n"
			"paddsw %[scale_2_15], %%xmm0               \n"
			"paddsw %[scale_2_15], %%xmm1               \n"
			/* Upper clamp; also replaces NaN by 32767.0f. */
			"minps %[clamp_large], %%xmm0               \n"
			"minps %[clamp_large], %%xmm1               \n"
			"cvtps2dq %%xmm0, %%xmm0                    \n"
			"cvtps2dq %%xmm1, %%xmm1                    \n"
			"packssdw %%xmm1, %%xmm0                    \n"
			"movdqu %%xmm0, (%[output])                 \n"
			"add $16, %[output]                         \n"
			"sub $1, %[chunk]                           \n"
			"jnz 1b                                     \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input1]"+r"(input1),
			  [input2]"+r"(input2),
			  [output]"+r"(output)
			: /* input */
			  /* 15 placed in the exponent field of a float. */
			  [scale_2_15]"x"(_mm_set1_epi32(15 << 23)),
			  [clamp_large]"x"(_mm_set1_ps(32767.0f))
			: /* clobber */
			  "xmm0", "xmm1", "xmm2", "memory", "cc"
			);
	}

	/* The remaining samples */
	while (frames--) {
		*output++ = interleave_stereo_tail_sample(*input1++);
		*output++ = interleave_stereo_tail_sample(*input2++);
	}
}
373 #define interleave_stereo interleave_stereo
374 #endif
375 
/* Splits interleaved 16-bit samples into one float buffer per channel,
 * scaling every sample by 1/32768.
 *
 * input holds channels * frames samples, frame by frame. output is an array
 * of channels pointers; output[j] receives frames floats for channel j.
 * When a stereo fast path was compiled in (deinterleave_stereo is defined),
 * two-channel input is handed to it.
 * Nothing is read or written when channels or frames is not positive.
 */
void dsp_util_deinterleave(int16_t *input, float *const *output, int channels,
			   int frames)
{
	int i, j;

#ifdef deinterleave_stereo
	if (channels == 2) {
		deinterleave_stereo(input, output[0], output[1], frames);
		return;
	}
#endif

	/* The destinations are indexed directly, so no per-call array of
	 * cursor pointers sized by the caller-supplied channel count is
	 * needed.
	 */
	for (i = 0; i < frames; i++)
		for (j = 0; j < channels; j++)
			output[j][i] = *input++ / 32768.0f;
}
396 
/* Converts one float sample to a 16-bit sample: scales by 32768, rounds to
 * nearest with ties away from zero, and clamps to -32768 .. 32767.
 * The clamp is applied to the float so that the integer conversion never
 * sees a value it cannot represent. NaN becomes 0.
 */
static int16_t interleave_sample_to_s16(float sample)
{
	float f = sample * 32768.0f;

	f += (f >= 0) ? 0.5f : -0.5f;
	if (f >= 32767.0f)
		return 32767;
	if (f <= -32768.0f)
		return -32768;
	/* Only NaN compares unequal to itself. */
	if (f != f)
		return 0;
	return (int16_t)f;
}

/* Merges one float buffer per channel into interleaved 16-bit samples.
 *
 * input is an array of channels pointers; input[j] supplies frames floats for
 * channel j. output receives channels * frames samples, frame by frame.
 * When a stereo fast path was compiled in (interleave_stereo is defined),
 * two-channel input is handed to it.
 * Nothing is read or written when channels or frames is not positive.
 */
void dsp_util_interleave(float *const *input, int16_t *output, int channels,
			 int frames)
{
	int i, j;

#ifdef interleave_stereo
	if (channels == 2) {
		interleave_stereo(input[0], input[1], output, frames);
		return;
	}
#endif

	/* The sources are indexed directly, so no per-call array of cursor
	 * pointers sized by the caller-supplied channel count is needed.
	 */
	for (i = 0; i < frames; i++)
		for (j = 0; j < channels; j++)
			*output++ = interleave_sample_to_s16(input[j][i]);
}
420 
/* Turns on flush-to-zero handling of denormal floating-point numbers in the
 * floating-point control register of the architecture being compiled for.
 * On architectures other than x86 and ARM the function does nothing and a
 * compile-time warning is issued.
 */
void dsp_enable_flush_denormal_to_zero(void)
{
#if defined(__i386__) || defined(__x86_64__)
	unsigned int mxcsr;
	/* MXCSR: 0x8000 is flush-to-zero, 0x0040 is denormals-are-zero. */
	mxcsr = __builtin_ia32_stmxcsr();
	__builtin_ia32_ldmxcsr(mxcsr | 0x8040);
#elif defined(__aarch64__)
	uint64_t cw;
	/* Read FPCR, set bit 24 (flush-to-zero), and write it back. */
	__asm__ __volatile__ (
		"mrs    %0, fpcr			    \n"
		"orr    %0, %0, #0x1000000		    \n"
		"msr    fpcr, %0			    \n"
		"isb					    \n"
		: "=r"(cw) :: "memory");
#elif defined(__arm__)
	uint32_t cw;
	/* Read FPSCR, set bit 24 (flush-to-zero), and write it back. */
	__asm__ __volatile__ (
		"vmrs   %0, fpscr			    \n"
		"orr    %0, %0, #0x1000000		    \n"
		"vmsr   fpscr, %0			    \n"
		: "=r"(cw) :: "memory");
#else
#warning "Don't know how to disable denorms. Performance may suffer."
#endif
}
446