1 /* Copyright 2013 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6 #include <limits.h>
7 #include <syslog.h>
8
9 #include "dsp_util.h"
10
#ifndef max
/* Type-generic maximum. Each argument is evaluated exactly once; when the
 * comparison is false (including equal operands) the second one is returned.
 */
#define max(a, b)                                \
	({                                       \
		__typeof__(a) _lhs = (a);        \
		__typeof__(b) _rhs = (b);        \
		(_lhs > _rhs) ? _lhs : _rhs;     \
	})
#endif
16
#ifndef min
/* Type-generic minimum. Each argument is evaluated exactly once; when the
 * comparison is false (including equal operands) the second one is returned.
 */
#define min(a, b)                                \
	({                                       \
		__typeof__(a) _lhs = (a);        \
		__typeof__(b) _rhs = (b);        \
		(_lhs < _rhs) ? _lhs : _rhs;     \
	})
#endif
22
23 #undef deinterleave_stereo
24 #undef interleave_stereo
25
26 /* Converts shorts in range of -32768 to 32767 to floats in range of
27 * -1.0f to 1.0f.
28 * scvtf instruction accepts fixed point ints, so sxtl is used to lengthen
29 * shorts to int with sign extension.
30 */
31 #ifdef __aarch64__
deinterleave_stereo(int16_t * input,float * output1,float * output2,int frames)32 static void deinterleave_stereo(int16_t *input, float *output1,
33 float *output2, int frames)
34 {
35 int chunk = frames >> 3;
36 frames &= 7;
37 /* Process 8 frames (16 samples) each loop. */
38 /* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
39 if (chunk) {
40 __asm__ __volatile__ (
41 "1: \n"
42 "ld2 {v2.8h, v3.8h}, [%[input]], #32 \n"
43 "subs %w[chunk], %w[chunk], #1 \n"
44 "sxtl v0.4s, v2.4h \n"
45 "sxtl2 v1.4s, v2.8h \n"
46 "sxtl v2.4s, v3.4h \n"
47 "sxtl2 v3.4s, v3.8h \n"
48 "scvtf v0.4s, v0.4s, #15 \n"
49 "scvtf v1.4s, v1.4s, #15 \n"
50 "scvtf v2.4s, v2.4s, #15 \n"
51 "scvtf v3.4s, v3.4s, #15 \n"
52 "st1 {v0.4s, v1.4s}, [%[output1]], #32 \n"
53 "st1 {v2.4s, v3.4s}, [%[output2]], #32 \n"
54 "b.ne 1b \n"
55 : /* output */
56 [chunk]"+r"(chunk),
57 [input]"+r"(input),
58 [output1]"+r"(output1),
59 [output2]"+r"(output2)
60 : /* input */
61 : /* clobber */
62 "v0", "v1", "v2", "v3", "memory", "cc"
63 );
64 }
65
66 /* The remaining samples. */
67 while (frames--) {
68 *output1++ = *input++ / 32768.0f;
69 *output2++ = *input++ / 32768.0f;
70 }
71 }
72 #define deinterleave_stereo deinterleave_stereo
73
74 /* Converts floats in range of -1.0f to 1.0f to shorts in range of
75 * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away
76 * from zero.
77 * Rounding is achieved by using fcvtas instruction. (a = away)
78 * The float scaled to a range of -32768 to 32767 by adding 15 to the exponent.
79 * Add to exponent is equivalent to multiply for exponent range of 0 to 239,
80 * which is 2.59 * 10^33. A signed saturating add (sqadd) limits exponents
81 * from 240 to 255 to clamp to 255.
82 * For very large values, beyond +/- 2 billion, fcvtas will clamp the result
83 * to the min or max value that fits an int.
84 * For other values, sqxtn clamps the output to -32768 to 32767 range.
85 */
interleave_stereo(float * input1,float * input2,int16_t * output,int frames)86 static void interleave_stereo(float *input1, float *input2,
87 int16_t *output, int frames)
88 {
89 /* Process 4 frames (8 samples) each loop. */
90 /* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
91 int chunk = frames >> 2;
92 frames &= 3;
93
94 if (chunk) {
95 __asm__ __volatile__ (
96 "dup v2.4s, %w[scale] \n"
97 "1: \n"
98 "ld1 {v0.4s}, [%[input1]], #16 \n"
99 "ld1 {v1.4s}, [%[input2]], #16 \n"
100 "subs %w[chunk], %w[chunk], #1 \n"
101 "sqadd v0.4s, v0.4s, v2.4s \n"
102 "sqadd v1.4s, v1.4s, v2.4s \n"
103 "fcvtas v0.4s, v0.4s \n"
104 "fcvtas v1.4s, v1.4s \n"
105 "sqxtn v0.4h, v0.4s \n"
106 "sqxtn v1.4h, v1.4s \n"
107 "st2 {v0.4h, v1.4h}, [%[output]], #16 \n"
108 "b.ne 1b \n"
109 : /* output */
110 [chunk]"+r"(chunk),
111 [input1]"+r"(input1),
112 [input2]"+r"(input2),
113 [output]"+r"(output)
114 : /* input */
115 [scale]"r"(15 << 23)
116 : /* clobber */
117 "v0", "v1", "v2", "memory", "cc"
118 );
119 }
120
121 /* The remaining samples */
122 while (frames--) {
123 float f;
124 f = *input1++ * 32768.0f;
125 f += (f >= 0) ? 0.5f : -0.5f;
126 *output++ = max(-32768, min(32767, (int)(f)));
127 f = *input2++ * 32768.0f;
128 f += (f >= 0) ? 0.5f : -0.5f;
129 *output++ = max(-32768, min(32767, (int)(f)));
130 }
131 }
132 #define interleave_stereo interleave_stereo
133 #endif
134
135 #ifdef __ARM_NEON__
136 #include <arm_neon.h>
137
/* ARM NEON version: splits interleaved stereo 16-bit samples into two float
 * channels, scaling each sample by 1/32768.
 *
 * input:   2 * frames interleaved samples (L0 R0 L1 R1 ...)
 * output1: receives frames floats taken from the even sample slots
 * output2: receives frames floats taken from the odd sample slots
 */
static void deinterleave_stereo(int16_t *input, float *output1,
				float *output2, int frames)
{
	/* Vector loop count: 8 frames (16 samples) are converted per pass. */
	int blocks = frames >> 3;

	/* Frames left over for the scalar loop at the end. */
	frames &= 7;

	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
	if (blocks) {
		__asm__ __volatile__ (
			"1: \n"
			"vld2.16 {d0-d3}, [%[input]]! \n"
			"subs %[chunk], #1 \n"
			"vmovl.s16 q3, d3 \n"
			"vmovl.s16 q2, d2 \n"
			"vmovl.s16 q1, d1 \n"
			"vmovl.s16 q0, d0 \n"
			"vcvt.f32.s32 q3, q3, #15 \n"
			"vcvt.f32.s32 q2, q2, #15 \n"
			"vcvt.f32.s32 q1, q1, #15 \n"
			"vcvt.f32.s32 q0, q0, #15 \n"
			"vst1.32 {d4-d7}, [%[output2]]! \n"
			"vst1.32 {d0-d3}, [%[output1]]! \n"
			"bne 1b \n"
			: /* output */
			  [chunk]"+r"(blocks),
			  [input]"+r"(input),
			  [output1]"+r"(output1),
			  [output2]"+r"(output2)
			: /* input */
			: /* clobber */
			  "q0", "q1", "q2", "q3", "memory", "cc"
			);
	}

	/* Scalar path for the remaining (at most 7) frames. */
	for (; frames > 0; frames--, input += 2) {
		*output1++ = input[0] / 32768.0f;
		*output2++ = input[1] / 32768.0f;
	}
}
178 #define deinterleave_stereo deinterleave_stereo
179
/* ARM NEON version: converts two float channels in range of -1.0f to 1.0f
 * to interleaved shorts in range of -32768 to 32767, rounding to nearest
 * with ties (0.5) rounding away from zero.
 *
 * Vector loop:
 * - Rounding is achieved by adding 0.5 or -0.5, pre-divided by 32768 to
 *   match the fixed point precision, and then converting float to fixed
 *   point using the vcvt instruction, which truncates toward zero.
 * - vqmovn narrows to 16 bits with saturation, clamping the output to the
 *   -32768 to 32767 range.
 *
 * Scalar tail (frames % 4): multiplies by 32768, adds +/-0.5 and truncates.
 * The value is clamped while still a float so that out-of-range input never
 * reaches the float-to-integer conversion, which would be undefined
 * behaviour in C.
 *
 * input1:  frames floats for the first (left) channel
 * input2:  frames floats for the second (right) channel
 * output:  receives 2 * frames interleaved samples
 */
static void interleave_stereo(float *input1, float *input2,
			      int16_t *output, int frames)
{
	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	float32x4_t pos = vdupq_n_f32(0.5f / 32768.0f);
	float32x4_t neg = vdupq_n_f32(-0.5f / 32768.0f);
	int chunk = frames >> 2;
	frames &= 3;

	if (chunk) {
		__asm__ __volatile__ (
			"veor q0, q0, q0 \n"
			"1: \n"
			"vld1.32 {d2-d3}, [%[input1]]! \n"
			"vld1.32 {d4-d5}, [%[input2]]! \n"
			"subs %[chunk], #1 \n"
			/* We try to round to the nearest number by adding 0.5
			 * to positive input, and adding -0.5 to the negative
			 * input, then truncate.
			 */
			"vcgt.f32 q3, q1, q0 \n"
			"vcgt.f32 q4, q2, q0 \n"
			"vbsl q3, %q[pos], %q[neg] \n"
			"vbsl q4, %q[pos], %q[neg] \n"
			"vadd.f32 q1, q1, q3 \n"
			"vadd.f32 q2, q2, q4 \n"
			"vcvt.s32.f32 q1, q1, #15 \n"
			"vcvt.s32.f32 q2, q2, #15 \n"
			"vqmovn.s32 d2, q1 \n"
			"vqmovn.s32 d3, q2 \n"
			"vst2.16 {d2-d3}, [%[output]]! \n"
			"bne 1b \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input1]"+r"(input1),
			  [input2]"+r"(input2),
			  [output]"+r"(output)
			: /* input */
			  [pos]"w"(pos),
			  [neg]"w"(neg)
			: /* clobber */
			  "q0", "q1", "q2", "q3", "q4", "memory", "cc"
			);
	}

	/* The remaining samples */
	while (frames--) {
		const float pair[2] = { *input1++, *input2++ };
		int ch;

		for (ch = 0; ch < 2; ch++) {
			float f = pair[ch] * 32768.0f;

			f += (f >= 0) ? 0.5f : -0.5f;
			if (f >= 32767.0f)
				*output++ = 32767;
			else if (f > -32768.0f)
				*output++ = (int16_t)(int)f;
			else /* too negative, or NaN */
				*output++ = -32768;
		}
	}
}
247 #define interleave_stereo interleave_stereo
248 #endif
249
250 #ifdef __SSE3__
251 #include <emmintrin.h>
252
/* SSE version: converts interleaved stereo shorts in range of -32768 to
 * 32767 into two channels of floats in range of -1.0f to 1.0f.
 *
 * Each 32-bit lane of the input holds one frame (two 16-bit samples).
 * pslld and psrad isolate the two halves of the lane, leaving them at
 * different magnitudes:
 *   - the low word is moved into the high bits (sample * 65536);
 *   - the high word is moved into the low bits with sign extension.
 * cvtdq2ps converts the ints to floats as is, and mulps then normalizes
 * both, using 2^-31 for the low-word lanes and 2^-15 for the high-word
 * lanes.
 *
 * input:   2 * frames interleaved samples (L0 R0 L1 R1 ...)
 * output1: receives frames floats taken from the even sample slots
 * output2: receives frames floats taken from the odd sample slots
 */
static void deinterleave_stereo(int16_t *input, float *output1,
				float *output2, int frames)
{
	/* Vector loop count: 8 frames (16 samples) are converted per pass. */
	int blocks = frames >> 3;

	/* Frames left over for the scalar loop at the end. */
	frames &= 7;

	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
	if (blocks) {
		__asm__ __volatile__ (
			"1: \n"
			"lddqu (%[input]), %%xmm0 \n"
			"lddqu 16(%[input]), %%xmm1 \n"
			"add $32, %[input] \n"
			"movdqa %%xmm0, %%xmm2 \n"
			"movdqa %%xmm1, %%xmm3 \n"
			"pslld $16, %%xmm0 \n"
			"pslld $16, %%xmm1 \n"
			"psrad $16, %%xmm2 \n"
			"psrad $16, %%xmm3 \n"
			"cvtdq2ps %%xmm0, %%xmm0 \n"
			"cvtdq2ps %%xmm1, %%xmm1 \n"
			"cvtdq2ps %%xmm2, %%xmm2 \n"
			"cvtdq2ps %%xmm3, %%xmm3 \n"
			"mulps %[scale_2_n31], %%xmm0 \n"
			"mulps %[scale_2_n31], %%xmm1 \n"
			"mulps %[scale_2_n15], %%xmm2 \n"
			"mulps %[scale_2_n15], %%xmm3 \n"
			"movdqu %%xmm0, (%[output1]) \n"
			"movdqu %%xmm1, 16(%[output1]) \n"
			"movdqu %%xmm2, (%[output2]) \n"
			"movdqu %%xmm3, 16(%[output2]) \n"
			"add $32, %[output1] \n"
			"add $32, %[output2] \n"
			"sub $1, %[chunk] \n"
			"jnz 1b \n"
			: /* output */
			  [chunk]"+r"(blocks),
			  [input]"+r"(input),
			  [output1]"+r"(output1),
			  [output2]"+r"(output2)
			: /* input */
			  [scale_2_n31]"x"(_mm_set1_ps(1.0f/(1<<15)/(1<<16))),
			  [scale_2_n15]"x"(_mm_set1_ps(1.0f/(1<<15)))
			: /* clobber */
			  "xmm0", "xmm1", "xmm2", "xmm3", "memory", "cc"
			);
	}

	/* Scalar path for the remaining (at most 7) frames. */
	for (; frames > 0; frames--, input += 2) {
		*output1++ = input[0] / 32768.0f;
		*output2++ = input[1] / 32768.0f;
	}
}
317 #define deinterleave_stereo deinterleave_stereo
318
/* SSE version: converts two float channels in range of -1.0f to 1.0f to
 * interleaved shorts in range of -32768 to 32767.
 *
 * Vector loop:
 * - unpcklps/unpckhps interleave the two channels.
 * - The floats are scaled by 32768 by adding 15 to the exponent field
 *   (paddsw with 15 << 23 in every 32-bit lane).
 * - minps clamps the scaled value to 32767.0f. Without it, values too
 *   large for an int make cvtps2dq produce 0x80000000, which packssdw
 *   would then pack as -32768, flipping loud positive samples to the most
 *   negative value.
 * - cvtps2dq converts to int with rounding to nearest, ties (0.5) rounding
 *   to even.
 * - packssdw narrows to 16 bits with signed saturation. Large negative
 *   values that cvtps2dq turns into 0x80000000 correctly pack as -32768.
 *
 * Scalar tail (frames % 4): multiplies by 32768, adds +/-0.5 and truncates,
 * so ties round away from zero there. The value is clamped while still a
 * float so that out-of-range input never reaches the float-to-integer
 * conversion, which would be undefined behaviour in C.
 *
 * input1:  frames floats for the first (left) channel
 * input2:  frames floats for the second (right) channel
 * output:  receives 2 * frames interleaved samples
 */
static void interleave_stereo(float *input1, float *input2,
			      int16_t *output, int frames)
{
	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	int chunk = frames >> 2;
	frames &= 3;

	if (chunk) {
		__asm__ __volatile__ (
			"1: \n"
			"lddqu (%[input1]), %%xmm0 \n"
			"lddqu (%[input2]), %%xmm2 \n"
			"add $16, %[input1] \n"
			"add $16, %[input2] \n"
			"movaps %%xmm0, %%xmm1 \n"
			"unpcklps %%xmm2, %%xmm0 \n"
			"unpckhps %%xmm2, %%xmm1 \n"
			"paddsw %[scale_2_15], %%xmm0 \n"
			"paddsw %[scale_2_15], %%xmm1 \n"
			"minps %[clamp_large], %%xmm0 \n"
			"minps %[clamp_large], %%xmm1 \n"
			"cvtps2dq %%xmm0, %%xmm0 \n"
			"cvtps2dq %%xmm1, %%xmm1 \n"
			"packssdw %%xmm1, %%xmm0 \n"
			"movdqu %%xmm0, (%[output]) \n"
			"add $16, %[output] \n"
			"sub $1, %[chunk] \n"
			"jnz 1b \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input1]"+r"(input1),
			  [input2]"+r"(input2),
			  [output]"+r"(output)
			: /* input */
			  [scale_2_15]"x"(_mm_set1_epi32(15 << 23)),
			  [clamp_large]"x"(_mm_set1_ps(32767.0f))
			: /* clobber */
			  "xmm0", "xmm1", "xmm2", "memory", "cc"
			);
	}

	/* The remaining samples */
	while (frames--) {
		const float pair[2] = { *input1++, *input2++ };
		int ch;

		for (ch = 0; ch < 2; ch++) {
			float f = pair[ch] * 32768.0f;

			f += (f >= 0) ? 0.5f : -0.5f;
			if (f >= 32767.0f)
				*output++ = 32767;
			else if (f > -32768.0f)
				*output++ = (int16_t)(int)f;
			else /* too negative, or NaN */
				*output++ = -32768;
		}
	}
}
376 #define interleave_stereo interleave_stereo
377 #endif
378
/* Splits interleaved 16-bit samples into per-channel float buffers, scaling
 * each sample by 1/32768.
 *
 * input:    frames * channels interleaved samples
 * output:   channels pointers, each to a buffer of at least frames floats
 * channels: number of interleaved channels
 * frames:   number of frames to convert
 *
 * When a deinterleave_stereo implementation has been compiled in, the
 * two-channel case is handed to it.
 */
static void dsp_util_deinterleave_s16le(int16_t *input, float *const *output,
					int channels, int frames)
{
	int frame, ch;

#ifdef deinterleave_stereo
	if (channels == 2) {
		deinterleave_stereo(input, output[0], output[1], frames);
		return;
	}
#endif

	/* Samples are consumed in order; frame N of channel C lands at
	 * output[C][N].
	 */
	for (frame = 0; frame < frames; frame++) {
		for (ch = 0; ch < channels; ch++) {
			output[ch][frame] = *input / 32768.0f;
			input++;
		}
	}
}
399
400
/* Splits interleaved 24-bit samples, each carried in a 32-bit word, into
 * per-channel float buffers.
 *
 * Each word is shifted left by 8 so that the low 24 bits become the top of
 * a 32-bit value (the original top byte is discarded), then divided by 2^31.
 *
 * input:    frames * channels interleaved 32-bit words
 * output:   channels pointers, each to a buffer of at least frames floats
 * channels: number of interleaved channels
 * frames:   number of frames to convert
 */
static void dsp_util_deinterleave_s24le(int32_t *input, float *const *output,
					int channels, int frames)
{
	int i, j;

	for (i = 0; i < frames; i++) {
		for (j = 0; j < channels; j++, input++) {
			/* Shift as unsigned: left-shifting a negative or
			 * overflowing signed value is undefined behaviour.
			 */
			int32_t sample = (int32_t)((uint32_t)*input << 8);

			output[j][i] = sample / 2147483648.0f;
		}
	}
}
415
/* Splits interleaved packed 24-bit little-endian samples (3 bytes each)
 * into per-channel float buffers.
 *
 * The three bytes are placed in the upper 24 bits of a 32-bit value (least
 * significant input byte lowest), the low 8 bits are left zero, and the
 * result is divided by 2^31.
 *
 * input:    frames * channels samples, 3 bytes each
 * output:   channels pointers, each to a buffer of at least frames floats
 * channels: number of interleaved channels
 * frames:   number of frames to convert
 */
static void dsp_util_deinterleave_s243le(uint8_t *input, float *const *output,
					 int channels, int frames)
{
	int i, j;

	for (i = 0; i < frames; i++) {
		for (j = 0; j < channels; j++, input += 3) {
			/* Assemble with shifts rather than copying into the
			 * object representation, so the result does not
			 * depend on the byte order of the host.
			 */
			uint32_t bits = ((uint32_t)input[0] << 8) |
					((uint32_t)input[1] << 16) |
					((uint32_t)input[2] << 24);
			int32_t sample = (int32_t)bits;

			output[j][i] = sample / 2147483648.0f;
		}
	}
}
433
/* Splits interleaved 32-bit samples into per-channel float buffers, dividing
 * each sample by 2^31.
 *
 * input:    frames * channels interleaved samples
 * output:   channels pointers, each to a buffer of at least frames floats
 * channels: number of interleaved channels
 * frames:   number of frames to convert
 */
static void dsp_util_deinterleave_s32le(int32_t *input, float *const *output,
					int channels, int frames)
{
	int frame, ch;

	/* Samples are consumed in order; frame N of channel C lands at
	 * output[C][N].
	 */
	for (frame = 0; frame < frames; frame++) {
		for (ch = 0; ch < channels; ch++) {
			output[ch][frame] = *input / 2147483648.0f;
			input++;
		}
	}
}
447
/* Converts interleaved integer samples in the given format into per-channel
 * float buffers by dispatching to the matching format-specific helper.
 *
 * input:    interleaved sample data, laid out according to format
 * output:   channels pointers to the destination float buffers
 * channels: number of interleaved channels
 * format:   one of S16_LE, S24_LE, S24_3LE or S32_LE
 * frames:   number of frames to convert
 *
 * Returns 0 on success. For any other format an error is logged through
 * syslog and -EINVAL is returned.
 */
int dsp_util_deinterleave(uint8_t *input, float *const *output, int channels,
			  snd_pcm_format_t format, int frames)
{
	int rc = 0;

	switch (format) {
	case SND_PCM_FORMAT_S16_LE:
		dsp_util_deinterleave_s16le((int16_t *)input, output,
					    channels, frames);
		break;
	case SND_PCM_FORMAT_S24_LE:
		dsp_util_deinterleave_s24le((int32_t *)input, output,
					    channels, frames);
		break;
	case SND_PCM_FORMAT_S24_3LE:
		dsp_util_deinterleave_s243le(input, output,
					     channels, frames);
		break;
	case SND_PCM_FORMAT_S32_LE:
		dsp_util_deinterleave_s32le((int32_t *)input, output,
					    channels, frames);
		break;
	default:
		syslog(LOG_ERR, "Invalid format to deinterleave");
		rc = -EINVAL;
		break;
	}

	return rc;
}
474
/* Converts per-channel float buffers into interleaved 16-bit samples.
 *
 * Each float is multiplied by 32768, rounded to nearest with ties away from
 * zero (add +/-0.5, then truncate) and clamped to -32768..32767.
 *
 * input:    channels pointers, each to a buffer of at least frames floats
 * output:   receives frames * channels interleaved samples
 * channels: number of channels to interleave
 * frames:   number of frames to convert
 *
 * When an interleave_stereo implementation has been compiled in, the
 * two-channel case is handed to it.
 */
static void dsp_util_interleave_s16le(float *const *input, int16_t *output,
				      int channels, int frames)
{
	int i, j;

#ifdef interleave_stereo
	if (channels == 2) {
		interleave_stereo(input[0], input[1], output, frames);
		return;
	}
#endif

	for (i = 0; i < frames; i++) {
		for (j = 0; j < channels; j++) {
			float f = input[j][i] * 32768.0f;

			f += (f >= 0) ? 0.5f : -0.5f;
			/* Clamp while still a float: converting a float
			 * that does not fit in the target integer type is
			 * undefined behaviour.
			 */
			if (f >= 32767.0f)
				*output++ = 32767;
			else if (f > -32768.0f)
				*output++ = (int16_t)(int)f;
			else /* too negative, or NaN */
				*output++ = -32768;
		}
	}
}
498
/* Converts per-channel float buffers into interleaved 24-bit samples, each
 * stored in a 32-bit word.
 *
 * Each float is multiplied by 2^31, rounded by adding +/-0.5 and truncating,
 * clamped to the int32_t range and then shifted right by 8 so that the
 * sample ends up in the low 24 bits of the output word.
 *
 * input:    channels pointers, each to a buffer of at least frames floats
 * output:   receives frames * channels 32-bit words
 * channels: number of channels to interleave
 * frames:   number of frames to convert
 */
static void dsp_util_interleave_s24le(float *const *input, int32_t *output,
				      int channels, int frames)
{
	int i, j;

	for (i = 0; i < frames; i++) {
		for (j = 0; j < channels; j++, output++) {
			float f = input[j][i] * 2147483648.0f;
			int32_t v;

			f += (f >= 0) ? 0.5f : -0.5f;
			/* (float)INT32_MAX rounds up to 2^31, which does
			 * not fit in int32_t, so full-scale positive input
			 * must be caught before the conversion.
			 */
			if (f >= 2147483648.0f)
				v = INT32_MAX;
			else if (f > -2147483648.0f)
				v = (int32_t)f;
			else /* too negative, or NaN */
				v = INT32_MIN;
			*output = v >> 8;
		}
	}
}
516
/* Converts per-channel float buffers into interleaved packed 24-bit
 * little-endian samples (3 bytes each).
 *
 * Each float is multiplied by 2^31, rounded by adding +/-0.5 and truncating,
 * and clamped to the int32_t range; the upper 24 bits of the result are then
 * written out, least significant byte first.
 *
 * input:    channels pointers, each to a buffer of at least frames floats
 * output:   receives frames * channels samples, 3 bytes each
 * channels: number of channels to interleave
 * frames:   number of frames to convert
 */
static void dsp_util_interleave_s243le(float *const *input, uint8_t *output,
				       int channels, int frames)
{
	int i, j;

	for (i = 0; i < frames; i++) {
		for (j = 0; j < channels; j++, output += 3) {
			float f = input[j][i] * 2147483648.0f;
			int32_t v;
			uint32_t bits;

			f += (f >= 0) ? 0.5f : -0.5f;
			/* (float)INT32_MAX rounds up to 2^31, which does
			 * not fit in int32_t, so full-scale positive input
			 * must be caught before the conversion.
			 */
			if (f >= 2147483648.0f)
				v = INT32_MAX;
			else if (f > -2147483648.0f)
				v = (int32_t)f;
			else /* too negative, or NaN */
				v = INT32_MIN;

			/* Store byte by byte so the layout does not depend
			 * on the byte order of the host.
			 */
			bits = (uint32_t)v;
			output[0] = (uint8_t)(bits >> 8);
			output[1] = (uint8_t)(bits >> 16);
			output[2] = (uint8_t)(bits >> 24);
		}
	}
}
536
/* Converts per-channel float buffers into interleaved 32-bit samples.
 *
 * Each float is multiplied by 2^31, rounded by adding +/-0.5 and truncating,
 * and clamped to the int32_t range.
 *
 * input:    channels pointers, each to a buffer of at least frames floats
 * output:   receives frames * channels interleaved samples
 * channels: number of channels to interleave
 * frames:   number of frames to convert
 */
static void dsp_util_interleave_s32le(float *const *input, int32_t *output,
				      int channels, int frames)
{
	int i, j;

	for (i = 0; i < frames; i++) {
		for (j = 0; j < channels; j++, output++) {
			float f = input[j][i] * 2147483648.0f;

			f += (f >= 0) ? 0.5f : -0.5f;
			/* (float)INT32_MAX rounds up to 2^31, which does
			 * not fit in int32_t, so full-scale positive input
			 * must be caught before the conversion.
			 */
			if (f >= 2147483648.0f)
				*output = INT32_MAX;
			else if (f > -2147483648.0f)
				*output = (int32_t)f;
			else /* too negative, or NaN */
				*output = INT32_MIN;
		}
	}
}
553
/* Converts per-channel float buffers into interleaved integer samples in the
 * given format by dispatching to the matching format-specific helper.
 *
 * input:    channels pointers to the source float buffers
 * output:   destination for the interleaved data, laid out per format
 * channels: number of channels to interleave
 * format:   one of S16_LE, S24_LE, S24_3LE or S32_LE
 * frames:   number of frames to convert
 *
 * Returns 0 on success. For any other format an error is logged through
 * syslog and -EINVAL is returned.
 */
int dsp_util_interleave(float *const *input, uint8_t *output, int channels,
			snd_pcm_format_t format, int frames)
{
	int rc = 0;

	switch (format) {
	case SND_PCM_FORMAT_S16_LE:
		dsp_util_interleave_s16le(input, (int16_t *)output,
					  channels, frames);
		break;
	case SND_PCM_FORMAT_S24_LE:
		dsp_util_interleave_s24le(input, (int32_t *)output,
					  channels, frames);
		break;
	case SND_PCM_FORMAT_S24_3LE:
		dsp_util_interleave_s243le(input, output, channels, frames);
		break;
	case SND_PCM_FORMAT_S32_LE:
		dsp_util_interleave_s32le(input, (int32_t *)output,
					  channels, frames);
		break;
	default:
		syslog(LOG_ERR, "Invalid format to interleave");
		rc = -EINVAL;
		break;
	}

	return rc;
}
579
/* Sets the floating point control bits that make the CPU treat denormal
 * numbers as zero, for the calling thread's FP control register.
 *
 * - x86: sets bits 15 and 6 (mask 0x8040) in MXCSR.
 * - AArch64: sets bit 24 (0x1000000) in FPCR.
 * - 32-bit ARM: sets bit 24 (0x1000000) in FPSCR.
 *
 * On any other architecture the function does nothing and a compile-time
 * warning is emitted.
 */
void dsp_enable_flush_denormal_to_zero(void)
{
#if defined(__i386__) || defined(__x86_64__)
	unsigned int mxcsr;
	mxcsr = __builtin_ia32_stmxcsr();
	__builtin_ia32_ldmxcsr(mxcsr | 0x8040);
#elif defined(__aarch64__)
	uint64_t cw;
	__asm__ __volatile__ (
		"mrs %0, fpcr \n"
		"orr %0, %0, #0x1000000 \n"
		"msr fpcr, %0 \n"
		"isb \n"
		: "=r"(cw) :: "memory");
#elif defined(__arm__)
	uint32_t cw;
	__asm__ __volatile__ (
		"vmrs %0, fpscr \n"
		"orr %0, %0, #0x1000000 \n"
		"vmsr fpscr, %0 \n"
		: "=r"(cw) :: "memory");
#else
#warning "Don't know how to disable denorms. Performance may suffer."
#endif
}
605