1 /* Copyright 2013 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6 #include <limits.h>
7 #include <syslog.h>
8
9 #include "dsp_util.h"
10
#ifndef max
/* Evaluates to the larger of a and b (b when they compare equal or are
 * unordered). Each argument is evaluated exactly once, so arguments with
 * side effects are safe. Relies on the GNU statement-expression and
 * __typeof__ extensions.
 */
#define max(a, b)                                          \
	({                                                 \
		__typeof__(a) max_lhs_ = (a);              \
		__typeof__(b) max_rhs_ = (b);              \
		max_rhs_ < max_lhs_ ? max_lhs_ : max_rhs_; \
	})
#endif
19
#ifndef min
/* Evaluates to the smaller of a and b (b when they compare equal or are
 * unordered). Each argument is evaluated exactly once, so arguments with
 * side effects are safe. Relies on the GNU statement-expression and
 * __typeof__ extensions.
 */
#define min(a, b)                                          \
	({                                                 \
		__typeof__(a) min_lhs_ = (a);              \
		__typeof__(b) min_rhs_ = (b);              \
		min_rhs_ > min_lhs_ ? min_lhs_ : min_rhs_; \
	})
#endif
28
29 #undef deinterleave_stereo
30 #undef interleave_stereo
31
32 /* Converts shorts in range of -32768 to 32767 to floats in range of
33 * -1.0f to 1.0f.
34 * scvtf instruction accepts fixed point ints, so sxtl is used to lengthen
35 * shorts to int with sign extension.
36 */
37 #ifdef __aarch64__
deinterleave_stereo(int16_t * input,float * output1,float * output2,int frames)38 static void deinterleave_stereo(int16_t *input, float *output1, float *output2,
39 int frames)
40 {
41 int chunk = frames >> 3;
42 frames &= 7;
43 /* Process 8 frames (16 samples) each loop. */
44 /* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
45 if (chunk) {
46 // clang-format off
47 __asm__ __volatile__(
48 "1: \n"
49 "ld2 {v2.8h, v3.8h}, [%[input]], #32 \n"
50 "subs %w[chunk], %w[chunk], #1 \n"
51 "sxtl v0.4s, v2.4h \n"
52 "sxtl2 v1.4s, v2.8h \n"
53 "sxtl v2.4s, v3.4h \n"
54 "sxtl2 v3.4s, v3.8h \n"
55 "scvtf v0.4s, v0.4s, #15 \n"
56 "scvtf v1.4s, v1.4s, #15 \n"
57 "scvtf v2.4s, v2.4s, #15 \n"
58 "scvtf v3.4s, v3.4s, #15 \n"
59 "st1 {v0.4s, v1.4s}, [%[output1]], #32 \n"
60 "st1 {v2.4s, v3.4s}, [%[output2]], #32 \n"
61 "b.ne 1b \n"
62 : /* output */
63 [chunk]"+r"(chunk),
64 [input]"+r"(input),
65 [output1]"+r"(output1),
66 [output2]"+r"(output2)
67 : /* input */
68 : /* clobber */
69 "v0", "v1", "v2", "v3", "memory", "cc");
70 // clang-format on
71 }
72
73 /* The remaining samples. */
74 while (frames--) {
75 *output1++ = *input++ / 32768.0f;
76 *output2++ = *input++ / 32768.0f;
77 }
78 }
79 #define deinterleave_stereo deinterleave_stereo
80
81 /* Converts floats in range of -1.0f to 1.0f to shorts in range of
82 * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away
83 * from zero.
84 * Rounding is achieved by using fcvtas instruction. (a = away)
85 * The float scaled to a range of -32768 to 32767 by adding 15 to the exponent.
86 * Add to exponent is equivalent to multiply for exponent range of 0 to 239,
87 * which is 2.59 * 10^33. A signed saturating add (sqadd) limits exponents
88 * from 240 to 255 to clamp to 255.
89 * For very large values, beyond +/- 2 billion, fcvtas will clamp the result
90 * to the min or max value that fits an int.
91 * For other values, sqxtn clamps the output to -32768 to 32767 range.
92 */
interleave_stereo(float * input1,float * input2,int16_t * output,int frames)93 static void interleave_stereo(float *input1, float *input2, int16_t *output,
94 int frames)
95 {
96 /* Process 4 frames (8 samples) each loop. */
97 /* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
98 int chunk = frames >> 2;
99 frames &= 3;
100
101 if (chunk) {
102 // clang-format off
103 __asm__ __volatile__(
104 "dup v2.4s, %w[scale] \n"
105 "1: \n"
106 "ld1 {v0.4s}, [%[input1]], #16 \n"
107 "ld1 {v1.4s}, [%[input2]], #16 \n"
108 "subs %w[chunk], %w[chunk], #1 \n"
109 "sqadd v0.4s, v0.4s, v2.4s \n"
110 "sqadd v1.4s, v1.4s, v2.4s \n"
111 "fcvtas v0.4s, v0.4s \n"
112 "fcvtas v1.4s, v1.4s \n"
113 "sqxtn v0.4h, v0.4s \n"
114 "sqxtn v1.4h, v1.4s \n"
115 "st2 {v0.4h, v1.4h}, [%[output]], #16 \n"
116 "b.ne 1b \n"
117 : /* output */
118 [chunk]"+r"(chunk),
119 [input1]"+r"(input1),
120 [input2]"+r"(input2),
121 [output]"+r"(output)
122 : /* input */
123 [scale]"r"(15 << 23)
124 : /* clobber */
125 "v0", "v1", "v2", "memory", "cc");
126 // clang-format on
127 }
128
129 /* The remaining samples */
130 while (frames--) {
131 float f;
132 f = *input1++ * 32768.0f;
133 f += (f >= 0) ? 0.5f : -0.5f;
134 *output++ = max(-32768, min(32767, (int)(f)));
135 f = *input2++ * 32768.0f;
136 f += (f >= 0) ? 0.5f : -0.5f;
137 *output++ = max(-32768, min(32767, (int)(f)));
138 }
139 }
140 #define interleave_stereo interleave_stereo
141 #endif
142
143 #ifdef __ARM_NEON__
144 #include <arm_neon.h>
145
deinterleave_stereo(int16_t * input,float * output1,float * output2,int frames)146 static void deinterleave_stereo(int16_t *input, float *output1, float *output2,
147 int frames)
148 {
149 /* Process 8 frames (16 samples) each loop. */
150 /* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
151 int chunk = frames >> 3;
152 frames &= 7;
153 if (chunk) {
154 // clang-format off
155 __asm__ __volatile__(
156 "1: \n"
157 "vld2.16 {d0-d3}, [%[input]]! \n"
158 "subs %[chunk], #1 \n"
159 "vmovl.s16 q3, d3 \n"
160 "vmovl.s16 q2, d2 \n"
161 "vmovl.s16 q1, d1 \n"
162 "vmovl.s16 q0, d0 \n"
163 "vcvt.f32.s32 q3, q3, #15 \n"
164 "vcvt.f32.s32 q2, q2, #15 \n"
165 "vcvt.f32.s32 q1, q1, #15 \n"
166 "vcvt.f32.s32 q0, q0, #15 \n"
167 "vst1.32 {d4-d7}, [%[output2]]! \n"
168 "vst1.32 {d0-d3}, [%[output1]]! \n"
169 "bne 1b \n"
170 : /* output */
171 [chunk]"+r"(chunk),
172 [input]"+r"(input),
173 [output1]"+r"(output1),
174 [output2]"+r"(output2)
175 : /* input */
176 : /* clobber */
177 "q0", "q1", "q2", "q3", "memory", "cc");
178 // clang-format on
179 }
180
181 /* The remaining samples. */
182 while (frames--) {
183 *output1++ = *input++ / 32768.0f;
184 *output2++ = *input++ / 32768.0f;
185 }
186 }
187 #define deinterleave_stereo deinterleave_stereo
188
/* ARM NEON (32-bit) variant.
 * Converts floats in the range -1.0f to 1.0f to shorts in the range -32768 to
 * 32767 with rounding to nearest, ties (0.5) rounding away from zero, and
 * interleaves `input1` and `input2` into `output`.
 *
 * Rounding is achieved by adding 0.5 or -0.5, pre-scaled for the fixed point
 * precision, and then converting float to fixed point with vcvt, which
 * truncates toward zero.
 * For very large values, beyond +/- 2 billion, vcvt clamps the result to the
 * min or max value that fits an int.
 * For other values, vqmovn clamps the output to the -32768 to 32767 range.
 */
static void interleave_stereo(float *input1, float *input2, int16_t *output,
			      int frames)
{
	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	float32x4_t half_up = vdupq_n_f32(0.5f / 32768.0f);
	float32x4_t half_down = vdupq_n_f32(-0.5f / 32768.0f);
	int blocks = frames >> 2;
	int leftover = frames & 3;
	int i;

	if (blocks != 0) {
		// clang-format off
		__asm__ __volatile__(
			"veor q0, q0, q0 \n"
			"1: \n"
			"vld1.32 {d2-d3}, [%[input1]]! \n"
			"vld1.32 {d4-d5}, [%[input2]]! \n"
			"subs %[chunk], #1 \n"
			/* We try to round to the nearest number by adding 0.5
			 * to positive input, and adding -0.5 to the negative
			 * input, then truncate.
			 */
			"vcgt.f32 q3, q1, q0 \n"
			"vcgt.f32 q4, q2, q0 \n"
			"vbsl q3, %q[pos], %q[neg] \n"
			"vbsl q4, %q[pos], %q[neg] \n"
			"vadd.f32 q1, q1, q3 \n"
			"vadd.f32 q2, q2, q4 \n"
			"vcvt.s32.f32 q1, q1, #15 \n"
			"vcvt.s32.f32 q2, q2, #15 \n"
			"vqmovn.s32 d2, q1 \n"
			"vqmovn.s32 d3, q2 \n"
			"vst2.16 {d2-d3}, [%[output]]! \n"
			"bne 1b \n"
			: /* output */
			  [chunk]"+r"(blocks),
			  [input1]"+r"(input1),
			  [input2]"+r"(input2),
			  [output]"+r"(output)
			: /* input */
			  [pos]"w"(half_up),
			  [neg]"w"(half_down)
			: /* clobber */
			  "q0", "q1", "q2", "q3", "q4", "memory", "cc");
		// clang-format on
	}

	/* The remaining 0-3 frames, one sample at a time. */
	for (i = 0; i < leftover; i++) {
		float scaled;
		int sample;

		scaled = input1[i] * 32768.0f;
		scaled += (scaled >= 0) ? 0.5f : -0.5f;
		sample = (int)scaled;
		if (sample > 32767)
			sample = 32767;
		if (sample < -32768)
			sample = -32768;
		*output++ = sample;

		scaled = input2[i] * 32768.0f;
		scaled += (scaled >= 0) ? 0.5f : -0.5f;
		sample = (int)scaled;
		if (sample > 32767)
			sample = 32767;
		if (sample < -32768)
			sample = -32768;
		*output++ = sample;
	}
}
257 #define interleave_stereo interleave_stereo
258 #endif
259
260 #ifdef __SSE3__
261 #include <emmintrin.h>
262
/* SSE3 variant.
 * Converts shorts in the range -32768 to 32767 to floats in the range -1.0f
 * to 1.0f, splitting interleaved stereo `input` into `output1` (first sample
 * of each frame) and `output2` (second sample of each frame).
 *
 * pslld and psrad isolate the low and the high 16-bit word of each 32-bit
 * frame, but leave them in different ranges:
 * - the low word ends up in the high bits, range 0x80000000 .. 0x7fff0000;
 * - the high word ends up in the low bits, range 0xffff8000 .. 0x00007fff
 *   (psrad sign-extends).
 * cvtdq2ps converts those ints to floats as is, and mulps normalizes both,
 * using a different factor for each word to compensate for the range
 * difference (2^-31 for the low word, 2^-15 for the high word).
 */
static void deinterleave_stereo(int16_t *input, float *output1, float *output2,
				int frames)
{
	const __m128 low_word_scale = _mm_set1_ps(1.0f / (1 << 15) / (1 << 16));
	const __m128 high_word_scale = _mm_set1_ps(1.0f / (1 << 15));
	/* Process 8 frames (16 samples) each loop; the 0-7 frames that do not
	 * fill a pass are left to the scalar loop below.
	 */
	int blocks = frames >> 3;
	int leftover = frames & 7;
	int i;

	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
	if (blocks != 0) {
		/* The pointers are read-write operands advanced by the
		 * explicit adds, so on exit they point just past the data
		 * handled here.
		 */
		// clang-format off
		__asm__ __volatile__(
			"1: \n"
			"lddqu (%[input]), %%xmm0 \n"
			"lddqu 16(%[input]), %%xmm1 \n"
			"add $32, %[input] \n"
			"movdqa %%xmm0, %%xmm2 \n"
			"movdqa %%xmm1, %%xmm3 \n"
			"pslld $16, %%xmm0 \n"
			"pslld $16, %%xmm1 \n"
			"psrad $16, %%xmm2 \n"
			"psrad $16, %%xmm3 \n"
			"cvtdq2ps %%xmm0, %%xmm0 \n"
			"cvtdq2ps %%xmm1, %%xmm1 \n"
			"cvtdq2ps %%xmm2, %%xmm2 \n"
			"cvtdq2ps %%xmm3, %%xmm3 \n"
			"mulps %[scale_2_n31], %%xmm0 \n"
			"mulps %[scale_2_n31], %%xmm1 \n"
			"mulps %[scale_2_n15], %%xmm2 \n"
			"mulps %[scale_2_n15], %%xmm3 \n"
			"movdqu %%xmm0, (%[output1]) \n"
			"movdqu %%xmm1, 16(%[output1]) \n"
			"movdqu %%xmm2, (%[output2]) \n"
			"movdqu %%xmm3, 16(%[output2]) \n"
			"add $32, %[output1] \n"
			"add $32, %[output2] \n"
			"sub $1, %[chunk] \n"
			"jnz 1b \n"
			: /* output */
			  [chunk]"+r"(blocks),
			  [input]"+r"(input),
			  [output1]"+r"(output1),
			  [output2]"+r"(output2)
			: /* input */
			  [scale_2_n31]"x"(low_word_scale),
			  [scale_2_n15]"x"(high_word_scale)
			: /* clobber */
			  "xmm0", "xmm1", "xmm2", "xmm3", "memory", "cc");
		// clang-format on
	}

	/* Scalar conversion of the frames the vector loop did not cover. */
	for (i = 0; i < leftover; i++) {
		output1[i] = input[2 * i] / 32768.0f;
		output2[i] = input[2 * i + 1] / 32768.0f;
	}
}
328 #define deinterleave_stereo deinterleave_stereo
329
/* SSE3 variant.
 * Converts floats in the range -1.0f to 1.0f to shorts in the range -32768 to
 * 32767 and interleaves `input1` and `input2` into `output`.
 *
 * Vector path (4 frames per pass):
 * - unpcklps/unpckhps interleave the two channels while still in float form.
 * - paddsw adds 15 to the exponent field of every float (a multiply by
 *   32768); the add saturates on the 16-bit word holding the exponent.
 * - cvtps2dq rounds using the current MXCSR rounding mode, which is round to
 *   nearest with ties (0.5) to even unless the caller changed it.
 * - For very large values, beyond +/- 2 billion, cvtps2dq produces
 *   0x80000000 and packssdw clamps that to -32768; other values are clamped
 *   to the -32768 to 32767 range by packssdw.
 *
 * NOTE: the scalar tail rounds ties away from zero (adds +/-0.5, then
 * truncates), so an exact .5 tie can round differently in the tail than in
 * the vector path.
 */
static void interleave_stereo(float *input1, float *input2, int16_t *output,
			      int frames)
{
	/* 15 placed at the float exponent position (bit 23 upwards). */
	const __m128i exponent_plus_15 = _mm_set1_epi32(15 << 23);
	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	int blocks = frames >> 2;
	int leftover = frames & 3;
	int i;

	if (blocks != 0) {
		// clang-format off
		__asm__ __volatile__(
			"1: \n"
			"lddqu (%[input1]), %%xmm0 \n"
			"lddqu (%[input2]), %%xmm2 \n"
			"add $16, %[input1] \n"
			"add $16, %[input2] \n"
			"movaps %%xmm0, %%xmm1 \n"
			"unpcklps %%xmm2, %%xmm0 \n"
			"unpckhps %%xmm2, %%xmm1 \n"
			"paddsw %[scale_2_15], %%xmm0 \n"
			"paddsw %[scale_2_15], %%xmm1 \n"
			"cvtps2dq %%xmm0, %%xmm0 \n"
			"cvtps2dq %%xmm1, %%xmm1 \n"
			"packssdw %%xmm1, %%xmm0 \n"
			"movdqu %%xmm0, (%[output]) \n"
			"add $16, %[output] \n"
			"sub $1, %[chunk] \n"
			"jnz 1b \n"
			: /* output */
			  [chunk]"+r"(blocks),
			  [input1]"+r"(input1),
			  [input2]"+r"(input2),
			  [output]"+r"(output)
			: /* input */
			  [scale_2_15]"x"(exponent_plus_15)
			: /* clobber */
			  "xmm0", "xmm1", "xmm2", "memory", "cc");
		// clang-format on
	}

	/* The remaining 0-3 frames, one sample at a time. */
	for (i = 0; i < leftover; i++) {
		float scaled;
		int sample;

		scaled = input1[i] * 32768.0f;
		scaled += (scaled >= 0) ? 0.5f : -0.5f;
		sample = (int)scaled;
		if (sample > 32767)
			sample = 32767;
		if (sample < -32768)
			sample = -32768;
		*output++ = sample;

		scaled = input2[i] * 32768.0f;
		scaled += (scaled >= 0) ? 0.5f : -0.5f;
		sample = (int)scaled;
		if (sample > 32767)
			sample = 32767;
		if (sample < -32768)
			sample = -32768;
		*output++ = sample;
	}
}
388 #define interleave_stereo interleave_stereo
389 #endif
390
/* Splits interleaved signed 16-bit samples into per-channel float buffers.
 *
 * Args:
 *    input - Interleaved samples, `channels` samples per frame.
 *    output - Array of `channels` pointers; output[c] receives `frames`
 *        floats for channel c, each sample divided by 32768.
 *    channels - Number of channels. Nothing is written when <= 0.
 *    frames - Number of frames. Nothing is written when <= 0.
 *
 * When a platform-specific deinterleave_stereo() is available it is used for
 * the two-channel case.
 */
static void dsp_util_deinterleave_s16le(int16_t *input, float *const *output,
					int channels, int frames)
{
	int frame, ch;

#ifdef deinterleave_stereo
	if (channels == 2) {
		deinterleave_stereo(input, output[0], output[1], frames);
		return;
	}
#endif

	/* The channel buffers are indexed directly, so no per-call scratch
	 * array sized by `channels` is needed.
	 */
	for (frame = 0; frame < frames; frame++)
		for (ch = 0; ch < channels; ch++)
			output[ch][frame] = *input++ / 32768.0f;
}
411
/* Splits interleaved 24-bit samples, each stored in the low three bytes of a
 * 32-bit word, into per-channel float buffers.
 *
 * Args:
 *    input - Interleaved samples, `channels` words per frame. The top byte of
 *        each word is discarded.
 *    output - Array of `channels` pointers; output[c] receives `frames`
 *        floats for channel c.
 *    channels - Number of channels. Nothing is written when <= 0.
 *    frames - Number of frames. Nothing is written when <= 0.
 */
static void dsp_util_deinterleave_s24le(int32_t *input, float *const *output,
					int channels, int frames)
{
	int frame, ch;

	for (frame = 0; frame < frames; frame++)
		for (ch = 0; ch < channels; ch++, input++) {
			/* Shift in unsigned arithmetic: left-shifting a
			 * negative or overflowing signed value is undefined.
			 * The shift moves the 24-bit sign bit into bit 31, so
			 * reinterpreting as int32_t restores the sign.
			 */
			uint32_t shifted = (uint32_t)*input << 8;

			output[ch][frame] = (int32_t)shifted / 2147483648.0f;
		}
}
425
/* Splits interleaved packed 24-bit little-endian samples (three bytes each)
 * into per-channel float buffers.
 *
 * Args:
 *    input - Interleaved samples, 3 bytes per sample, least significant byte
 *        first, `channels` samples per frame.
 *    output - Array of `channels` pointers; output[c] receives `frames`
 *        floats for channel c.
 *    channels - Number of channels. Nothing is written when <= 0.
 *    frames - Number of frames. Nothing is written when <= 0.
 */
static void dsp_util_deinterleave_s243le(uint8_t *input, float *const *output,
					 int channels, int frames)
{
	int frame, ch;

	for (frame = 0; frame < frames; frame++)
		for (ch = 0; ch < channels; ch++, input += 3) {
			/* Place the three bytes in the upper 24 bits of a
			 * 32-bit word (low byte zero) so the sample's sign bit
			 * lands in bit 31. Built with shifts so the result
			 * does not depend on the host byte order.
			 */
			uint32_t bits = ((uint32_t)input[0] << 8) |
					((uint32_t)input[1] << 16) |
					((uint32_t)input[2] << 24);

			output[ch][frame] = (int32_t)bits / 2147483648.0f;
		}
}
443
/* Splits interleaved signed 32-bit samples into per-channel float buffers.
 *
 * Args:
 *    input - Interleaved samples, `channels` samples per frame.
 *    output - Array of `channels` pointers; output[c] receives `frames`
 *        floats for channel c, each sample divided by 2^31.
 *    channels - Number of channels. Nothing is written when <= 0.
 *    frames - Number of frames. Nothing is written when <= 0.
 */
static void dsp_util_deinterleave_s32le(int32_t *input, float *const *output,
					int channels, int frames)
{
	int frame, ch;

	/* The channel buffers are indexed directly, so no per-call scratch
	 * array sized by `channels` is needed.
	 */
	for (frame = 0; frame < frames; frame++)
		for (ch = 0; ch < channels; ch++)
			output[ch][frame] = *input++ / 2147483648.0f;
}
457
/* Converts interleaved integer PCM samples into per-channel float buffers by
 * dispatching on the sample format.
 *
 * Args:
 *    input - The interleaved input buffer.
 *    output - Array of `channels` pointers, one float buffer per channel.
 *    channels - Number of channels.
 *    format - Sample format of `input`; S16_LE, S24_LE, S24_3LE and S32_LE
 *        are handled.
 *    frames - Number of frames to convert.
 * Returns:
 *    0 on success, or -EINVAL (after logging an error) for any other format.
 */
int dsp_util_deinterleave(uint8_t *input, float *const *output, int channels,
			  snd_pcm_format_t format, int frames)
{
	if (format == SND_PCM_FORMAT_S16_LE) {
		dsp_util_deinterleave_s16le((int16_t *)input, output, channels,
					    frames);
	} else if (format == SND_PCM_FORMAT_S24_LE) {
		dsp_util_deinterleave_s24le((int32_t *)input, output, channels,
					    frames);
	} else if (format == SND_PCM_FORMAT_S24_3LE) {
		dsp_util_deinterleave_s243le(input, output, channels, frames);
	} else if (format == SND_PCM_FORMAT_S32_LE) {
		dsp_util_deinterleave_s32le((int32_t *)input, output, channels,
					    frames);
	} else {
		syslog(LOG_ERR, "Invalid format to deinterleave");
		return -EINVAL;
	}

	return 0;
}
483
/* Converts per-channel float buffers to interleaved signed 16-bit samples.
 *
 * Each sample is multiplied by 32768, rounded to nearest with ties away from
 * zero (add +/-0.5, then truncate) and saturated to -32768..32767.
 *
 * Args:
 *    input - Array of `channels` pointers; input[c] holds `frames` floats.
 *    output - Receives frames * channels interleaved samples.
 *    channels - Number of channels. Nothing is written when <= 0.
 *    frames - Number of frames. Nothing is written when <= 0.
 *
 * When a platform-specific interleave_stereo() is available it is used for
 * the two-channel case.
 */
static void dsp_util_interleave_s16le(float *const *input, int16_t *output,
				      int channels, int frames)
{
	int frame, ch;

#ifdef interleave_stereo
	if (channels == 2) {
		interleave_stereo(input[0], input[1], output, frames);
		return;
	}
#endif

	for (frame = 0; frame < frames; frame++)
		for (ch = 0; ch < channels; ch++) {
			float f = input[ch][frame] * 32768.0f;

			f += (f >= 0) ? 0.5f : -0.5f;
			/* Saturate while still in float: converting a float
			 * outside the integer range is undefined, so the
			 * cast is only reached for in-range values. Anything
			 * that is not inside the range (NaN included) takes
			 * the final branch.
			 */
			if (f >= 32767.0f)
				*output++ = 32767;
			else if (f > -32768.0f)
				*output++ = (int16_t)f;
			else
				*output++ = -32768;
		}
}
507
/* Converts per-channel float buffers to interleaved 24-bit samples, each
 * stored in the low three bytes of a 32-bit word (top byte zero).
 *
 * Each sample is multiplied by 2^31, nudged by +/-0.5, saturated to the
 * 32-bit range, and then reduced to its upper 24 bits.
 *
 * Args:
 *    input - Array of `channels` pointers; input[c] holds `frames` floats.
 *    output - Receives frames * channels interleaved words.
 *    channels - Number of channels. Nothing is written when <= 0.
 *    frames - Number of frames. Nothing is written when <= 0.
 */
static void dsp_util_interleave_s24le(float *const *input, int32_t *output,
				      int channels, int frames)
{
	int frame, ch;

	for (frame = 0; frame < frames; frame++)
		for (ch = 0; ch < channels; ch++, output++) {
			float f = input[ch][frame] * 2147483648.0f;
			int32_t sample;

			f += (f >= 0) ? 0.5f : -0.5f;
			/* (float)INT_MAX rounds up to 2^31, which does not
			 * fit in an int, so saturate explicitly and only
			 * convert values strictly inside the range. Anything
			 * else (NaN included) takes the final branch.
			 */
			if (f >= 2147483648.0f)
				sample = INT_MAX;
			else if (f > -2147483648.0f)
				sample = (int32_t)f;
			else
				sample = INT_MIN;

			/* Unsigned shift: the mask keeps bits 8..31 of the
			 * sample either way, without relying on how a
			 * negative value is shifted right.
			 */
			*output = (int32_t)(((uint32_t)sample >> 8) &
					    0x00ffffff);
		}
}
525
/* Converts per-channel float buffers to interleaved packed 24-bit
 * little-endian samples (three bytes each, least significant byte first).
 *
 * Each sample is multiplied by 2^31, nudged by +/-0.5, saturated to the
 * 32-bit range, and its upper 24 bits are stored.
 *
 * Args:
 *    input - Array of `channels` pointers; input[c] holds `frames` floats.
 *    output - Receives frames * channels * 3 bytes.
 *    channels - Number of channels. Nothing is written when <= 0.
 *    frames - Number of frames. Nothing is written when <= 0.
 */
static void dsp_util_interleave_s243le(float *const *input, uint8_t *output,
				       int channels, int frames)
{
	int frame, ch;

	for (frame = 0; frame < frames; frame++)
		for (ch = 0; ch < channels; ch++, output += 3) {
			float f = input[ch][frame] * 2147483648.0f;
			int32_t sample;
			uint32_t bits;

			f += (f >= 0) ? 0.5f : -0.5f;
			/* (float)INT_MAX rounds up to 2^31, which does not
			 * fit in an int, so saturate explicitly and only
			 * convert values strictly inside the range. Anything
			 * else (NaN included) takes the final branch.
			 */
			if (f >= 2147483648.0f)
				sample = INT_MAX;
			else if (f > -2147483648.0f)
				sample = (int32_t)f;
			else
				sample = INT_MIN;

			/* Emit bits 8..31, low byte first, independent of the
			 * host byte order.
			 */
			bits = (uint32_t)sample;
			output[0] = (uint8_t)(bits >> 8);
			output[1] = (uint8_t)(bits >> 16);
			output[2] = (uint8_t)(bits >> 24);
		}
}
545
/* Converts per-channel float buffers to interleaved signed 32-bit samples.
 *
 * Each sample is multiplied by 2^31, nudged by +/-0.5 and saturated to the
 * 32-bit range.
 *
 * Args:
 *    input - Array of `channels` pointers; input[c] holds `frames` floats.
 *    output - Receives frames * channels interleaved samples.
 *    channels - Number of channels. Nothing is written when <= 0.
 *    frames - Number of frames. Nothing is written when <= 0.
 */
static void dsp_util_interleave_s32le(float *const *input, int32_t *output,
				      int channels, int frames)
{
	int frame, ch;

	for (frame = 0; frame < frames; frame++)
		for (ch = 0; ch < channels; ch++, output++) {
			float f = input[ch][frame] * 2147483648.0f;

			f += (f >= 0) ? 0.5f : -0.5f;
			/* (float)INT_MAX rounds up to 2^31, which does not
			 * fit in an int, so saturate explicitly and only
			 * convert values strictly inside the range. Anything
			 * else (NaN included) takes the final branch.
			 */
			if (f >= 2147483648.0f)
				*output = INT_MAX;
			else if (f > -2147483648.0f)
				*output = (int32_t)f;
			else
				*output = INT_MIN;
		}
}
562
/* Converts per-channel float buffers into interleaved integer PCM samples by
 * dispatching on the sample format.
 *
 * Args:
 *    input - Array of `channels` pointers, one float buffer per channel.
 *    output - The interleaved output buffer.
 *    channels - Number of channels.
 *    format - Sample format to produce; S16_LE, S24_LE, S24_3LE and S32_LE
 *        are handled.
 *    frames - Number of frames to convert.
 * Returns:
 *    0 on success, or -EINVAL (after logging an error) for any other format.
 */
int dsp_util_interleave(float *const *input, uint8_t *output, int channels,
			snd_pcm_format_t format, int frames)
{
	if (format == SND_PCM_FORMAT_S16_LE) {
		dsp_util_interleave_s16le(input, (int16_t *)output, channels,
					  frames);
	} else if (format == SND_PCM_FORMAT_S24_LE) {
		dsp_util_interleave_s24le(input, (int32_t *)output, channels,
					  frames);
	} else if (format == SND_PCM_FORMAT_S24_3LE) {
		dsp_util_interleave_s243le(input, output, channels, frames);
	} else if (format == SND_PCM_FORMAT_S32_LE) {
		dsp_util_interleave_s32le(input, (int32_t *)output, channels,
					  frames);
	} else {
		syslog(LOG_ERR, "Invalid format to interleave");
		return -EINVAL;
	}

	return 0;
}
588
/* Turns on flush-to-zero handling of denormal floating point numbers by
 * setting the relevant bit(s) in the floating point control register read on
 * the calling thread. On architectures other than x86, AArch64 and ARM this
 * compiles to a no-op and emits a build-time warning.
 */
void dsp_enable_flush_denormal_to_zero(void)
{
#if defined(__i386__) || defined(__x86_64__)
	/* MXCSR: 0x8000 is flush-to-zero (denormal results become zero) and
	 * 0x0040 is denormals-are-zero (denormal inputs are read as zero).
	 */
	unsigned int mxcsr;
	mxcsr = __builtin_ia32_stmxcsr();
	__builtin_ia32_ldmxcsr(mxcsr | 0x8040);
#elif defined(__aarch64__)
	/* FPCR bit 24 (0x1000000) is the flush-to-zero control. */
	uint64_t cw;
	__asm__ __volatile__("mrs %0, fpcr \n"
			     "orr %0, %0, #0x1000000 \n"
			     "msr fpcr, %0 \n"
			     "isb \n"
			     : "=r"(cw)::"memory");
#elif defined(__arm__)
	/* FPSCR bit 24 (0x1000000) is the flush-to-zero control. */
	uint32_t cw;
	__asm__ __volatile__("vmrs %0, fpscr \n"
			     "orr %0, %0, #0x1000000 \n"
			     "vmsr fpscr, %0 \n"
			     : "=r"(cw)::"memory");
#else
#warning "Don't know how to disable denorms. Performance may suffer."
#endif
}
612