1 /*-------------------------------------------------------------------------
2 * drawElements Base Portability Library
3 * -------------------------------------
4 *
5 * Copyright 2014 The Android Open Source Project
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 *
19 *//*!
20 * \file
21 * \brief 16-bit floating-point math.
22 *//*--------------------------------------------------------------------*/
23
24 #include "deFloat16.h"
25
26 DE_BEGIN_EXTERN_C
27
deFloat32To16(float val32)28 deFloat16 deFloat32To16 (float val32)
29 {
30 deUint32 sign;
31 int expotent;
32 deUint32 mantissa;
33 union
34 {
35 float f;
36 deUint32 u;
37 } x;
38
39 x.f = val32;
40 sign = (x.u >> 16u) & 0x00008000u;
41 expotent = (int)((x.u >> 23u) & 0x000000ffu) - (127 - 15);
42 mantissa = x.u & 0x007fffffu;
43
44 if (expotent <= 0)
45 {
46 if (expotent < -10)
47 {
48 /* Rounds to zero. */
49 return (deFloat16) sign;
50 }
51
52 /* Converted to denormalized half, add leading 1 to significand. */
53 mantissa = mantissa | 0x00800000u;
54
55 /* Round mantissa to nearest (10+e) */
56 {
57 deUint32 t = 14u - expotent;
58 deUint32 a = (1u << (t - 1u)) - 1u;
59 deUint32 b = (mantissa >> t) & 1u;
60
61 mantissa = (mantissa + a + b) >> t;
62 }
63
64 return (deFloat16) (sign | mantissa);
65 }
66 else if (expotent == 0xff - (127 - 15))
67 {
68 if (mantissa == 0u)
69 {
70 /* InF */
71 return (deFloat16) (sign | 0x7c00u);
72 }
73 else
74 {
75 /* NaN */
76 mantissa >>= 13u;
77 return (deFloat16) (sign | 0x7c00u | mantissa | (mantissa == 0u));
78 }
79 }
80 else
81 {
82 /* Normalized float. */
83 mantissa = mantissa + 0x00000fffu + ((mantissa >> 13u) & 1u);
84
85 if (mantissa & 0x00800000u)
86 {
87 /* Overflow in mantissa. */
88 mantissa = 0u;
89 expotent += 1;
90 }
91
92 if (expotent > 30)
93 {
94 /* \todo [pyry] Cause hw fp overflow */
95 return (deFloat16) (sign | 0x7c00u);
96 }
97
98 return (deFloat16) (sign | ((deUint32)expotent << 10u) | (mantissa >> 13u));
99 }
100 }
101
deFloat64To16(double val64)102 deFloat16 deFloat64To16 (double val64)
103 {
104 deUint64 sign;
105 long expotent;
106 deUint64 mantissa;
107 union
108 {
109 double f;
110 deUint64 u;
111 } x;
112
113 x.f = val64;
114 sign = (x.u >> 48u) & 0x00008000u;
115 expotent = (long int)((x.u >> 52u) & 0x000007ffu) - (1023 - 15);
116 mantissa = x.u & 0x00fffffffffffffu;
117
118 if (expotent <= 0)
119 {
120 if (expotent < -10)
121 {
122 /* Rounds to zero. */
123 return (deFloat16) sign;
124 }
125
126 /* Converted to denormalized half, add leading 1 to significand. */
127 mantissa = mantissa | 0x0010000000000000u;
128
129 /* Round mantissa to nearest (10+e) */
130 {
131 deUint64 t = 43u - expotent;
132 deUint64 a = (1u << (t - 1u)) - 1u;
133 deUint64 b = (mantissa >> t) & 1u;
134
135 mantissa = (mantissa + a + b) >> t;
136 }
137
138 return (deFloat16) (sign | mantissa);
139 }
140 else if (expotent == 0x7ff - (1023 - 15))
141 {
142 if (mantissa == 0u)
143 {
144 /* InF */
145 return (deFloat16) (sign | 0x7c00u);
146 }
147 else
148 {
149 /* NaN */
150 mantissa >>= 42u;
151 return (deFloat16) (sign | 0x7c00u | mantissa | (mantissa == 0u));
152 }
153 }
154 else
155 {
156 /* Normalized float. */
157 mantissa = mantissa + 0x000001ffffffffffu + ((mantissa >> 42u) & 1u);
158
159 if (mantissa & 0x010000000000000u)
160 {
161 /* Overflow in mantissa. */
162 mantissa = 0u;
163 expotent += 1;
164 }
165
166 if (expotent > 30)
167 {
168 return (deFloat16) (sign | 0x7c00u);
169 }
170
171 return (deFloat16) (sign | ((deUint32)expotent << 10u) | (mantissa >> 13u));
172 }
173 }
174
175 /*--------------------------------------------------------------------*//*!
176 * \brief Round the given number `val` to nearest even by discarding
177 * the last `numBitsToDiscard` bits.
178 * \param val value to round
179 * \param numBitsToDiscard number of (least significant) bits to discard
180 * \return The rounded value with the last `numBitsToDiscard` removed
181 *//*--------------------------------------------------------------------*/
roundToNearestEven(deUint32 val,const deUint32 numBitsToDiscard)182 static deUint32 roundToNearestEven (deUint32 val, const deUint32 numBitsToDiscard)
183 {
184 const deUint32 lastBits = val & ((1 << numBitsToDiscard) - 1);
185 const deUint32 headBit = val & (1 << (numBitsToDiscard - 1));
186
187 DE_ASSERT(numBitsToDiscard > 0 && numBitsToDiscard < 32); /* Make sure no overflow. */
188 val >>= numBitsToDiscard;
189
190 if (headBit == 0)
191 {
192 return val;
193 }
194 else if (headBit == lastBits)
195 {
196 if ((val & 0x1) == 0x1)
197 {
198 return val + 1;
199 }
200 else
201 {
202 return val;
203 }
204 }
205 else
206 {
207 return val + 1;
208 }
209 }
210
deFloat32To16Round(float val32,deRoundingMode mode)211 deFloat16 deFloat32To16Round (float val32, deRoundingMode mode)
212 {
213 union
214 {
215 float f; /* Interpret as 32-bit float */
216 deUint32 u; /* Interpret as 32-bit unsigned integer */
217 } x;
218 deUint32 sign; /* sign : 0000 0000 0000 0000 X000 0000 0000 0000 */
219 deUint32 exp32; /* exp32: biased exponent for 32-bit floats */
220 int exp16; /* exp16: biased exponent for 16-bit floats */
221 deUint32 mantissa;
222
223 /* We only support these two rounding modes for now */
224 DE_ASSERT(mode == DE_ROUNDINGMODE_TO_ZERO || mode == DE_ROUNDINGMODE_TO_NEAREST_EVEN);
225
226 x.f = val32;
227 sign = (x.u >> 16u) & 0x00008000u;
228 exp32 = (x.u >> 23u) & 0x000000ffu;
229 exp16 = (int) (exp32) - 127 + 15; /* 15/127: exponent bias for 16-bit/32-bit floats */
230 mantissa = x.u & 0x007fffffu;
231
232 /* Case: zero and denormalized floats */
233 if (exp32 == 0)
234 {
235 /* Denormalized floats are < 2^(1-127), not representable in 16-bit floats, rounding to zero. */
236 return (deFloat16) sign;
237 }
238 /* Case: Inf and NaN */
239 else if (exp32 == 0x000000ffu)
240 {
241 if (mantissa == 0u)
242 {
243 /* Inf */
244 return (deFloat16) (sign | 0x7c00u);
245 }
246 else
247 {
248 /* NaN */
249 mantissa >>= 13u; /* 16-bit floats has 10-bit for mantissa, 13-bit less than 32-bit floats. */
250 /* Make sure we don't turn NaN into zero by | (mantissa == 0). */
251 return (deFloat16) (sign | 0x7c00u | mantissa | (mantissa == 0u));
252 }
253 }
254 /* The following are cases for normalized floats.
255 *
256 * * If exp16 is less than 0, we are experiencing underflow for the exponent. To encode this underflowed exponent,
257 * we can only shift the mantissa further right.
258 * The real exponent is exp16 - 15. A denormalized 16-bit float can represent -14 via its exponent.
259 * Note that the most significant bit in the mantissa of a denormalized float is already -1 as for exponent.
260 * So, we just need to right shift the mantissa -exp16 bits.
261 * * If exp16 is 0, mantissa shifting requirement is similar to the above.
262 * * If exp16 is greater than 30 (0b11110), we are experiencing overflow for the exponent of 16-bit normalized floats.
263 */
264 /* Case: normalized floats -> zero */
265 else if (exp16 < -10)
266 {
267 /* 16-bit floats have only 10 bits for mantissa. Minimal 16-bit denormalized float is (2^-10) * (2^-14). */
268 /* Expecting a number < (2^-10) * (2^-14) here, not representable, round to zero. */
269 return (deFloat16) sign;
270 }
271 /* Case: normalized floats -> zero and denormalized halfs */
272 else if (exp16 <= 0)
273 {
274 /* Add the implicit leading 1 in mormalized float to mantissa. */
275 mantissa |= 0x00800000u;
276 /* We have a (23 + 1)-bit mantissa, but 16-bit floats only expect 10-bit mantissa.
277 * Need to discard the last 14-bits considering rounding mode.
278 * We also need to shift right -exp16 bits to encode the underflowed exponent.
279 */
280 if (mode == DE_ROUNDINGMODE_TO_ZERO)
281 {
282 mantissa >>= (14 - exp16);
283 }
284 else
285 {
286 /* mantissa in the above may exceed 10-bits, in which case overflow happens.
287 * The overflowed bit is automatically carried to exponent then.
288 */
289 mantissa = roundToNearestEven(mantissa, 14 - exp16);
290 }
291 return (deFloat16) (sign | mantissa);
292 }
293 /* Case: normalized floats -> normalized floats */
294 else if (exp16 <= 30)
295 {
296 if (mode == DE_ROUNDINGMODE_TO_ZERO)
297 {
298 return (deFloat16) (sign | ((deUint32)exp16 << 10u) | (mantissa >> 13u));
299 }
300 else
301 {
302 mantissa = roundToNearestEven(mantissa, 13);
303 /* Handle overflow. exp16 may overflow (and become Inf) itself, but that's correct. */
304 exp16 = (exp16 << 10u) + (mantissa & (1 << 10));
305 mantissa &= (1u << 10) - 1;
306 return (deFloat16) (sign | ((deUint32) exp16) | mantissa);
307 }
308 }
309 /* Case: normalized floats (too large to be representable as 16-bit floats) */
310 else
311 {
312 /* According to IEEE Std 754-2008 Section 7.4,
313 * * roundTiesToEven and roundTiesToAway carry all overflows to Inf with the sign
314 * of the intermediate result.
315 * * roundTowardZero carries all overflows to the format’s largest finite number
316 * with the sign of the intermediate result.
317 */
318 if (mode == DE_ROUNDINGMODE_TO_ZERO)
319 {
320 return (deFloat16) (sign | 0x7bffu); /* 111 1011 1111 1111 */
321 }
322 else
323 {
324 return (deFloat16) (sign | (0x1f << 10));
325 }
326 }
327
328 /* Make compiler happy */
329 return (deFloat16) 0;
330 }
331
332 /*--------------------------------------------------------------------*//*!
333 * \brief Round the given number `val` to nearest even by discarding
334 * the last `numBitsToDiscard` bits.
335 * \param val value to round
336 * \param numBitsToDiscard number of (least significant) bits to discard
337 * \return The rounded value with the last `numBitsToDiscard` removed
338 *//*--------------------------------------------------------------------*/
roundToNearestEven64(deUint64 val,const deUint64 numBitsToDiscard)339 static deUint64 roundToNearestEven64 (deUint64 val, const deUint64 numBitsToDiscard)
340 {
341 const deUint64 lastBits = val & (((deUint64)1 << numBitsToDiscard) - 1);
342 const deUint64 headBit = val & ((deUint64)1 << (numBitsToDiscard - 1));
343
344 DE_ASSERT(numBitsToDiscard > 0 && numBitsToDiscard < 64); /* Make sure no overflow. */
345 val >>= numBitsToDiscard;
346
347 if (headBit == 0)
348 {
349 return val;
350 }
351 else if (headBit == lastBits)
352 {
353 if ((val & 0x1) == 0x1)
354 {
355 return val + 1;
356 }
357 else
358 {
359 return val;
360 }
361 }
362 else
363 {
364 return val + 1;
365 }
366 }
367
deFloat64To16Round(double val64,deRoundingMode mode)368 deFloat16 deFloat64To16Round (double val64, deRoundingMode mode)
369 {
370 union
371 {
372 double f; /* Interpret as 64-bit float */
373 deUint64 u; /* Interpret as 64-bit unsigned integer */
374 } x;
375 deUint64 sign; /* sign : 0000 0000 0000 0000 X000 0000 0000 0000 */
376 deUint64 exp64; /* exp32: biased exponent for 64-bit floats */
377 int exp16; /* exp16: biased exponent for 16-bit floats */
378 deUint64 mantissa;
379
380 /* We only support these two rounding modes for now */
381 DE_ASSERT(mode == DE_ROUNDINGMODE_TO_ZERO || mode == DE_ROUNDINGMODE_TO_NEAREST_EVEN);
382
383 x.f = val64;
384 sign = (x.u >> 48u) & 0x00008000u;
385 exp64 = (x.u >> 52u) & 0x000007ffu;
386 exp16 = (int) (exp64) - 1023 + 15; /* 15/127: exponent bias for 16-bit/32-bit floats */
387 mantissa = x.u & 0x00fffffffffffffu;
388
389 /* Case: zero and denormalized floats */
390 if (exp64 == 0)
391 {
392 /* Denormalized floats are < 2^(1-1023), not representable in 16-bit floats, rounding to zero. */
393 return (deFloat16) sign;
394 }
395 /* Case: Inf and NaN */
396 else if (exp64 == 0x000007ffu)
397 {
398 if (mantissa == 0u)
399 {
400 /* Inf */
401 return (deFloat16) (sign | 0x7c00u);
402 }
403 else
404 {
405 /* NaN */
406 mantissa >>= 42u; /* 16-bit floats has 10-bit for mantissa, 42-bit less than 64-bit floats. */
407 /* Make sure we don't turn NaN into zero by | (mantissa == 0). */
408 return (deFloat16) (sign | 0x7c00u | mantissa | (mantissa == 0u));
409 }
410 }
411 /* The following are cases for normalized floats.
412 *
413 * * If exp16 is less than 0, we are experiencing underflow for the exponent. To encode this underflowed exponent,
414 * we can only shift the mantissa further right.
415 * The real exponent is exp16 - 15. A denormalized 16-bit float can represent -14 via its exponent.
416 * Note that the most significant bit in the mantissa of a denormalized float is already -1 as for exponent.
417 * So, we just need to right shift the mantissa -exp16 bits.
418 * * If exp16 is 0, mantissa shifting requirement is similar to the above.
419 * * If exp16 is greater than 30 (0b11110), we are experiencing overflow for the exponent of 16-bit normalized floats.
420 */
421 /* Case: normalized floats -> zero */
422 else if (exp16 < -10)
423 {
424 /* 16-bit floats have only 10 bits for mantissa. Minimal 16-bit denormalized float is (2^-10) * (2^-14). */
425 /* Expecting a number < (2^-10) * (2^-14) here, not representable, round to zero. */
426 return (deFloat16) sign;
427 }
428 /* Case: normalized floats -> zero and denormalized halfs */
429 else if (exp16 <= 0)
430 {
431 /* Add the implicit leading 1 in mormalized float to mantissa. */
432 mantissa |= 0x0010000000000000u;
433 /* We have a (23 + 1)-bit mantissa, but 16-bit floats only expect 10-bit mantissa.
434 * Need to discard the last 14-bits considering rounding mode.
435 * We also need to shift right -exp16 bits to encode the underflowed exponent.
436 */
437 if (mode == DE_ROUNDINGMODE_TO_ZERO)
438 {
439 mantissa >>= (43 - exp16);
440 }
441 else
442 {
443 /* mantissa in the above may exceed 10-bits, in which case overflow happens.
444 * The overflowed bit is automatically carried to exponent then.
445 */
446 mantissa = roundToNearestEven64(mantissa, 43 - exp16);
447 }
448 return (deFloat16) (sign | mantissa);
449 }
450 /* Case: normalized floats -> normalized floats */
451 else if (exp16 <= 30)
452 {
453 if (mode == DE_ROUNDINGMODE_TO_ZERO)
454 {
455 return (deFloat16) (sign | ((deUint32)exp16 << 10u) | (mantissa >> 42u));
456 }
457 else
458 {
459 mantissa = roundToNearestEven64(mantissa, 42);
460 /* Handle overflow. exp16 may overflow (and become Inf) itself, but that's correct. */
461 exp16 = (exp16 << 10u) + (deFloat16)(mantissa & (1 << 10));
462 mantissa &= (1u << 10) - 1;
463 return (deFloat16) (sign | ((deUint32) exp16) | mantissa);
464 }
465 }
466 /* Case: normalized floats (too large to be representable as 16-bit floats) */
467 else
468 {
469 /* According to IEEE Std 754-2008 Section 7.4,
470 * * roundTiesToEven and roundTiesToAway carry all overflows to Inf with the sign
471 * of the intermediate result.
472 * * roundTowardZero carries all overflows to the format’s largest finite number
473 * with the sign of the intermediate result.
474 */
475 if (mode == DE_ROUNDINGMODE_TO_ZERO)
476 {
477 return (deFloat16) (sign | 0x7bffu); /* 111 1011 1111 1111 */
478 }
479 else
480 {
481 return (deFloat16) (sign | (0x1f << 10));
482 }
483 }
484
485 /* Make compiler happy */
486 return (deFloat16) 0;
487 }
488
deFloat16To32(deFloat16 val16)489 float deFloat16To32 (deFloat16 val16)
490 {
491 deUint32 sign;
492 deUint32 expotent;
493 deUint32 mantissa;
494 union
495 {
496 float f;
497 deUint32 u;
498 } x;
499
500 x.u = 0u;
501
502 sign = ((deUint32)val16 >> 15u) & 0x00000001u;
503 expotent = ((deUint32)val16 >> 10u) & 0x0000001fu;
504 mantissa = (deUint32)val16 & 0x000003ffu;
505
506 if (expotent == 0u)
507 {
508 if (mantissa == 0u)
509 {
510 /* +/- 0 */
511 x.u = sign << 31u;
512 return x.f;
513 }
514 else
515 {
516 /* Denormalized, normalize it. */
517
518 while (!(mantissa & 0x00000400u))
519 {
520 mantissa <<= 1u;
521 expotent -= 1u;
522 }
523
524 expotent += 1u;
525 mantissa &= ~0x00000400u;
526 }
527 }
528 else if (expotent == 31u)
529 {
530 if (mantissa == 0u)
531 {
532 /* +/- InF */
533 x.u = (sign << 31u) | 0x7f800000u;
534 return x.f;
535 }
536 else
537 {
538 /* +/- NaN */
539 x.u = (sign << 31u) | 0x7f800000u | (mantissa << 13u);
540 return x.f;
541 }
542 }
543
544 expotent = expotent + (127u - 15u);
545 mantissa = mantissa << 13u;
546
547 x.u = (sign << 31u) | (expotent << 23u) | mantissa;
548 return x.f;
549 }
550
deFloat16To64(deFloat16 val16)551 double deFloat16To64 (deFloat16 val16)
552 {
553 deUint64 sign;
554 deUint64 expotent;
555 deUint64 mantissa;
556 union
557 {
558 double f;
559 deUint64 u;
560 } x;
561
562 x.u = 0u;
563
564 sign = ((deUint32)val16 >> 15u) & 0x00000001u;
565 expotent = ((deUint32)val16 >> 10u) & 0x0000001fu;
566 mantissa = (deUint32)val16 & 0x000003ffu;
567
568 if (expotent == 0u)
569 {
570 if (mantissa == 0u)
571 {
572 /* +/- 0 */
573 x.u = sign << 63u;
574 return x.f;
575 }
576 else
577 {
578 /* Denormalized, normalize it. */
579
580 while (!(mantissa & 0x00000400u))
581 {
582 mantissa <<= 1u;
583 expotent -= 1u;
584 }
585
586 expotent += 1u;
587 mantissa &= ~0x00000400u;
588 }
589 }
590 else if (expotent == 31u)
591 {
592 if (mantissa == 0u)
593 {
594 /* +/- InF */
595 x.u = (sign << 63u) | 0x7ff0000000000000u;
596 return x.f;
597 }
598 else
599 {
600 /* +/- NaN */
601 x.u = (sign << 63u) | 0x7ff0000000000000u | (mantissa << 42u);
602 return x.f;
603 }
604 }
605
606 expotent = expotent + (1023u - 15u);
607 mantissa = mantissa << 42u;
608
609 x.u = (sign << 63u) | (expotent << 52u) | mantissa;
610 return x.f;
611 }
612
613 DE_END_EXTERN_C
614