1 #ifndef _TCUFLOAT_HPP
2 #define _TCUFLOAT_HPP
3 /*-------------------------------------------------------------------------
4 * drawElements Quality Program Tester Core
5 * ----------------------------------------
6 *
7 * Copyright 2014 The Android Open Source Project
8 *
9 * Licensed under the Apache License, Version 2.0 (the "License");
10 * you may not use this file except in compliance with the License.
11 * You may obtain a copy of the License at
12 *
13 * http://www.apache.org/licenses/LICENSE-2.0
14 *
15 * Unless required by applicable law or agreed to in writing, software
16 * distributed under the License is distributed on an "AS IS" BASIS,
17 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 * See the License for the specific language governing permissions and
19 * limitations under the License.
20 *
21 *//*!
22 * \file
23 * \brief Reconfigurable floating-point value template.
24 *//*--------------------------------------------------------------------*/
25
26 #include "tcuDefs.hpp"
27
// <limits> for std::numeric_limits; <string.h> for memcpy().
29 #include <limits>
30 #include <string.h>
31
32 namespace tcu
33 {
34
//! Capability bits combined into the Flags argument of the Float template.
enum FloatFlags
{
    FLOAT_HAS_SIGN       = 0x1, //!< Format has a sign bit; without it negative inputs convert to +0.
    FLOAT_SUPPORT_DENORM = 0x2  //!< Format keeps subnormal values; without it underflow converts to zero.
};
40
//! Rounding direction used when a conversion has to drop mantissa bits.
enum RoundingDirection
{
    ROUND_TO_EVEN  = 0, //!< Nearest representable value, ties to the even mantissa.
    ROUND_DOWNWARD = 1, //!< Towards -Inf.
    ROUND_UPWARD   = 2, //!< Towards +Inf.
    ROUND_TO_ZERO  = 3  //!< Discard the dropped bits.
};
48
49 /*--------------------------------------------------------------------*//*!
50 * \brief Floating-point format template
51 *
52 * This template implements arbitrary floating-point handling. Template
53 * can be used for conversion between different formats and checking
54 * various properties of floating-point values.
55 *//*--------------------------------------------------------------------*/
56 template <typename StorageType_, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
57 class Float
58 {
59 public:
60 typedef StorageType_ StorageType;
61
62 enum
63 {
64 EXPONENT_BITS = ExponentBits,
65 MANTISSA_BITS = MantissaBits,
66 EXPONENT_BIAS = ExponentBias,
67 FLAGS = Flags,
68 };
69
70 Float(void);
71 explicit Float(StorageType value);
72 explicit Float(float v, RoundingDirection rd = ROUND_TO_EVEN);
73 explicit Float(double v, RoundingDirection rd = ROUND_TO_EVEN);
74
75 template <typename OtherStorageType, int OtherExponentBits, int OtherMantissaBits, int OtherExponentBias,
76 uint32_t OtherFlags>
77 static Float convert(
78 const Float<OtherStorageType, OtherExponentBits, OtherMantissaBits, OtherExponentBias, OtherFlags> &src,
79 RoundingDirection rd = ROUND_TO_EVEN);
80
convert(const Float<StorageType,ExponentBits,MantissaBits,ExponentBias,Flags> & src,RoundingDirection=ROUND_TO_EVEN)81 static inline Float convert(const Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> &src,
82 RoundingDirection = ROUND_TO_EVEN)
83 {
84 return src;
85 }
86
87 /*--------------------------------------------------------------------*//*!
88 * \brief Construct floating point value
89 * \param sign Sign. Must be +1/-1
90 * \param exponent Exponent in range [1-ExponentBias, ExponentBias+1]
91 * \param mantissa Mantissa bits with implicit leading bit explicitly set
92 * \return The specified float
93 *
94 * This function constructs a floating point value from its inputs.
95 * The normally implicit leading bit of the mantissa must be explicitly set.
96 * The exponent normally used for zero/subnormals is an invalid input. Such
97 * values are specified with the leading mantissa bit of zero and the lowest
98 * normal exponent (1-ExponentBias). Additionally having both exponent and
99 * mantissa set to zero is a shorthand notation for the correctly signed
100 * floating point zero. Inf and NaN must be specified directly with an
101 * exponent of ExponentBias+1 and the appropriate mantissa (with leading
102 * bit set)
103 *//*--------------------------------------------------------------------*/
104 static inline Float construct(int sign, int exponent, StorageType mantissa);
105
106 /*--------------------------------------------------------------------*//*!
107 * \brief Construct floating point value. Explicit version
108 * \param sign Sign. Must be +1/-1
109 * \param exponent Exponent in range [-ExponentBias, ExponentBias+1]
110 * \param mantissa Mantissa bits
111 * \return The specified float
112 *
113 * This function constructs a floating point value from its inputs with
114 * minimal intervention.
115 * The sign is turned into a sign bit and the exponent bias is added.
116 * See IEEE-754 for additional information on the inputs and
117 * the encoding of special values.
118 *//*--------------------------------------------------------------------*/
119 static Float constructBits(int sign, int exponent, StorageType mantissaBits);
120
bits(void) const121 StorageType bits(void) const
122 {
123 return m_value;
124 }
125 float asFloat(void) const;
126 double asDouble(void) const;
127
signBit(void) const128 inline int signBit(void) const
129 {
130 return (int)(m_value >> (ExponentBits + MantissaBits)) & 1;
131 }
exponentBits(void) const132 inline StorageType exponentBits(void) const
133 {
134 return (m_value >> MantissaBits) & ((StorageType(1) << ExponentBits) - 1);
135 }
mantissaBits(void) const136 inline StorageType mantissaBits(void) const
137 {
138 return m_value & ((StorageType(1) << MantissaBits) - 1);
139 }
140
sign(void) const141 inline int sign(void) const
142 {
143 return signBit() ? -1 : 1;
144 }
exponent(void) const145 inline int exponent(void) const
146 {
147 return isDenorm() ? 1 - ExponentBias : (int)exponentBits() - ExponentBias;
148 }
mantissa(void) const149 inline StorageType mantissa(void) const
150 {
151 return isZero() || isDenorm() ? mantissaBits() : (mantissaBits() | (StorageType(1) << MantissaBits));
152 }
153
isInf(void) const154 inline bool isInf(void) const
155 {
156 return exponentBits() == ((1 << ExponentBits) - 1) && mantissaBits() == 0;
157 }
isNaN(void) const158 inline bool isNaN(void) const
159 {
160 return exponentBits() == ((1 << ExponentBits) - 1) && mantissaBits() != 0;
161 }
isZero(void) const162 inline bool isZero(void) const
163 {
164 return exponentBits() == 0 && mantissaBits() == 0;
165 }
isDenorm(void) const166 inline bool isDenorm(void) const
167 {
168 return exponentBits() == 0 && mantissaBits() != 0;
169 }
170
operator <(const Float<StorageType,ExponentBits,MantissaBits,ExponentBias,Flags> & other) const171 inline bool operator<(const Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> &other) const
172 {
173 return this->asDouble() < other.asDouble();
174 }
175
176 static Float zero(int sign);
177 static Float inf(int sign);
178 static Float nan(void);
179
180 static Float largestNormal(int sign);
181 static Float smallestNormal(int sign);
182
183 private:
184 StorageType m_value;
185 } DE_WARN_UNUSED_TYPE;
186
187 // Common floating-point types.
188 typedef Float<uint16_t, 5, 10, 15, FLOAT_HAS_SIGN | FLOAT_SUPPORT_DENORM>
189 Float16; //!< IEEE 754-2008 16-bit floating-point value
190 typedef Float<uint32_t, 8, 23, 127, FLOAT_HAS_SIGN | FLOAT_SUPPORT_DENORM>
191 Float32; //!< IEEE 754 32-bit floating-point value
192 typedef Float<uint64_t, 11, 52, 1023, FLOAT_HAS_SIGN | FLOAT_SUPPORT_DENORM>
193 Float64; //!< IEEE 754 64-bit floating-point value
194
195 typedef Float<uint16_t, 5, 10, 15, FLOAT_HAS_SIGN>
196 Float16Denormless; //!< IEEE 754-2008 16-bit floating-point value without denormalized support
197
198 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
Float(void)199 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::Float(void) : m_value(0)
200 {
201 }
202
203 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
Float(StorageType value)204 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::Float(StorageType value) : m_value(value)
205 {
206 }
207
208 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
Float(float value,RoundingDirection rd)209 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::Float(float value, RoundingDirection rd)
210 : m_value(0)
211 {
212 uint32_t u32;
213 memcpy(&u32, &value, sizeof(uint32_t));
214 *this = convert(Float32(u32), rd);
215 }
216
217 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
Float(double value,RoundingDirection rd)218 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::Float(double value, RoundingDirection rd)
219 : m_value(0)
220 {
221 uint64_t u64;
222 memcpy(&u64, &value, sizeof(uint64_t));
223 *this = convert(Float64(u64), rd);
224 }
225
226 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
asFloat(void) const227 inline float Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::asFloat(void) const
228 {
229 float v;
230 uint32_t u32 = Float32::convert(*this).bits();
231 memcpy(&v, &u32, sizeof(uint32_t));
232 return v;
233 }
234
235 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
asDouble(void) const236 inline double Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::asDouble(void) const
237 {
238 double v;
239 uint64_t u64 = Float64::convert(*this).bits();
240 memcpy(&v, &u64, sizeof(uint64_t));
241 return v;
242 }
243
244 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
245 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
zero(int sign)246 StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::zero(int sign)
247 {
248 DE_ASSERT(sign == 1 || ((Flags & FLOAT_HAS_SIGN) && sign == -1));
249 return Float(StorageType((sign > 0 ? 0ull : 1ull) << (ExponentBits + MantissaBits)));
250 }
251
252 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
253 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
inf(int sign)254 StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::inf(int sign)
255 {
256 DE_ASSERT(sign == 1 || ((Flags & FLOAT_HAS_SIGN) && sign == -1));
257 return Float(StorageType(((sign > 0 ? 0ull : 1ull) << (ExponentBits + MantissaBits)) |
258 (((1ull << ExponentBits) - 1) << MantissaBits)));
259 }
260
261 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
262 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
nan(void)263 StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::nan(void)
264 {
265 return Float(StorageType((1ull << (ExponentBits + MantissaBits)) - 1));
266 }
267
268 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
269 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
largestNormal(int sign)270 StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::largestNormal(int sign)
271 {
272 DE_ASSERT(sign == 1 || ((Flags & FLOAT_HAS_SIGN) && sign == -1));
273 return Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::construct(
274 sign, ExponentBias, (static_cast<StorageType>(1) << (MantissaBits + 1)) - 1);
275 }
276
277 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
278 inline Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
smallestNormal(int sign)279 StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::smallestNormal(int sign)
280 {
281 DE_ASSERT(sign == 1 || ((Flags & FLOAT_HAS_SIGN) && sign == -1));
282 return Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::construct(
283 sign, 1 - ExponentBias, (static_cast<StorageType>(1) << MantissaBits));
284 }
285
286 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
287 Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
construct(int sign,int exponent,StorageType mantissa)288 StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::construct(int sign, int exponent,
289 StorageType mantissa)
290 {
291 // Repurpose this otherwise invalid input as a shorthand notation for zero (no need for caller to care about internal representation)
292 const bool isShorthandZero = exponent == 0 && mantissa == 0;
293
294 // Handles the typical notation for zero (min exponent, mantissa 0). Note that the exponent usually used exponent (-ExponentBias) for zero/subnormals is not used.
295 // Instead zero/subnormals have the (normally implicit) leading mantissa bit set to zero.
296 const bool isDenormOrZero = (exponent == 1 - ExponentBias) && (mantissa >> MantissaBits == 0);
297 const StorageType s = StorageType((StorageType(sign < 0 ? 1 : 0)) << (StorageType(ExponentBits + MantissaBits)));
298 const StorageType exp = (isShorthandZero || isDenormOrZero) ? StorageType(0) : StorageType(exponent + ExponentBias);
299
300 DE_ASSERT(sign == +1 || sign == -1);
301 DE_ASSERT(isShorthandZero || isDenormOrZero || mantissa >> MantissaBits == 1);
302 DE_ASSERT(exp >> ExponentBits == 0);
303
304 return Float(StorageType(s | (exp << MantissaBits) | (mantissa & ((StorageType(1) << MantissaBits) - 1))));
305 }
306
307 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
308 Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<
constructBits(int sign,int exponent,StorageType mantissaBits)309 StorageType, ExponentBits, MantissaBits, ExponentBias, Flags>::constructBits(int sign, int exponent,
310 StorageType mantissaBits)
311 {
312 const StorageType signBit = static_cast<StorageType>(sign < 0 ? 1 : 0);
313 const StorageType exponentBits = static_cast<StorageType>(exponent + ExponentBias);
314
315 DE_ASSERT(sign == +1 || sign == -1);
316 DE_ASSERT(exponentBits >> ExponentBits == 0);
317 DE_ASSERT(mantissaBits >> MantissaBits == 0);
318
319 return Float(
320 StorageType((signBit << (ExponentBits + MantissaBits)) | (exponentBits << MantissaBits) | (mantissaBits)));
321 }
322
323 template <typename StorageType, int ExponentBits, int MantissaBits, int ExponentBias, uint32_t Flags>
324 template <typename OtherStorageType, int OtherExponentBits, int OtherMantissaBits, int OtherExponentBias,
325 uint32_t OtherFlags>
326 Float<StorageType, ExponentBits, MantissaBits, ExponentBias, Flags> Float<StorageType, ExponentBits, MantissaBits,
327 ExponentBias, Flags>::
convert(const Float<OtherStorageType,OtherExponentBits,OtherMantissaBits,OtherExponentBias,OtherFlags> & other,RoundingDirection rd)328 convert(const Float<OtherStorageType, OtherExponentBits, OtherMantissaBits, OtherExponentBias, OtherFlags> &other,
329 RoundingDirection rd)
330 {
331 if (!(Flags & FLOAT_HAS_SIGN) && other.sign() < 0)
332 {
333 // Negative number, truncate to zero.
334 return zero(+1);
335 }
336
337 if (other.isInf())
338 {
339 return inf(other.sign());
340 }
341
342 if (other.isNaN())
343 {
344 return nan();
345 }
346
347 if (other.isZero())
348 {
349 return zero(other.sign());
350 }
351
352 const int eMin = 1 - ExponentBias;
353 const int eMax = ((1 << ExponentBits) - 2) - ExponentBias;
354
355 const StorageType s = StorageType((StorageType(other.signBit()))
356 << (StorageType(ExponentBits + MantissaBits))); // \note Not sign, but sign bit.
357 int e = other.exponent();
358 uint64_t m = other.mantissa();
359
360 // Normalize denormalized values prior to conversion.
361 while (!(m & (1ull << OtherMantissaBits)))
362 {
363 m <<= 1;
364 e -= 1;
365 }
366
367 if (e < eMin)
368 {
369 // Underflow.
370 if ((Flags & FLOAT_SUPPORT_DENORM) && (eMin - e - 1 <= MantissaBits))
371 {
372 // Shift and round.
373 int bitDiff = (OtherMantissaBits - MantissaBits) + (eMin - e);
374 uint64_t lastBitsMask = (1ull << bitDiff) - 1ull;
375 uint64_t lastBits = (static_cast<uint64_t>(m) & lastBitsMask);
376 uint64_t half = (1ull << (bitDiff - 1)) - 1;
377 uint64_t bias = (m >> bitDiff) & 1;
378
379 switch (rd)
380 {
381 case ROUND_TO_EVEN:
382 return Float(StorageType(s | (m + half + bias) >> bitDiff));
383
384 case ROUND_DOWNWARD:
385 m = (m >> bitDiff);
386 if (lastBits != 0ull && other.sign() < 0)
387 {
388 m += 1;
389 }
390 return Float(StorageType(s | m));
391
392 case ROUND_UPWARD:
393 m = (m >> bitDiff);
394 if (lastBits != 0ull && other.sign() > 0)
395 {
396 m += 1;
397 }
398 return Float(StorageType(s | m));
399
400 case ROUND_TO_ZERO:
401 return Float(StorageType(s | (m >> bitDiff)));
402
403 default:
404 DE_ASSERT(false);
405 break;
406 }
407 }
408
409 return zero(other.sign());
410 }
411
412 // Remove leading 1.
413 m = m & ~(1ull << OtherMantissaBits);
414
415 if (MantissaBits < OtherMantissaBits)
416 {
417 // Round mantissa.
418 int bitDiff = OtherMantissaBits - MantissaBits;
419 uint64_t lastBitsMask = (1ull << bitDiff) - 1ull;
420 uint64_t lastBits = (static_cast<uint64_t>(m) & lastBitsMask);
421 uint64_t half = (1ull << (bitDiff - 1)) - 1;
422 uint64_t bias = (m >> bitDiff) & 1;
423
424 switch (rd)
425 {
426 case ROUND_TO_EVEN:
427 m = (m + half + bias) >> bitDiff;
428 break;
429
430 case ROUND_DOWNWARD:
431 m = (m >> bitDiff);
432 if (lastBits != 0ull && other.sign() < 0)
433 {
434 m += 1;
435 }
436 break;
437
438 case ROUND_UPWARD:
439 m = (m >> bitDiff);
440 if (lastBits != 0ull && other.sign() > 0)
441 {
442 m += 1;
443 }
444 break;
445
446 case ROUND_TO_ZERO:
447 m = (m >> bitDiff);
448 break;
449
450 default:
451 DE_ASSERT(false);
452 break;
453 }
454
455 if (m & (1ull << MantissaBits))
456 {
457 // Overflow in mantissa.
458 m = 0;
459 e += 1;
460 }
461 }
462 else
463 {
464 int bitDiff = MantissaBits - OtherMantissaBits;
465 m = m << bitDiff;
466 }
467
468 if (e > eMax)
469 {
470 // Overflow.
471 return (((other.sign() < 0 && rd == ROUND_UPWARD) || (other.sign() > 0 && rd == ROUND_DOWNWARD)) ?
472 largestNormal(other.sign()) :
473 inf(other.sign()));
474 }
475
476 DE_ASSERT(de::inRange(e, eMin, eMax));
477 DE_ASSERT(((e + ExponentBias) & ~((1ull << ExponentBits) - 1)) == 0);
478 DE_ASSERT((m & ~((1ull << MantissaBits) - 1)) == 0);
479
480 return Float(StorageType(s | (StorageType(e + ExponentBias) << MantissaBits) | m));
481 }
482
483 typedef typename Float16::StorageType float16_t;
484 template <class F>
485 inline constexpr F floatQuietNaN = std::numeric_limits<F>::quiet_NaN();
486 template <>
487 inline constexpr float16_t floatQuietNaN<float16_t> = 0x7e01;
488 template <class F>
489 inline constexpr F floatSignalingNaN = std::numeric_limits<F>::signaling_NaN();
490 template <>
491 inline constexpr float16_t floatSignalingNaN<float16_t> = 0x7c01;
492
493 } // namespace tcu
494
495 #endif // _TCUFLOAT_HPP
496