1 // Copyright 2019 The Amber Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/float16_helper.h"
16
17 #include <cassert>
18 #include <cstring>
19
20 // Float10
21 // | 9 8 7 6 5 | 4 3 2 1 0 |
22 // | exponent | mantissa |
23 //
24 // Float11
25 // | 10 9 8 7 6 | 5 4 3 2 1 0 |
26 // | exponent | mantissa |
27 //
28 // Float16
29 // | 15 | 14 13 12 11 10 | 9 8 7 6 5 4 3 2 1 0 |
30 // | s | exponent | mantissa |
31 //
32 // Float32
33 // | 31 | 30 ... 23 | 22 ... 0 |
34 // | s | exponent | mantissa |
35
36 namespace amber {
37 namespace float16 {
38 namespace {
39
// Extracts the sign bit (bit 31) from |hex_float|, the raw bit pattern of a
// 32 bits float. The result is 1 when the sign bit is set and 0 otherwise.
uint16_t FloatSign(const uint32_t hex_float) {
  const uint32_t sign_bit = (hex_float >> 31U) & 1U;
  return static_cast<uint16_t>(sign_bit);
}
44
// Returns the 5 bits exponent field of a 16 bits float for |hex_float|, the
// raw bit pattern of a 32 bits float.
//
// The 8 bits exponent field of the 32 bits float is re-biased by subtracting
// 112 (the difference between the biases 127 and 15). Zero, 32 bits denormals
// and every value whose re-biased exponent would be zero or negative return
// 0; FloatToHexFloat16 treats an exponent of 0 as "flush to zero". An
// exponent too large for the 5 bits field trips the assert below.
uint16_t FloatExponent(const uint32_t hex_float) {
  const uint32_t bias_delta = 112U;
  const uint32_t exponent_bits = ((hex_float >> 23U) & ((1U << 8U) - 1U));
  // Handle zero, denormals and underflow. Without the `<=` the unsigned
  // subtraction below would wrap around for exponent fields 1..111 and either
  // fire the overflow assert or, with asserts disabled, yield a bogus
  // exponent.
  if (exponent_bits <= bias_delta)
    return 0;
  const uint32_t exponent = exponent_bits - bias_delta;
  const uint32_t half_exponent_mask = (1U << 5U) - 1U;
  assert(((exponent & ~half_exponent_mask) == 0U) && "Float exponent overflow");
  return static_cast<uint16_t>(exponent & half_exponent_mask);
}
56
// Extracts the mantissa field (bits 22..0) from |hex_float|, the raw bit
// pattern of a 32 bits float. The field is 23 bits wide, so the result is
// returned as uint32_t rather than uint16_t.
uint32_t FloatMantissa(const uint32_t hex_float) {
  const uint32_t mantissa_mask = (1U << 23U) - 1U;
  const uint32_t mantissa = hex_float & mantissa_mask;
  return mantissa;
}
62
// Decodes the 16 bits float held in |value| into a 32 bits float based on
// IEEE-754. value[0] supplies bits 7..0 and value[1] supplies bits 15..8.
//
// A zero exponent field (zero or denormal) decodes to a zero that keeps the
// sign bit. Every other exponent field, including the all-ones one, is
// re-biased by adding 112.
float HexFloat16ToFloat(const uint8_t* value) {
  const uint32_t low_byte = value[0];
  const uint32_t high_byte = value[1];

  // Bit 15 of the 16 bits float becomes bit 31 of the 32 bits float.
  uint32_t bits = (high_byte & 0x80) << 24U;

  const uint32_t half_exponent = (high_byte & 0x7c) >> 2U;
  // Zero and denormals keep only the sign bit, i.e. they are flushed to zero.
  if (half_exponent != 0U) {
    const uint32_t half_mantissa = ((high_byte & 0x3) << 8U) | low_byte;
    bits |= (half_exponent + 112U) << 23U;
    // The 10 bits mantissa is left aligned inside the 23 bits mantissa.
    bits |= half_mantissa << 13U;
  }

  static_assert((sizeof(uint32_t) == sizeof(float)),
                "sizeof(uint32_t) != sizeof(float)");
  float result;
  memcpy(&result, &bits, sizeof(float));
  return result;
}
85
// Decodes the unsigned 11 bits float held in |value| into a 32 bits float
// based on IEEE-754.
//
// value[0] supplies bits 7..0 (mantissa in bits 5..0, the two low exponent
// bits in bits 7..6) and value[1] supplies bits 10..8 (the three high
// exponent bits). value[1] is not masked here, so any bits above bit 2 must
// already be clear.
//
// As in HexFloat16ToFloat, a zero exponent field (zero or denormal) decodes
// to 0. Every other exponent field is re-biased by adding 112.
float HexFloat11ToFloat(const uint8_t* value) {
  const uint32_t exponent_bits =
      (static_cast<uint32_t>(value[1]) << 2U) |
      ((static_cast<uint32_t>(value[0]) & 0xc0) >> 6U);
  uint32_t exponent = 0U;
  uint32_t mantissa = 0U;
  // Handle zero and flush denormals to zero. Re-biasing a zero exponent field
  // would otherwise turn an all-zero encoding into 2^-15 instead of 0.
  if (exponent_bits != 0U) {
    exponent = (exponent_bits + 112U) << 23U;
    // The 6 bits mantissa is left aligned inside the 23 bits mantissa.
    mantissa = (static_cast<uint32_t>(value[0]) & 0x3f) << 17U;
  }

  uint32_t hex = exponent | mantissa;
  float hex_float;
  static_assert((sizeof(uint32_t) == sizeof(float)),
                "sizeof(uint32_t) != sizeof(float)");
  memcpy(&hex_float, &hex, sizeof(float));
  return hex_float;
}
102
// Decodes the unsigned 10 bits float held in |value| into a 32 bits float
// based on IEEE-754.
//
// value[0] supplies bits 7..0 (mantissa in bits 4..0, the three low exponent
// bits in bits 7..5) and value[1] supplies bits 9..8 (the two high exponent
// bits). value[1] is not masked here, so any bits above bit 1 must already
// be clear.
//
// As in HexFloat16ToFloat, a zero exponent field (zero or denormal) decodes
// to 0. Every other exponent field is re-biased by adding 112.
float HexFloat10ToFloat(const uint8_t* value) {
  const uint32_t exponent_bits =
      (static_cast<uint32_t>(value[1]) << 3U) |
      ((static_cast<uint32_t>(value[0]) & 0xe0) >> 5U);
  uint32_t exponent = 0U;
  uint32_t mantissa = 0U;
  // Handle zero and flush denormals to zero. Re-biasing a zero exponent field
  // would otherwise turn an all-zero encoding into 2^-15 instead of 0.
  if (exponent_bits != 0U) {
    exponent = (exponent_bits + 112U) << 23U;
    // The 5 bits mantissa is left aligned inside the 23 bits mantissa.
    mantissa = (static_cast<uint32_t>(value[0]) & 0x1f) << 18U;
  }

  uint32_t hex = exponent | mantissa;
  float hex_float;
  static_assert((sizeof(uint32_t) == sizeof(float)),
                "sizeof(uint32_t) != sizeof(float)");
  memcpy(&hex_float, &hex, sizeof(float));
  return hex_float;
}
119
120 } // namespace
121
HexFloatToFloat(const uint8_t * value,uint8_t bits)122 float HexFloatToFloat(const uint8_t* value, uint8_t bits) {
123 switch (bits) {
124 case 10:
125 return HexFloat10ToFloat(value);
126 case 11:
127 return HexFloat11ToFloat(value);
128 case 16:
129 return HexFloat16ToFloat(value);
130 }
131
132 assert(false && "Invalid bits");
133 return 0;
134 }
135
FloatToHexFloat16(const float value)136 uint16_t FloatToHexFloat16(const float value) {
137 const uint32_t* hex = reinterpret_cast<const uint32_t*>(&value);
138 uint16_t sign = FloatSign(*hex);
139 uint16_t exponent = FloatExponent(*hex);
140 // Flush denormals.
141 uint32_t mantissa = ((exponent == 0) ? 0U : FloatMantissa(*hex));
142 return static_cast<uint16_t>(static_cast<uint16_t>(sign << 15U) |
143 static_cast<uint16_t>(exponent << 10U) |
144 static_cast<uint16_t>(mantissa >> 13U));
145 }
146
147 } // namespace float16
148 } // namespace amber
149